Joint Estimation model now emits calls in all formats.

The whole GenotypeCall framework needs to be changed, but this will work for the time being.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1902 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-23 03:07:28 +00:00
parent a6dc8cd44e
commit 6c338eccb8
3 changed files with 75 additions and 16 deletions

View File

@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected Logger logger;
protected double heterozygosity;
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
protected boolean ALL_BASE_MODE;
protected boolean GENOTYPE_MODE;
protected boolean POOLED_INPUT;
protected int POOL_SIZE;
protected double LOD_THRESHOLD;
protected double CONFIDENCE_THRESHOLD;
protected double MINIMUM_ALLELE_FREQUENCY;
protected int maxDeletionsInPileup;
protected String assumedSingleSample;
@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
baseModel = UAC.baseModel;
heterozygosity = UAC.heterozygosity;
defaultPlatform = UAC.defaultPlatform;
ALL_BASE_MODE = UAC.ALL_BASES;
GENOTYPE_MODE = UAC.GENOTYPE;
POOLED_INPUT = UAC.POOLED;
POOL_SIZE = UAC.POOLSIZE;
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
maxDeletionsInPileup = UAC.MAX_DELETIONS;
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;

View File

@ -28,7 +28,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
private double[] PofFs = new double[BaseUtils.BASES.length];
// the minimum and actual number of points in our allele frequency estimation
private static final int MIN_ESTIMATION_POINTS = 100; //1000;
private static final int MIN_ESTIMATION_POINTS = 100;
private int frequencyEstimationPoints;
// the GenotypeLikelihoods map
@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// run joint estimation for the full GL contexts
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
return createCalls(ref, contexts);
//double lod = overall.getPofD() - overall.getPofNull();
//logger.debug("lod=" + lod);
@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// generate the calls
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
return null;
}
private void initializeAlleleFrequencies(int numSamples) {
@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
private void normalizeFromLog10(double[] array) {
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = findMaxEntry(array);
double maxValue = findMaxEntry(array).first;
for (int i = 0; i < array.length; i++)
array[i] = Math.pow(10, array[i] - maxValue);
@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
array[i] /= sum;
}
private static double findMaxEntry(double[] array) {
// returns the maximum value in the array and its index
private static Pair<Double, Integer> findMaxEntry(double[] array) {
int index = 0;
double max = array[0];
for (int index = 1; index < array.length; index++) {
if ( array[index] > max )
max = array[index];
for (int i = 1; i < array.length; i++) {
if ( array[i] > max ) {
max = array[i];
index = i;
}
}
return max;
return new Pair<Double, Integer>(max, index);
}
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
char base = Character.toLowerCase(altAllele);
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
}
}
verboseWriter.println();
}
private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) {
// first, find the alt allele with maximum confidence
int indexOfMax = 0;
char baseOfMax = ref;
double maxConfidence = Double.MIN_VALUE;
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele != ref ) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( PofFs[baseIndex] > maxConfidence ) {
indexOfMax = baseIndex;
baseOfMax = altAllele;
maxConfidence = PofFs[baseIndex];
}
}
}
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
double bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1);
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
// TODO -- generate strand score
double strandScore = 0.0;
GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess);
// generate calls only if we pass the threshold or we want ref calls
if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) {
for ( String sample : GLs.keySet() ) {
// get the pileup
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
pileup.setIncludeDeletionsInPileupString(true);
// TODO -- fix GenotypeCall code so that each call doesn't need its own pileup
// create the call
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
calls.add(call);
// TODO -- fix GenotypeCall code so that UG tells it which genotypes to use
// TODO -- all of the intelligence for making calls should be in UG
}
}
return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata);
}
}

View File

@ -51,22 +51,28 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
public boolean GENOTYPE = false;
@Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
public boolean ALL_BASES = false;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
public String VERBOSE = null;
// control the error modes
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
public String ASSUME_SINGLE_SAMPLE = null;
// control the various parameters to be used
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
public double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
// control the various parameters to be used
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
public double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
public Integer MAX_DELETIONS = 1;