diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index f62d181ce..c2e2af0db 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected Logger logger; protected double heterozygosity; protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; + protected boolean ALL_BASE_MODE; protected boolean GENOTYPE_MODE; protected boolean POOLED_INPUT; protected int POOL_SIZE; protected double LOD_THRESHOLD; + protected double CONFIDENCE_THRESHOLD; protected double MINIMUM_ALLELE_FREQUENCY; protected int maxDeletionsInPileup; protected String assumedSingleSample; @@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable { baseModel = UAC.baseModel; heterozygosity = UAC.heterozygosity; defaultPlatform = UAC.defaultPlatform; + ALL_BASE_MODE = UAC.ALL_BASES; GENOTYPE_MODE = UAC.GENOTYPE; POOLED_INPUT = UAC.POOLED; POOL_SIZE = UAC.POOLSIZE; LOD_THRESHOLD = UAC.LOD_THRESHOLD; + CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD; MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY; maxDeletionsInPileup = UAC.MAX_DELETIONS; assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index ec341ece2..e78a0cedd 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -28,7 +28,7 @@ public class 
JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo private double[] PofFs = new double[BaseUtils.BASES.length]; // the minimum and actual number of points in our allele frequency estimation - private static final int MIN_ESTIMATION_POINTS = 100; //1000; + private static final int MIN_ESTIMATION_POINTS = 100; private int frequencyEstimationPoints; // the GenotypeLikelihoods map @@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationMo // run joint estimation for the full GL contexts initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL); calculateAlleleFrequencyPosteriors(ref, context.getLocation()); - + return createCalls(ref, contexts); //double lod = overall.getPofD() - overall.getPofNull(); //logger.debug("lod=" + lod); @@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationMo // generate the calls //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF()); //return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata); - return null; } private void initializeAlleleFrequencies(int numSamples) { @@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationMo private void normalizeFromLog10(double[] array) { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space.
- double maxValue = findMaxEntry(array); + double maxValue = findMaxEntry(array).first; for (int i = 0; i < array.length; i++) array[i] = Math.pow(10, array[i] - maxValue); @@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationMo array[i] /= sum; } - private static double findMaxEntry(double[] array) { + // returns the maximum value in the array and its index + private static Pair<Double, Integer> findMaxEntry(double[] array) { + int index = 0; double max = array[0]; - for (int index = 1; index < array.length; index++) { - if ( array[index] > max ) - max = array[index]; + for (int i = 1; i < array.length; i++) { + if ( array[i] > max ) { + max = array[i]; + index = i; + } } - return max; + return new Pair<Double, Integer>(max, index); } private void printAlleleFrequencyData(char ref, GenomeLoc loc) { @@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationMo char base = Character.toLowerCase(altAllele); int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex])); + verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); } } verboseWriter.println(); } + + private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) { + // first, find the alt allele with maximum confidence + int indexOfMax = 0; + char baseOfMax = ref; + double maxConfidence = Double.MIN_VALUE; + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele != ref ) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( PofFs[baseIndex] > maxConfidence ) { + indexOfMax = baseIndex; + baseOfMax = altAllele; + maxConfidence = PofFs[baseIndex]; + } + } + } + double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); + double
bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1); + + ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>(); + // TODO -- generate strand score + double strandScore = 0.0; + GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess); + + // generate calls only if we pass the threshold or we want ref calls + if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) { + + for ( String sample : GLs.keySet() ) { + // get the pileup + AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL); + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + pileup.setIncludeDeletionsInPileupString(true); + // TODO -- fix GenotypeCall code so that each call doesn't need its own pileup + + // create the call + GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup); + calls.add(call); + + // TODO -- fix GenotypeCall code so that UG tells it which genotypes to use + // TODO -- all of the intelligence for making calls should be in UG + } + } + + return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1609e0de6..4b00b83d9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -51,22 +51,28 @@ public class UnifiedArgumentCollection { @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false) public boolean GENOTYPE = false; - @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output",
required = false) + @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false) + public boolean ALL_BASES = false; + + @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false) public String VERBOSE = null; // control the error modes - @Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) + @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) public String ASSUME_SINGLE_SAMPLE = null; - - // control the various parameters to be used - @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false) - public double LOD_THRESHOLD = Double.MIN_VALUE; - @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. 
Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false) public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null; + + // control the various parameters to be used + @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false) + public double LOD_THRESHOLD = Double.MIN_VALUE; + + @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false) + public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE; + @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false) public Integer MAX_DELETIONS = 1;