diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index fae3d37ef..3241fa85f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -37,8 +37,16 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc // the alternate allele with the largest sum of quality scores protected Character bestAlternateAllele = null; + // are we at a 'trigger' track site? + protected boolean atTriggerTrack = false; - protected JointEstimateGenotypeCalculationModel() {} + // the standard filter to use for calls below the confidence threshold but above the emit threshold + protected static final Set filter = new HashSet(1); + + + protected JointEstimateGenotypeCalculationModel() { + filter.add("LowQual"); + } public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, char[] ref, GenomeLoc loc, Map stratifiedContexts) { return null; @@ -55,12 +63,12 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc initializeBestAlternateAllele(ref, contexts); // did we trigger on the provided track? - boolean triggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0; + atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0; // if there are no non-ref bases... if ( bestAlternateAllele == null ) { // if we don't want all bases, then we don't need to calculate genotype likelihoods - if ( !triggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) { + if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) { VariantCallContext vcc = new VariantCallContext(false); estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false); return vcc; @@ -80,7 +88,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc if ( verboseWriter != null ) printAlleleFrequencyData(ref, loc, frequencyEstimationPoints); - VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints, triggerTrack); + VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints); // technically, at this point our confidence in a reference call isn't accurately // estimated because it didn't take into account samples with no data @@ -190,7 +198,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); } - vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.CONFIDENCE_THRESHOLD; + vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; } protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { @@ -326,7 +334,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc return new HashMap(); } - protected VariantCallContext createCalls(RefMetaDataTracker tracker, char ref, Map contexts, GenomeLoc loc, int frequencyEstimationPoints, boolean triggerTrack) { + protected VariantCallContext createCalls(RefMetaDataTracker tracker, char ref, Map contexts, GenomeLoc loc, int frequencyEstimationPoints) { // only need to look at the most likely alternate allele int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele); @@ -350,8 +358,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc } // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( !triggerTrack && !UAC.ALL_BASES_MODE && ((!UAC.GENOTYPE_MODE && bestAFguess == 0) || phredScaledConfidence < UAC.CONFIDENCE_THRESHOLD) ) - return new VariantCallContext(phredScaledConfidence >= UAC.CONFIDENCE_THRESHOLD); + if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) + return new VariantCallContext(passesCallThreshold(phredScaledConfidence)); // output to beagle file if requested if ( beagleWriter != null ) { @@ -420,8 +428,20 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc attributes.put("SB", new Double(strandScore)); } - VariantContext vc = new VariantContext("UG_SNP_call", loc, alleles, genotypes, phredScaledConfidence/10.0, null, attributes); + VariantContext vc = new VariantContext("UG_SNP_call", loc, alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes); - return new VariantCallContext(vc, phredScaledConfidence >= UAC.CONFIDENCE_THRESHOLD); + return new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence)); + } + + protected boolean passesEmitThreshold(double conf, int bestAFguess) { + return (atTriggerTrack ? + (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) : + ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING))); + } + + protected boolean passesCallThreshold(double conf) { + return (atTriggerTrack ? + (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) : + (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING)); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1b06f3ff3..36d0362de 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -40,9 +40,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY; - @Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool -- no longer in use", required = false) - public int POOLSIZE = 0; - // control the output @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public boolean GENOTYPE_MODE = false; @@ -50,6 +47,18 @@ public class UnifiedArgumentCollection { @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false) public boolean ALL_BASES_MODE = false; + @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + + @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) + public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; + + @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) + public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; + @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; @@ -63,9 +72,6 @@ public class UnifiedArgumentCollection { // control the various parameters to be used - @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered", required = false) - public double CONFIDENCE_THRESHOLD = 50.0; - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 10; @@ -81,6 +87,9 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; + @Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false) + public boolean CAP_BASE_QUALITY = false; + public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); @@ -93,7 +102,10 @@ public class UnifiedArgumentCollection { uac.NO_SLOD = NO_SLOD; uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; uac.defaultPlatform = defaultPlatform; - uac.CONFIDENCE_THRESHOLD = CONFIDENCE_THRESHOLD; + uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; + uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; + uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING; + uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE; uac.MAX_MISMATCHES = MAX_MISMATCHES; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a506ae774..68417917a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -105,9 +105,6 @@ public class UnifiedGenotyperEngine { if ( UAC.genotypeModel == GenotypeCalculationModel.Model.INDELS && !(genotypeWriter instanceof VCFGenotypeWriter) ) { throw new IllegalArgumentException("Attempting to use an output format other than VCF with indels. Please set the output format to VCF."); } - if ( UAC.POOLSIZE > 0 ) { - throw new IllegalArgumentException("Attempting to provide depreciated POOLSIZE parameter for retired POOLED model"); - } if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) { // the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads"); @@ -183,7 +180,7 @@ public class UnifiedGenotyperEngine { pileup = filterPileup(pileup, refContext); // don't call when there is no coverage - if ( pileup.size() == 0 ) + if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) return null; // stratify the AlignmentContext and cut by sample @@ -204,7 +201,7 @@ public class UnifiedGenotyperEngine { pileup = filterPileup(pileup, refContext); // don't call when there is no coverage - if ( pileup.size() == 0 ) + if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) return null; // are there too many deletions in the pileup? diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java index 03e7b5beb..5c67042ed 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java @@ -60,7 +60,7 @@ public class SnpCallRateByCoverageWalker extends LocusWalker, Strin public void initialize() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.CONFIDENCE_THRESHOLD = confidence; + uac.STANDARD_CONFIDENCE_FOR_CALLING = uac.STANDARD_CONFIDENCE_FOR_EMITTING = confidence; uac.ALL_BASES_MODE = true; UG = new UnifiedGenotyperEngine(getToolkit(), uac); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java index 489d5022b..939db258f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/contamination/FindContaminatingReadGroupsWalker.java @@ -68,7 +68,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker e = new HashMap(); - e.put( "-genotype", "02316b8892d439e23d6fbbc65232f921" ); - e.put( "-all_bases", "bcce54904e4c3352168bbfb39f2b9a2f" ); - e.put( "--min_base_quality_score 26", "a8a286e61af8a6b9ba72c2eab19573bb" ); - e.put( "--min_mapping_quality_score 26", "c599b0c3a1e42772e9d1f9ff7112d1e4" ); - e.put( "--max_mismatches_in_40bp_window 5", "46129c47e7d283d8950e029ed39dc1e8" ); + e.put( "-genotype", "310d3f8ac2a1100d82fe6241e201e1d8" ); + e.put( "-all_bases", "470196b3f9a2ed622383e25552972a3e" ); + e.put( "--min_base_quality_score 26", "a1142c69f1951a05d5a33215c69133af" ); + e.put( "--min_mapping_quality_score 26", "796d1ea790481a1a257609279f8076bb" ); + e.put( "--max_mismatches_in_40bp_window 5", "379bd56d4402e974127e0677ff51d040" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 " + entry.getKey(), 1, + "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 " + entry.getKey(), 1, Arrays.asList(entry.getValue())); executeTest(String.format("testParameter[%s]", entry.getKey()), spec); } @@ -93,10 +93,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testConfidence() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1, - Arrays.asList("64c8fffc735f72663f27b2257fff5583")); - executeTest("testConfidence", spec); + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, + Arrays.asList("7b41dc669e8a0abde089d7aeedab2941")); + executeTest("testConfidence1", spec1); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, + Arrays.asList("258617b8de0e6468bf6a0c740b51676b")); + executeTest("testConfidence2", spec2); } // -------------------------------------------------------------------------------------------------------------- @@ -106,19 +111,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testOtherOutput() { - String[] md5s = {"e9d7c27ed63d2e92a17ba44501ab1138", "8cba0b8752f18fc620b4697840bc7291"}; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper" + " -R " + oneKGLocation + "reference/human_b36_both.fasta" + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam" + - " -varout %s" + + " -varout /dev/null" + " -beagle %s" + - " -L 1:10,023,400-10,024,000" + - " -bm empirical" + - " -gm JOINT_ESTIMATE" + - " -vf VCF", - 2, - Arrays.asList(md5s)); + " -L 1:10,023,400-10,024,000", + 1, + Arrays.asList("5077d80806e24865b5c12553843a5f85")); executeTest(String.format("testOtherOutput"), spec); } @@ -137,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -vf " + entry.getKey(), 1, + "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -vf " + entry.getKey(), 1, Arrays.asList(entry.getValue())); executeTest(String.format("testOtherFormat[%s]", entry.getKey()), spec); } @@ -160,7 +161,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -vf GELI -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 --heterozygosity " + entry.getKey(), 1, + "-T UnifiedGenotyper -vf GELI -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 --heterozygosity " + entry.getKey(), 1, Arrays.asList(entry.getValue())); executeTest(String.format("testHeterozyosity[%s]", entry.getKey()), spec); } @@ -180,7 +181,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -vf GELI -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -gm JOINT_ESTIMATE -confidence 30 -bm " + entry.getKey(), 1, + "-T UnifiedGenotyper -vf GELI -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm " + entry.getKey(), 1, Arrays.asList(entry.getValue())); executeTest(String.format("testOtherBaseCallModel[%s]", entry.getKey()), spec); } @@ -199,11 +200,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + " -varout %s" + " -L 1:10,000,000-10,100,000" + - " -bm empirical" + - " -gm JOINT_ESTIMATE" + - " -vf GELI", + " -vf GELI", 1, - Arrays.asList("3c6d76d55d608482940cd725b87ef07d")); + Arrays.asList("f67c690bf2e4eee2bb7c58b6646a2a98")); executeTest(String.format("testMultiTechnologies"), spec); }