diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index 5a7793e95..0cc1a3a47 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -48,17 +48,17 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc // if there are no non-ref bases... if ( bestAlternateAllele == null ) { - // if we don't want all bases, then we can just return - - // todo -- we still need to calculate the confidence in the reference base. - // todo -- we can still include this optimization, but we should calculate a confidence score -// if ( !ALL_BASE_MODE && !GENOTYPE_MODE ) -// return new VariantCallContext(false); - + // if we don't want all bases, then we don't need to calculate genotype likelihoods + if ( !ALL_BASE_MODE && !GENOTYPE_MODE ) { + VariantCallContext vcc = new VariantCallContext(false); + estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false); + return vcc; + } // otherwise, choose any alternate allele (it doesn't really matter) bestAlternateAllele = (ref != 'A' ? 'A' : 'C'); } + // calculate likelihoods if there are non-ref bases initializeAlleleFrequencies(frequencyEstimationPoints); initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); @@ -69,8 +69,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc if ( verboseWriter != null ) printAlleleFrequencyData(ref, loc, frequencyEstimationPoints); - return createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints); - } + VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints); + + // technically, at this point our confidence in a reference call isn't accurately + // estimated because it didn't take into account samples with no data + if ( vcc.variation == null ) + estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true); + return vcc; + } protected int getNSamples(Map contexts) { return contexts.size(); @@ -151,6 +157,27 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc return AFs; } + private void estimateReferenceConfidence(VariantCallContext vcc, Map contexts, double theta, boolean ignoreCoveredSamples) { + + double P_of_ref = 1.0; + + // use the AF=0 prob if it's calculated + if ( ignoreCoveredSamples ) + P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)]; + + // for each sample that we haven't examined yet + for ( String sample : samples ) { + boolean isCovered = contexts.containsKey(sample); + if ( ignoreCoveredSamples && isCovered ) + continue; + + int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0; + P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); + } + + vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= CONFIDENCE_THRESHOLD; + } + protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { // initialization diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 6befd453c..99995bf1b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -83,7 +83,4 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - - @Argument(fullName = "max_coverage", shortName = "mc", doc = "Maximum reads at this locus (after filtering bad bases/reads) for it to be callable; to disable, provide value < 1 [default:10,000]", required = false) - public Integer MAX_READS_IN_PILEUP = 100000; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 2d50bc00c..c819e3f6a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -260,15 +260,19 @@ public class UnifiedGenotyper extends LocusWalker 0 && pileup.size() > UAC.MAX_READS_IN_PILEUP) ) + // don't call when there is no coverage + if ( pileup.size() == 0 ) return null; // are there too many deletions in the pileup? @@ -306,8 +310,7 @@ public class UnifiedGenotyper extends LocusWalker filteredPileup = new ArrayList(); for ( PileupElement p : pileup ) { - if ( p.getMappingQual() >= UAC.MIN_MAPPING_QUALTY_SCORE && - (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) && + if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) && AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES ) filteredPileup.add(p); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 16f465b6f..17b8d92cd 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("46232790dae2e99e79626de78836ba08")); + Arrays.asList("5ad3f97c886a3381821366caa9162c12")); executeTest("testMultiSamplePilot1 - Point Estimate EM", spec); } @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("683cfa48cc5e2d140286645873184c20")); + Arrays.asList("73c80566c8353958b2ac61932f0b3812")); executeTest("testMultiSamplePilot2 - Point Estimate EM", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testPooled1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, - Arrays.asList("f2b3799fe18010aa8cfd76b0e0782db7")); + Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4")); executeTest("testPooled1", spec); } @@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("42fc8d585f2f30dc6c58d413738e1f7c")); + Arrays.asList("5e0a92fbddeb9d6e35586d0488a1e5c7")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("c854552bbf4a33b8dca488ea5f0a4a32")); + Arrays.asList("68d00450e3c2129ea38c67171722b385")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("9dcb1d4af7c02804150a0f6c38be4e1e")); + Arrays.asList("8971ab1c9d2780e5e12e9bfc0b059cd1")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testParallelization() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1, - Arrays.asList("35637b1ec52f2a7c0551b87795c3060c")); + Arrays.asList("fb5d09eb8f1494d48032be7272699add")); executeTest("test parallelization", spec); } @@ -98,11 +98,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "e5d4287d27aa7734f0d57a8213f549ef" ); - e.put( "-all_bases", "3a291bb06764e3615230f467cc501096" ); - e.put( "--min_base_quality_score 26", "fb00499f249973c156ca64e30e7b2d91" ); - e.put( "--min_mapping_quality_score 26", "315c7951a783655445933ed9d83db2c2" ); - e.put( "--max_mismatches_in_40bp_window 5", "bca61f8afce9f64763a1225e9c614d38" ); + e.put( "-genotype", "41af43f6eaab72de553d865a3089bf54" ); + e.put( "-all_bases", "7cc1609aef6d6cc3dd7822c52c403750" ); + e.put( "--min_base_quality_score 26", "9596e2102369ced181f2a87d686faf2e" ); + e.put( "--min_mapping_quality_score 26", "130efb2b8bd7b495bf65c6477bcf83c8" ); + e.put( "--max_mismatches_in_40bp_window 5", "18935308954cf390b628c9226eccbe94" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -116,7 +116,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1, - Arrays.asList("c7427818f57cc3bf11e9ee98461c1a65")); + Arrays.asList("8c0e1fed37a2eac9eaaaa59e31350f43")); executeTest("testConfidence", spec); }