From a1d6461121c75ac4c58af86c32df70fbf9447eb0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 28 Nov 2012 11:35:41 -0500 Subject: [PATCH] Critical bugfix to AFCalcResult affecting UG/HC quality score emission thresholds As reported by Menachem Fromer: a critical bug in AFCalcResult: Specifically, the implementation: public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } seems incorrect and should probably be: getLog10PosteriorOfAFEq0ForAllele(allele) <= log10minPNonRef The issue here is that the 30 represents a Phred-scaled probability of *error* and it's currently being compared to a log probability of *non-error*. Instead, we need to require that our probability of error be less than the error threshold. This bug has only a minor impact on the calls -- hardly any sites change -- which is good. But the inverted logic effects multi-allelic sites significantly. Basically you only hit this logic with multiple alleles, and in that case it'\s including extra alt alleles incorrectly, and throwing out good ones. Change was to create a new function that properly handles thresholds that are PhredScaled quality scores: /** * Same as #isPolymorphic but takes a phred-scaled quality score as input */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); return isPolymorphic(allele, log10Threshold); } --- .../UnifiedGenotyperIntegrationTest.java | 8 +-- .../afcalc/AFCalcResultUnitTest.java | 56 +++++++++++++++++-- .../genotyper/UnifiedGenotyperEngine.java | 8 +-- .../genotyper/afcalc/AFCalcResult.java | 11 ++++ 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9212d0e53..88f2ab3ea 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94dc17d76d841f1d3a36160767ffa034")); + Arrays.asList("a373979d01c3a3fb20159235d27eb92c")); executeTest("test Multiple SNP alleles", spec); } @@ -197,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "5c75cecb523cac988beecd59186289ff"); } private void testOutputParameters(final String args, final String md5) { @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e")); + Arrays.asList("a5a81bf1b10be860a6a5272fb928e8eb")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352")); + Arrays.asList("ad52814cd6c45df424fc992699feead6")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index cbe2eb268..96e055e92 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -2,16 +2,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; public class AFCalcResultUnitTest extends BaseTest { private static class MyTest { @@ -79,4 +77,54 @@ public class AFCalcResultUnitTest extends BaseTest { final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()}; Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision"); } + + @DataProvider(name = "TestIsPolymorphic") + public Object[][] makeTestIsPolymorphic() { + List tests = new ArrayList(); + + final List pValues = new LinkedList(); + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) + for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + pValues.add(p + espilon); + + for ( final double pNonRef : pValues ) { + for ( final double pThreshold : pValues ) { + final boolean shouldBePoly = pNonRef >= pThreshold; + if ( pNonRef != pThreshold) + // let's not deal with numerical instability + tests.add(new Object[]{ pNonRef, pThreshold, shouldBePoly }); + } + } + + return tests.toArray(new Object[][]{}); + } + + private AFCalcResult makePolymorphicTestData(final double pNonRef) { + return new AFCalcResult( + new int[]{0}, + 1, + alleles, + MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), + log10Even, + Collections.singletonMap(C, Math.log10(pNonRef))); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold)); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphicQual(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final double qual = QualityUtils.phredScaleCorrectRate(pThreshold); + final boolean actualIsPoly = result.isPolymorphicPhredScaledQual(C, qual); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 97254c478..f22187363 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -378,11 +378,9 @@ public class UnifiedGenotyperEngine { if ( alternateAllele.isReference() ) continue; - // we are non-ref if the probability of being non-ref > the emit confidence. - // the emit confidence is phred-scaled, say 30 => 10^-3. - // the posterior AF > 0 is log10: -5 => 10^-5 - // we are non-ref if 10^-5 < 10^-3 => -5 < -3 - final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); // if the most likely AC is not 0, then this is a good alternate allele to use if ( isNonRef ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index a65772444..dbb0e8cdd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -234,10 +235,20 @@ public class AFCalcResult { * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 */ + @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Same as #isPolymorphic but takes a phred-scaled quality score as input + */ + public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { + if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); + final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + return isPolymorphic(allele, log10Threshold); + } + /** * Are any of the alleles polymorphic w.r.t. #isPolymorphic? *