From 0fcd358ace4c232236e577d53627ea06959b4766 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 19:33:47 -0400 Subject: [PATCH] Original EXACT model implementation lives, providing another reference (bi-allelic only) EXACT model -- Potentially a very fast implementation (it's very clean) but restricted to the biallelic case -- A starting point for future bi-allelic only optimized (logless) or generalized (bi-allelic general ploidy) implementations -- Added systematic unit tests covering this implementation, and comparing it to others -- Uncovered a nasty normalization bug in StateTracker that was capping our likelihoods at 0, even after summing up multiple likelihoods, which is just not safe to do and was causing us to lose likelihood in some cases -- Removed the restriction that a likelihood be <= 0 in StateTracker, and the protection for these cases in GeneralPloidyExactAFCalc which just wasn't right --- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +- .../genotyper/afcalc/AFCalcUnitTest.java | 149 ++++++++++++++---- .../afcalc/OriginalDiploidExactAFCalc.java | 38 +++-- .../genotyper/afcalc/StateTracker.java | 14 +- 4 files changed, 149 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 51b7fb633..2b247430c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ab967fbe1..ef4318a40 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest { new GetGLsTest(model, 1, genotypes, priors, priorName); // tri-allelic - if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) ) for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) new GetGLsTest(model, 2, genotypes, priors, priorName); } @@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest { return GetGLsTest.getTests(GetGLsTest.class); } - @DataProvider(name = "badGLs") - public Object[][] createBadGLs() { - final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); - final int nSamples = genotypes.size(); +// @DataProvider(name = "badGLs") +// public Object[][] createBadGLs() { +// final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); +// final int nSamples = genotypes.size(); +// +// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); +// +// final int nPriorValues = 2*nSamples+1; +// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors +// for ( AFCalc model : Arrays.asList(indCalc) ) { +// final String priorName = "flat"; +// new GetGLsTest(model, 2, genotypes, priors, priorName); +// } +// +// return GetGLsTest.getTests(GetGLsTest.class); +// } - final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); - - final int nPriorValues = 2*nSamples+1; - final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - for ( AFCalc model : Arrays.asList(indCalc) ) { - final String priorName = "flat"; - new GetGLsTest(model, 2, genotypes, priors, priorName); - } - - return GetGLsTest.getTests(GetGLsTest.class); - } +// +// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") +// public void testBadGLs(GetGLsTest cfg) { +// testResultSimple(cfg); +// } @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") public void testBiallelicGLs(GetGLsTest cfg) { @@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") - public void testBadGLs(GetGLsTest cfg) { - testResultSimple(cfg); - } - private static class NonInformativeData { final Genotype nonInformative; final List called; @@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors for ( AFCalc model : calcs ) { + if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) + continue; + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); for ( int rotation = 0; rotation < nSamples; rotation++ ) { @@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest { "Actual pNonRef not within tolerance " + tolerance + " of expected"); } + @DataProvider(name = "PNonRefBiallelicSystematic") + public Object[][] makePNonRefBiallelicSystematic() { + List tests = new ArrayList(); + + final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000); + final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); + + for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) { + + if ( false ) { // for testing only + tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))}); + } else { + if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK + + // test all combinations of PLs for 1 sample + for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + + + final List> smallDiploidPLs = new LinkedList>(); + for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) { + for ( int i = 0; i < 2; i++ ) { + List pls = new ArrayList(Collections.nCopies(3, nonRefPL)); + pls.set(i, 0); + smallDiploidPLs.add(pls); + } + } + + for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + final List> removeBadPLs(List> listOfPLs) { + List> clean = new LinkedList>(); + + for ( final List PLs : listOfPLs ) { + int x = PLs.get(0); + boolean bad = false; + for ( int pl1 : PLs ) + if ( pl1 > x ) + bad = true; + else + x = pl1; + if ( ! bad ) clean.add(PLs); + } + + return clean; + } + + private List toGenotypes(final List> PLsPerSample) { + final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List genotypes = new ArrayList(PLsPerSample.size()); + + for ( final List PLs : PLsPerSample ) { + final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3])); + final int min = MathUtils.arrayMin(pls); + for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; + genotypes.add(makePL(nocall, pls)); + } + + return genotypes; + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic") + private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { + //logger.warn("Running " + modelType + " with " + genotypes); + final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human); + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human); + + final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C)); + vcb.genotypes(genotypes); + + final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + final double tolerance = 1e-3; + Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(), + "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE())); + } + // -------------------------------------------------------------------------------- // // Test priors diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index dea38e46c..ac4634666 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -20,15 +21,22 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; - final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); - final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double pNonRef = lastK > 0 ? 0.0 : -1000.0; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); - return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); } /** @@ -68,11 +76,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { } } - public int linearExact(final VariantContext vc, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyLikelihoods, - double[] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -81,7 +89,7 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { double maxLog10L = Double.NEGATIVE_INFINITY; boolean done = false; - int lastK = -1; + int lastK = -1, mleK = -1; for (int k=0; k <= numChr && ! done; k++ ) { final double[] kMinus0 = logY.getkMinus0(); @@ -127,7 +135,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { // can we abort early? lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; @@ -136,6 +148,6 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { logY.rotate(); } - return lastK; + return new Pair(lastK, mleK); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 301891a99..b82ec1d29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -131,14 +131,14 @@ final class StateTracker { /** * @return the likelihoods summed across all AC values for AC > 0 */ - private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + private double getLog10LikelihoodOfAFNotZero() { if ( log10LikelihoodsForAFGt0Sum == null ) { if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; else log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); } - return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + return log10LikelihoodsForAFGt0Sum; } /** @@ -162,7 +162,7 @@ final class StateTracker { @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); @@ -210,7 +210,7 @@ final class StateTracker { * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToLikelihoodsCache(log10LofK); @@ -227,7 +227,7 @@ final class StateTracker { * @param log10PofK the posterior of our current configuration state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { if ( log10PofK > log10MAP ) { @@ -236,7 +236,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) private void addToLikelihoodsCache(final double log10LofK) { // add to the cache log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; @@ -250,7 +249,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { @@ -259,7 +257,7 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"}) protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero;