From 99c9031cb4c20cf996202329ca17978ea1fde59e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 20:41:33 -0400 Subject: [PATCH] Merge AFCalcResultTracker into StateTracker, cleanup -- These two classes were really the same, and now they are actually the same! -- Cleanuped the interfaces, removed duplicate data -- Added lots of contracts, some of which found numerical issues with GeneralPloidyExactAFCalc (which have been patched over but not fixed) -- Moved goodProbability and goodProbabilityVector utilities to MathUtils. Very useful for contracts! --- .../afcalc/GeneralPloidyExactAFCalc.java | 320 ++++++------------ ...neralPloidyAFCalculationModelUnitTest.java | 9 +- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 30 +- .../genotyper/afcalc/AFCalcResult.java | 55 +-- .../genotyper/afcalc/AFCalcResultTracker.java | 256 -------------- .../genotyper/afcalc/DiploidExactAFCalc.java | 55 ++- .../afcalc/OriginalDiploidExactAFCalc.java | 4 - .../genotyper/afcalc/StateTracker.java | 304 ++++++++++++++--- .../broadinstitute/sting/utils/MathUtils.java | 33 ++ 9 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 0e97c090c..3916c2549 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -69,8 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); - return resultFromTracker(vc, log10AlleleFrequencyPriors); + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } /** @@ -171,13 +170,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param resultTracker object to fill with output values */ - protected static void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { final ArrayList genotypeLikelihoods = getGLs(GLs, true); @@ -196,24 +193,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { if ( genotypeLikelihoods.size() <= 1 ) { // no meaningful GLs at all, just set the tracker to non poly values - resultTracker.reset(); // just mimic-ing call below - resultTracker.setLog10LikelihoodOfAFzero(0.0); + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); } else { for (int p=1; p ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(); @@ -230,16 +227,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - StateTracker stateTracker = new StateTracker(); while ( !ACqueue.isEmpty() ) { - resultTracker.incNEvaluations(); + getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset); - - // adjust max likelihood seen if needed - if ( log10LofKs > stateTracker.getMaxLog10L()) - stateTracker.update(log10LofKs, ACset.getACcounts()); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -260,39 +252,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param resultTracker AFResult object - * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood */ - private static double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final AFCalcResultTracker resultTracker, - final StateTracker stateTracker, - final LinkedList ACqueue, - final HashMap indexesToACset) { + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); // add to new pool if (!Double.isInfinite(log10LofK)) newPool.add(set); - // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { - if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); + // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { return log10LofK; } @@ -323,67 +308,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } - /** - * Naive combiner of two multiallelic pools - number of alt alleles must be the same. - * Math is generalization of biallelic combiner. - * - * For vector K representing an allele count conformation, - * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) - * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) - * @param originalPool First log-likelihood pool GL vector - * @param yy Second pool GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param numAlleles Number of alleles - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - */ - public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { -/* - final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); - final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); - - if (dim1 != originalPool.getLength() || dim2 != yy.length) - throw new ReviewedStingException("BUG: Inconsistent vector length"); - - if (ploidy2 == 0) - return; - - final int newPloidy = ploidy1 + ploidy2; - - // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) - // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); - final double[] x = originalPool.getLikelihoodsAsVector(true); - while(firstIterator.hasNext()) { - x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); - firstIterator.next(); - } - - GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - final double[] y = yy.clone(); - while(secondIterator.hasNext()) { - y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); - secondIterator.next(); - } - - // initialize output to -log10(choose(m1+m2,[k1 k2...]) - final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); - - - // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K - while(outputIterator.hasNext()) { - final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); - double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); - - originalPool.add(likelihood, set, outputIterator.getLinearIndex()); - outputIterator.next(); - } -*/ - } +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. +// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } /** * Compute likelihood of a particular AC conformation and update AFresult object @@ -394,15 +379,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alleles (including ref) * @param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param resultTracker AFResult object * @return log-likehood of requested conformation */ - private static double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int numAlleles, final int ploidy1, final int ploidy2, - final AFCalcResultTracker resultTracker) { + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { final int newPloidy = ploidy1 + ploidy2; @@ -420,8 +403,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; set.getLog10Likelihoods()[0] = log10Lof0; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -464,14 +447,14 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - resultTracker.updateMLEifNeeded(log10LofK, altCounts); + getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, altCounts); + getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); return log10LofK; } @@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { return (sum == ploidy); } - /** - * Combines naively two biallelic pools (of arbitrary size). - * For two pools of size m1 and m2, we can compute the combined likelihood as: - * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) - * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i - * @param newPLVector Second GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - * @return Combined likelihood vector - */ - public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, - final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { - - final int newPloidy = ploidy1 + ploidy2; - - final double[] combinedLikelihoods = new double[1+newPloidy]; - - /** Pre-fill result array and incorporate weights into input vectors - * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) - * and L2(k) = Pr(D|AC2=k) * choose(m2,k) - * equation reduces to - * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) - * which is just plain convolution of L1 and L2 (with pre-existing vector) - */ - - // intialize result vector to -infinity - Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); - - final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); - for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) - x[k] = Double.NEGATIVE_INFINITY; - - final double[] y = newPLVector.clone(); - - - final double log10Lof0 = x[0]+y[0]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - - double maxElement = log10Lof0; - int maxElementIdx = 0; - int[] alleleCounts = new int[1]; - for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { - double[] acc = new double[k+1]; - Arrays.fill(acc,Double.NEGATIVE_INFINITY); - double innerMax = Double.NEGATIVE_INFINITY; - - for (int j=0; j <=k; j++) { - double x1,y1; - - - if (k-j>=0 && k-j < y.length) - y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); - else - continue; - - if (j < x.length) - x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); - else - continue; - - if (Double.isInfinite(x1) || Double.isInfinite(y1)) - continue; - acc[j] = x1 + y1; - if (acc[j] > innerMax) - innerMax = acc[j]; - else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) - break; - } - combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); - maxElementIdx = k; - double maxDiff = combinedLikelihoods[k] - maxElement; - if (maxDiff > 0) - maxElement = combinedLikelihoods[k]; - else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { - break; - } - - alleleCounts[0] = k; - resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); - - - } - - - return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); - } - - /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * including updating the PL's, and assign genotypes accordingly @@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * * @return genotype */ - private static void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { final int numNewAltAlleles = allelesToUse.size() - 1; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 48f282901..1b3a4c0c0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - - final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; - -// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele]; Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 07f88c9e3..927fadd94 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -50,7 +50,7 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private final AFCalcResultTracker resultTracker; + private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { @@ -62,7 +62,7 @@ public abstract class AFCalc implements Cloneable { this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } public void enableProcessLog(final File exactCallsLog) { @@ -83,10 +83,10 @@ public abstract class AFCalc implements Cloneable { public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); - if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); // reset the result, so we can store our new result there - resultTracker.reset(); + stateTracker.reset(); final VariantContext vcWorking = reduceScope(vc); @@ -100,10 +100,20 @@ public abstract class AFCalc implements Cloneable { return result; } - @Deprecated - protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { - resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); - return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() > 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); } // --------------------------------------------------------------------------- @@ -162,8 +172,8 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public AFCalcResultTracker getResultTracker() { - return resultTracker; + public StateTracker getStateTracker() { + return stateTracker; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 7cacb2060..a65772444 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -83,8 +83,8 @@ public class AFCalcResult { if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); - if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); this.alleleCountsOfMLE = alleleCountsOfMLE; this.nEvaluations = nEvaluations; @@ -147,7 +147,7 @@ public class AFCalcResult { * Due to computational / implementation constraints this may be smaller than * the actual list of alleles requested * - * @return a non-empty list of alleles used during genotyping + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele */ @Ensures({"result != null", "! result.isEmpty()"}) public List getAllelesUsedInGenotyping() { @@ -159,7 +159,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFEq0() { return log10PosteriorsOfAC[AF0]; } @@ -169,7 +169,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFGT0() { return log10PosteriorsOfAC[AF1p]; } @@ -179,7 +179,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFEq0() { return log10LikelihoodsOfAC[AF0]; } @@ -189,7 +189,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFGT0() { return log10LikelihoodsOfAC[AF1p]; } @@ -199,7 +199,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFEq0() { return log10PriorsOfAC[AF0]; } @@ -209,7 +209,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFGT0() { return log10PriorsOfAC[AF1p]; } @@ -263,7 +263,7 @@ public class AFCalcResult { * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping * @return the log10 probability that allele is segregating at this site */ - @Ensures("goodLog10Probability(result)") + @Ensures("MathUtils.goodLog10Probability(result)") public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { final Double log10pNonRef = log10pNonRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); @@ -279,7 +279,7 @@ public class AFCalcResult { * @return freshly allocated log10 normalized posteriors vector */ @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") - @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) @@ -287,29 +287,6 @@ public class AFCalcResult { return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - /** * Computes the offset into linear vectors indexed by alt allele for allele * @@ -331,14 +308,4 @@ public class AFCalcResult { else return index - 1; } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @return true if result is really well formed - */ - private static boolean goodLog10Probability(final double result) { - return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java deleted file mode 100644 index 5c926a4d8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - * - * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? - */ -class AFCalcResultTracker { - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - protected double log10MLE; - protected double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int LIKELIHOODS_CACHE_SIZE = 5000; - private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; - private int currentLikelihoodsCacheIndex = 0; - protected Double log10LikelihoodsMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - private int[] AClimits; - - int nEvaluations = 0; - - /** - * The list of alleles actually used in computing the AF - */ - private List allelesUsedInGenotyping = null; - - /** - * Create a results object capability of storing results for calls with up to maxAltAlleles - * - * @param maxAltAlleles an integer >= 1 - */ - public AFCalcResultTracker(final int maxAltAlleles) { - if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); - - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - - reset(); - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MLE - * - * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, - * starting from index 0 (i.e., the first alt allele is at 0). The vector is always - * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values - * are meaningful. - * - * @return a vector with allele counts, not all of which may be meaningful - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MAP - * - * @see #getAlleleCountsOfMLE() for the encoding of results in this vector - * - * @return a non-null vector of ints - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return - */ - public double getLog10LikelihoodOfAFNotZero() { - if ( log10LikelihoodsMatrixSum == null ) { - if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have - log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; - else - log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - } - return log10LikelihoodsMatrixSum; - } - - public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { - return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { - final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; - final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); - for ( int i = 0; i < subACOfMLE.length; i++ ) { - final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); - } - - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); - } - - // -------------------------------------------------------------------------------- - // - // Protected mutational methods only for use within the calculation models themselves - // - // -------------------------------------------------------------------------------- - - /** - * Reset the data in this results object, so that it can be used in a subsequent AF calculation - * - * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer - */ - protected void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentLikelihoodsCacheIndex = 0; - log10LikelihoodsMatrixSum = null; - allelesUsedInGenotyping = null; - nEvaluations = 0; - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - } - - /** - * Tell this result we used one more evaluation cycle - */ - protected void incNEvaluations() { - nEvaluations++; - } - - protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToLikelihoodsCache(log10LofK); - - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToLikelihoodsCache(final double log10LofK) { - // add to the cache - log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { - final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - log10LikelihoodsMatrixValues[0] = temporarySum; - currentLikelihoodsCacheIndex = 1; - } - } - - protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } - - protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { - if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) - throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); - - this.allelesUsedInGenotyping = allelesUsedInGenotyping; - } - - protected void setAClimits(int[] AClimits) { - this.AClimits = AClimits; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 49915c515..6b345dcf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,10 +36,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { @@ -60,29 +56,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet); - // keep processing while we have AC conformations that need to be calculated - final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); - while ( !ACqueue.isEmpty() ) { - getResultTracker().incNEvaluations(); // keep track of the number of evaluations + getStateTracker().incNEvaluations(); // keep track of the number of evaluations // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( stateTracker.withinMaxACs(set.getACcounts()) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - // adjust max likelihood seen if needed - stateTracker.update(log10LofKs, set.getACcounts()); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } - return resultFromTracker(vc, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } @Override @@ -153,23 +141,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? - if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -188,7 +174,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -213,9 +199,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -223,8 +209,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. - private void updateACset(final StateTracker stateTracker, - final int[] newSetCounts, + private void updateACset(final int[] newSetCounts, final int numChr, final ExactACset dependentSet, final int PLsetIndex, @@ -246,8 +231,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -258,8 +242,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; } @@ -281,14 +265,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 093bf47d5..88f5e06e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -16,10 +16,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 19e253277..3eb32d35e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -1,35 +1,85 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + /** - * Keeps track of the best state seen by the exact model and the max states to visit - * allowing us to abort the search before we visit the entire matrix of AC x samples + * Keeps track of the state information during the exact model AF calculation. + * + * Tracks things like the MLE and MAP AC values, their corresponding likelhood and posterior + * values, the likelihood of the AF == 0 state, and the number of evaluations needed + * by the calculation to compute the P(AF == 0) */ final class StateTracker { - public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - final private int[] maxACsToConsider; - - private ExactACcounts ACsAtMax = null; - private double maxLog10L = Double.NEGATIVE_INFINITY; - - public StateTracker() { - this(null); - } - - public StateTracker(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 /** - * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state - * - * @param log10LofKs the likelihood of our current configuration state + * These variables are intended to contain the MLE and MAP (and their corresponding allele counts) + * of the site over all alternate alleles */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > getMaxLog10L()) { - this.setMaxLog10L(log10LofKs); - this.ACsAtMax = ACs; - } + protected double log10MLE; + protected double log10MAP; + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + */ + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + /** + * A vector of log10 likelihood values seen, for future summation. When the size of the + * vector is exceeed -- because we've pushed more posteriors than there's space to hold + * -- we simply sum up the existing values, make that the first value, and continue. + */ + private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE]; + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private int log10LikelihoodsForAFGt0CacheIndex = 0; + + /** + * The actual sum of the likelihoods. Null if the sum hasn't been computed yet + */ + protected Double log10LikelihoodsForAFGt0Sum = null; + + /** + * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + */ + private double log10LikelihoodOfAFzero = 0.0; + + /** + * The number of evaluates we've gone through in the AFCalc + */ + private int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public StateTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); } /** @@ -39,58 +89,200 @@ final class StateTracker { * @param log10LofK the log10 likelihood of the configuration we're considering analyzing * @return true if the configuration cannot meaningfully contribute to our likelihood sum */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + private boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY; } /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + private boolean isLowerAC(final ExactACcounts otherACs) { + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < otherACcounts.length; i++ ) { + if ( alleleCountsOfMLE[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + /** + * Should we stop exploring paths from ACs, given it's log10LofK * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise + * @param log10LofK the log10LofK of these ACs + * @param ACs the ACs of this state + * @return return true if there's no reason to continue with subpaths of AC, or false otherwise */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; + protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) { + return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs)); + } - final int[] otherACcounts = otherACs.getCounts(); + @Ensures("result != null") + protected int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; + @Ensures("result >= 0") + protected int getnEvaluations() { + return nEvaluations; } /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + * Returns the likelihoods summed across all AC values for AC > 0 + * + * @return */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null ) - return true; + private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + if ( log10LikelihoodsForAFGt0Sum == null ) { + if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + } + return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + } - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); + /** + * @return + */ + private double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; + /** + * Convert this state to an corresponding AFCalcResult. + * + * Assumes that the values in this state have been filled in with meaningful values during the calculation. + * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful + * values, etc. + * + * @param log10PriorsByAC + * + * @return + */ + @Requires("allelesUsedInGenotyping != null") + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + // TODO -- replace with more meaningful computation + // TODO -- refactor this calculation into the ref calculation + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); } - return true; + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); } - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED; + log10LikelihoodsForAFGt0CacheIndex = 0; + log10LikelihoodsForAFGt0Sum = null; + allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(alleleCountsOfMLE, 0); + Arrays.fill(alleleCountsOfMAP, 0); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); } - public double getMaxLog10L() { - return maxLog10L; + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; } - public void setMaxLog10L(double maxLog10L) { - this.maxLog10L = maxLog10L; + /** + * Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length); + } + } + + /** + * Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state + * + * @param log10PofK the posterior of our current configuration state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") + protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { + if ( log10PofK > log10MAP ) { + log10MAP = log10PofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) + private void addToLikelihoodsCache(final double log10LofK) { + // add to the cache + log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); + log10LikelihoodsForAFGt0[0] = temporarySum; + log10LikelihoodsForAFGt0CacheIndex = 1; + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } + + /** + * Set the list of alleles used in genotyping + * + * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference + */ + @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"}) + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + if ( allelesUsedInGenotyping.get(0).isNonReference() ) + throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3740d5d7c..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1194,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that