Merge AFCalcResultTracker into StateTracker, cleanup

-- These two classes were really the same, and now they are actually the same!
-- Cleanuped the interfaces, removed duplicate data
-- Added lots of contracts, some of which found numerical issues with GeneralPloidyExactAFCalc (which have been patched over but not fixed)
-- Moved goodProbability and goodProbabilityVector utilities to MathUtils.  Very useful for contracts!
This commit is contained in:
Mark DePristo 2012-10-17 20:41:33 -04:00
parent 9c63cee9fc
commit 99c9031cb4
9 changed files with 440 additions and 626 deletions

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -69,8 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
return resultFromTracker(vc, log10AlleleFrequencyPriors);
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
/**
@ -171,13 +170,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param numAlleles Number of alternate alleles
* @param ploidyPerPool Number of samples per pool
* @param log10AlleleFrequencyPriors Frequency priors
* @param resultTracker object to fill with output values
*/
protected static void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
protected void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
@ -196,24 +193,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
if ( genotypeLikelihoods.size() <= 1 ) {
// no meaningful GLs at all, just set the tracker to non poly values
resultTracker.reset(); // just mimic-ing call below
resultTracker.setLog10LikelihoodOfAFzero(0.0);
getStateTracker().reset(); // just mimic-ing call below
getStateTracker().setLog10LikelihoodOfAFzero(0.0);
} else {
for (int p=1; p<genotypeLikelihoods.size(); p++) {
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, resultTracker);
getStateTracker().reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p),
combinedPloidy, ploidyPerPool, numAlleles, log10AlleleFrequencyPriors);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
}
}
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
public CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool,
double[] newGL,
int originalPloidy,
int newGLPloidy,
int numAlleles,
final double[] log10AlleleFrequencyPriors) {
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
@ -230,16 +227,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
StateTracker stateTracker = new StateTracker();
while ( !ACqueue.isEmpty() ) {
resultTracker.incNEvaluations();
getStateTracker().incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
// adjust max likelihood seen if needed
if ( log10LofKs > stateTracker.getMaxLog10L())
stateTracker.update(log10LofKs, ACset.getACcounts());
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
// clean up memory
indexesToACset.remove(ACset.getACcounts());
@ -260,39 +252,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector
* @param resultTracker AFResult object
* @param stateTracker max likelihood observed so far
* @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue
* @return max log likelihood
*/
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final AFCalcResultTracker resultTracker,
final StateTracker stateTracker,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
private double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute likeihood in "set" of new set based on original likelihoods
final int numAlleles = set.getACcounts().getCounts().length;
final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
// add to new pool
if (!Double.isInfinite(log10LofK))
newPool.add(set);
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
// TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
return log10LofK;
}
@ -323,67 +308,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
}
/**
* Naive combiner of two multiallelic pools - number of alt alleles must be the same.
* Math is generalization of biallelic combiner.
*
* For vector K representing an allele count conformation,
* Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
* where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
* @param originalPool First log-likelihood pool GL vector
* @param yy Second pool GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param numAlleles Number of alleles
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param resultTracker Af calculation result object
*/
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
/*
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
if (dim1 != originalPool.getLength() || dim2 != yy.length)
throw new ReviewedStingException("BUG: Inconsistent vector length");
if (ploidy2 == 0)
return;
final int newPloidy = ploidy1 + ploidy2;
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// and L2(K) = Pr(D|AC2=K) * choose(m2,K)
GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
final double[] x = originalPool.getLikelihoodsAsVector(true);
while(firstIterator.hasNext()) {
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
firstIterator.next();
}
GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
final double[] y = yy.clone();
while(secondIterator.hasNext()) {
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
secondIterator.next();
}
// initialize output to -log10(choose(m1+m2,[k1 k2...])
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
while(outputIterator.hasNext()) {
final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
originalPool.add(likelihood, set, outputIterator.getLinearIndex());
outputIterator.next();
}
*/
}
// /**
// * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
// * Math is generalization of biallelic combiner.
// *
// * For vector K representing an allele count conformation,
// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
// * @param originalPool First log-likelihood pool GL vector
// * @param yy Second pool GL vector
// * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
// * @param ploidy2 Ploidy of second pool
// * @param numAlleles Number of alleles
// * @param log10AlleleFrequencyPriors Array of biallelic priors
// * @param resultTracker Af calculation result object
// */
// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
// final double[] log10AlleleFrequencyPriors,
// final AFCalcResultTracker resultTracker) {
///*
// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
//
// if (dim1 != originalPool.getLength() || dim2 != yy.length)
// throw new ReviewedStingException("BUG: Inconsistent vector length");
//
// if (ploidy2 == 0)
// return;
//
// final int newPloidy = ploidy1 + ploidy2;
//
// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
// final double[] x = originalPool.getLikelihoodsAsVector(true);
// while(firstIterator.hasNext()) {
// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
// firstIterator.next();
// }
//
// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
// final double[] y = yy.clone();
// while(secondIterator.hasNext()) {
// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
// secondIterator.next();
// }
//
// // initialize output to -log10(choose(m1+m2,[k1 k2...])
// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
//
//
// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
// while(outputIterator.hasNext()) {
// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
//
// originalPool.add(likelihood, set, outputIterator.getLinearIndex());
// outputIterator.next();
// }
//*/
// }
/**
* Compute likelihood of a particular AC conformation and update AFresult object
@ -394,15 +379,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool
* @param resultTracker AFResult object
* @return log-likehood of requested conformation
*/
private static double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2,
final AFCalcResultTracker resultTracker) {
private double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2) {
final int newPloidy = ploidy1 + ploidy2;
@ -420,8 +403,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.getLog10Likelihoods()[0] = log10Lof0;
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return log10Lof0;
} else {
@ -464,14 +447,14 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts);
return log10LofK;
}
@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
return (sum == ploidy);
}
/**
* Combines naively two biallelic pools (of arbitrary size).
* For two pools of size m1 and m2, we can compute the combined likelihood as:
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
* @param newPLVector Second GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param resultTracker Af calculation result object
* @return Combined likelihood vector
*/
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final int newPloidy = ploidy1 + ploidy2;
final double[] combinedLikelihoods = new double[1+newPloidy];
/** Pre-fill result array and incorporate weights into input vectors
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
* equation reduces to
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
* which is just plain convolution of L1 and L2 (with pre-existing vector)
*/
// intialize result vector to -infinity
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
x[k] = Double.NEGATIVE_INFINITY;
final double[] y = newPLVector.clone();
final double log10Lof0 = x[0]+y[0];
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
double maxElement = log10Lof0;
int maxElementIdx = 0;
int[] alleleCounts = new int[1];
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
double[] acc = new double[k+1];
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
double innerMax = Double.NEGATIVE_INFINITY;
for (int j=0; j <=k; j++) {
double x1,y1;
if (k-j>=0 && k-j < y.length)
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
else
continue;
if (j < x.length)
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
else
continue;
if (Double.isInfinite(x1) || Double.isInfinite(y1))
continue;
acc[j] = x1 + y1;
if (acc[j] > innerMax)
innerMax = acc[j];
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
break;
}
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
maxElementIdx = k;
double maxDiff = combinedLikelihoods[k] - maxElement;
if (maxDiff > 0)
maxElement = combinedLikelihoods[k];
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
break;
}
alleleCounts[0] = k;
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
}
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
}
/**
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
* including updating the PL's, and assign genotypes accordingly
@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
*
* @return genotype
*/
private static void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
private void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
final int numNewAltAlleles = allelesToUse.size() - 1;

View File

@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy);
calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele];
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}

View File

@ -50,7 +50,7 @@ public abstract class AFCalc implements Cloneable {
protected Logger logger = defaultLogger;
private SimpleTimer callTimer = new SimpleTimer();
private final AFCalcResultTracker resultTracker;
private final StateTracker stateTracker;
private ExactCallLogger exactCallLogger = null;
protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
@ -62,7 +62,7 @@ public abstract class AFCalc implements Cloneable {
this.nSamples = nSamples;
this.maxAlternateAllelesToGenotype = maxAltAlleles;
this.maxAlternateAllelesForIndels = maxAltAllelesForIndels;
this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
}
public void enableProcessLog(final File exactCallsLog) {
@ -83,10 +83,10 @@ public abstract class AFCalc implements Cloneable {
public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
// reset the result, so we can store our new result there
resultTracker.reset();
stateTracker.reset();
final VariantContext vcWorking = reduceScope(vc);
@ -100,10 +100,20 @@ public abstract class AFCalc implements Cloneable {
return result;
}
@Deprecated
protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors);
/**
* Convert the final state of the state tracker into our result as an AFCalcResult
*
* Assumes that stateTracker has been updated accordingly
*
* @param vcWorking the VariantContext we actually used as input to the calc model (after reduction)
* @param log10AlleleFrequencyPriors the priors by AC vector
* @return a AFCalcResult describing the result of this calculation
*/
@Requires("stateTracker.getnEvaluations() > 0")
@Ensures("result != null")
protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors);
}
// ---------------------------------------------------------------------------
@ -162,8 +172,8 @@ public abstract class AFCalc implements Cloneable {
return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels);
}
public AFCalcResultTracker getResultTracker() {
return resultTracker;
public StateTracker getStateTracker() {
return stateTracker;
}
}

View File

@ -83,8 +83,8 @@ public class AFCalcResult {
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
this.alleleCountsOfMLE = alleleCountsOfMLE;
this.nEvaluations = nEvaluations;
@ -147,7 +147,7 @@ public class AFCalcResult {
* Due to computational / implementation constraints this may be smaller than
* the actual list of alleles requested
*
* @return a non-empty list of alleles used during genotyping
* @return a non-empty list of alleles used during genotyping, the first of which is the reference allele
*/
@Ensures({"result != null", "! result.isEmpty()"})
public List<Allele> getAllelesUsedInGenotyping() {
@ -159,7 +159,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PosteriorOfAFEq0() {
return log10PosteriorsOfAC[AF0];
}
@ -169,7 +169,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PosteriorOfAFGT0() {
return log10PosteriorsOfAC[AF1p];
}
@ -179,7 +179,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFEq0() {
return log10LikelihoodsOfAC[AF0];
}
@ -189,7 +189,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFGT0() {
return log10LikelihoodsOfAC[AF1p];
}
@ -199,7 +199,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PriorOfAFEq0() {
return log10PriorsOfAC[AF0];
}
@ -209,7 +209,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PriorOfAFGT0() {
return log10PriorsOfAC[AF1p];
}
@ -263,7 +263,7 @@ public class AFCalcResult {
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
* @return the log10 probability that allele is segregating at this site
*/
@Ensures("goodLog10Probability(result)")
@Ensures("MathUtils.goodLog10Probability(result)")
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
@ -279,7 +279,7 @@ public class AFCalcResult {
* @return freshly allocated log10 normalized posteriors vector
*/
@Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length")
@Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
@Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) {
final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length];
for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ )
@ -287,29 +287,6 @@ public class AFCalcResult {
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
}
/**
* Check that the log10 prob vector vector is well formed
*
* @param vector
* @param expectedSize
* @param shouldSumToOne
*
* @return true if vector is well-formed, false otherwise
*/
private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
if ( vector.length != expectedSize ) return false;
for ( final double pr : vector ) {
if ( ! goodLog10Probability(pr) )
return false;
}
if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 )
return false;
return true; // everything is good
}
/**
* Computes the offset into linear vectors indexed by alt allele for allele
*
@ -331,14 +308,4 @@ public class AFCalcResult {
else
return index - 1;
}
/**
* Checks that the result is a well-formed log10 probability
*
* @param result a supposedly well-formed log10 probability value
* @return true if result is really well formed
*/
private static boolean goodLog10Probability(final double result) {
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
}
}

View File

@ -1,256 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Dec 14, 2011
*
* Useful helper class to communicate the results of the allele frequency calculation
*
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
*/
class AFCalcResultTracker {
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
protected double log10MLE;
protected double log10MAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// The posteriors seen, not including that of AF=0
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE];
private int currentLikelihoodsCacheIndex = 0;
protected Double log10LikelihoodsMatrixSum = null;
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
private int[] AClimits;
int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capability of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public AFCalcResultTracker(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*
* @return a vector with allele counts, not all of which may be meaningful
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MAP
*
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
*
* @return a non-null vector of ints
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* Returns the likelihoods summed across all AC values for AC > 0
*
* @return
*/
public double getLog10LikelihoodOfAFNotZero() {
if ( log10LikelihoodsMatrixSum == null ) {
if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO;
else
log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
}
return log10LikelihoodsMatrixSum;
}
public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero;
}
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
// TODO -- replace with more meaningful computation
// TODO -- refactor this calculation into the ref calculation
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
for ( int i = 0; i < subACOfMLE.length; i++ ) {
final Allele allele = allelesUsedInGenotyping.get(i+1);
final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
log10pNonRefByAllele.put(allele, log10PNonRef);
}
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
*
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
*/
protected void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED;
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
currentLikelihoodsCacheIndex = 0;
log10LikelihoodsMatrixSum = null;
allelesUsedInGenotyping = null;
nEvaluations = 0;
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
}
/**
* Tell this result we used one more evaluation cycle
*/
protected void incNEvaluations() {
nEvaluations++;
}
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToLikelihoodsCache(log10LofK);
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMLE[i] = alleleCountsForK[i];
}
}
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMAP[i] = alleleCountsForK[i];
}
}
private void addToLikelihoodsCache(final double log10LofK) {
// add to the cache
log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
log10LikelihoodsMatrixValues[0] = temporarySum;
currentLikelihoodsCacheIndex = 1;
}
}
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
Arrays.fill(alleleCountsOfMLE, 0);
}
}
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
protected void setAClimits(int[] AClimits) {
this.AClimits = AClimits;
}
}

View File

@ -36,10 +36,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
}
protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
return new StateTracker();
}
@Override
protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
@ -60,29 +56,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker());
while ( !ACqueue.isEmpty() ) {
getResultTracker().incNEvaluations(); // keep track of the number of evaluations
getStateTracker().incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
if ( stateTracker.withinMaxACs(set.getACcounts()) ) {
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker());
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors);
// adjust max likelihood seen if needed
stateTracker.update(log10LofKs, set.getACcounts());
// clean up memory
indexesToACset.remove(set.getACcounts());
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
// clean up memory
indexesToACset.remove(set.getACcounts());
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
return resultFromTracker(vc, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
@Override
@ -153,23 +141,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
private double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final StateTracker stateTracker,
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final double[] log10AlleleFrequencyPriors) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker);
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors);
final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// can we abort early because the log10Likelihoods are so small?
if ( stateTracker.abort(log10LofK, set.getACcounts()) ) {
if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) {
//if ( DEBUG )
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK;
@ -188,7 +174,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
ACcountsClone[allele]++;
// to get to this conformation, a sample would need to be AB (remember that ref=0)
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
@ -213,9 +199,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
for ( DependentSet dependent : sameAlleles )
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
return log10LofK;
@ -223,8 +209,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also pushes its value to the given callingSetIndex.
private void updateACset(final StateTracker stateTracker,
final int[] newSetCounts,
private void updateACset(final int[] newSetCounts,
final int numChr,
final ExactACset dependentSet,
final int PLsetIndex,
@ -246,8 +231,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
private void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final double[] log10AlleleFrequencyPriors) {
set.getLog10Likelihoods()[0] = 0.0; // the zero case
final int totalK = set.getACsum();
@ -258,8 +242,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
@ -281,14 +265,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// update the MLE if necessary
resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
// apply the priors over each alternate allele
for ( final int ACcount : set.getACcounts().getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
}
private void pushData(final ExactACset targetSet,

View File

@ -16,10 +16,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
return new StateTracker();
}
@Override
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];

View File

@ -1,35 +1,85 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Keeps track of the best state seen by the exact model and the max states to visit
* allowing us to abort the search before we visit the entire matrix of AC x samples
* Keeps track of the state information during the exact model AF calculation.
*
* Tracks things like the MLE and MAP AC values, their corresponding likelhood and posterior
* values, the likelihood of the AF == 0 state, and the number of evaluations needed
* by the calculation to compute the P(AF == 0)
*/
final class StateTracker {
public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
final private int[] maxACsToConsider;
private ExactACcounts ACsAtMax = null;
private double maxLog10L = Double.NEGATIVE_INFINITY;
public StateTracker() {
this(null);
}
public StateTracker(final int[] maxACsToConsider) {
this.maxACsToConsider = maxACsToConsider;
}
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
/**
* Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state
*
* @param log10LofKs the likelihood of our current configuration state
* These variables are intended to contain the MLE and MAP (and their corresponding allele counts)
* of the site over all alternate alleles
*/
public void update(final double log10LofKs, final ExactACcounts ACs) {
if ( log10LofKs > getMaxLog10L()) {
this.setMaxLog10L(log10LofKs);
this.ACsAtMax = ACs;
}
protected double log10MLE;
protected double log10MAP;
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*/
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
/**
* A vector of log10 likelihood values seen, for future summation. When the size of the
* vector is exceeed -- because we've pushed more posteriors than there's space to hold
* -- we simply sum up the existing values, make that the first value, and continue.
*/
private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE];
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
private int log10LikelihoodsForAFGt0CacheIndex = 0;
/**
* The actual sum of the likelihoods. Null if the sum hasn't been computed yet
*/
protected Double log10LikelihoodsForAFGt0Sum = null;
/**
* Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
*/
private double log10LikelihoodOfAFzero = 0.0;
/**
* The number of evaluates we've gone through in the AFCalc
*/
private int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capability of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public StateTracker(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
@ -39,58 +89,200 @@ final class StateTracker {
* @param log10LofK the log10 likelihood of the configuration we're considering analyzing
* @return true if the configuration cannot meaningfully contribute to our likelihood sum
*/
public boolean tooLowLikelihood(final double log10LofK) {
return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY;
private boolean tooLowLikelihood(final double log10LofK) {
return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY;
}
/**
* Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider?
* @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
*/
private boolean isLowerAC(final ExactACcounts otherACs) {
final int[] otherACcounts = otherACs.getCounts();
for ( int i = 0; i < otherACcounts.length; i++ ) {
if ( alleleCountsOfMLE[i] > otherACcounts[i] )
return false;
}
return true;
}
/**
* Should we stop exploring paths from ACs, given it's log10LofK
*
* @param otherACs the set of otherACs that we want to know if we should consider analyzing
* @return true if otherACs is a state worth considering, or false otherwise
* @param log10LofK the log10LofK of these ACs
* @param ACs the ACs of this state
* @return return true if there's no reason to continue with subpaths of AC, or false otherwise
*/
public boolean withinMaxACs(final ExactACcounts otherACs) {
if ( maxACsToConsider == null )
return true;
protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) {
return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs));
}
final int[] otherACcounts = otherACs.getCounts();
@Ensures("result != null")
protected int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
for ( int i = 0; i < maxACsToConsider.length; i++ ) {
// consider one more than the max AC to collect a bit more likelihood mass
if ( otherACcounts[i] > maxACsToConsider[i] + 1 )
return false;
}
return true;
@Ensures("result >= 0")
protected int getnEvaluations() {
return nEvaluations;
}
/**
* returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
* Returns the likelihoods summed across all AC values for AC > 0
*
* @return
*/
public boolean isLowerAC(final ExactACcounts otherACs) {
if ( ACsAtMax == null )
return true;
private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
if ( log10LikelihoodsForAFGt0Sum == null ) {
if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO;
else
log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
}
return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
}
final int[] myACcounts = this.ACsAtMax.getCounts();
final int[] otherACcounts = otherACs.getCounts();
/**
* @return
*/
private double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
for ( int i = 0; i < myACcounts.length; i++ ) {
if ( myACcounts[i] > otherACcounts[i] )
return false;
/**
* Convert this state to an corresponding AFCalcResult.
*
* Assumes that the values in this state have been filled in with meaningful values during the calculation.
* For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful
* values, etc.
*
* @param log10PriorsByAC
*
* @return
*/
@Requires("allelesUsedInGenotyping != null")
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
// TODO -- replace with more meaningful computation
// TODO -- refactor this calculation into the ref calculation
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
for ( int i = 0; i < subACOfMLE.length; i++ ) {
final Allele allele = allelesUsedInGenotyping.get(i+1);
final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
log10pNonRefByAllele.put(allele, log10PNonRef);
}
return true;
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
public boolean abort( final double log10LofK, final ExactACcounts ACs ) {
return tooLowLikelihood(log10LofK) && isLowerAC(ACs);
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
*
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
*/
protected void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED;
log10LikelihoodsForAFGt0CacheIndex = 0;
log10LikelihoodsForAFGt0Sum = null;
allelesUsedInGenotyping = null;
nEvaluations = 0;
Arrays.fill(alleleCountsOfMLE, 0);
Arrays.fill(alleleCountsOfMAP, 0);
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
}
public double getMaxLog10L() {
return maxLog10L;
/**
* Tell this result we used one more evaluation cycle
*/
protected void incNEvaluations() {
nEvaluations++;
}
public void setMaxLog10L(double maxLog10L) {
this.maxLog10L = maxLog10L;
/**
* Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state
*
* @param log10LofK the likelihood of our current configuration state, cannot be the 0 state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"})
@Ensures("log10MLE == Math.max(log10LofK, log10MLE)")
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToLikelihoodsCache(log10LofK);
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
}
}
/**
* Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state
*
* @param log10PofK the posterior of our current configuration state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"})
@Ensures("log10MAP == Math.max(log10PofK, log10MAP)")
protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) {
if ( log10PofK > log10MAP ) {
log10MAP = log10PofK;
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
}
}
@Requires({"MathUtils.goodLog10Probability(log10LofK)"})
private void addToLikelihoodsCache(final double log10LofK) {
// add to the cache
log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) {
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
log10LikelihoodsForAFGt0[0] = temporarySum;
log10LikelihoodsForAFGt0CacheIndex = 1;
}
}
@Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"})
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
Arrays.fill(alleleCountsOfMLE, 0);
}
}
@Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"})
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
/**
* Set the list of alleles used in genotyping
*
* @param allelesUsedInGenotyping the list of alleles, where the first allele is reference
*/
@Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"})
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
if ( allelesUsedInGenotyping.get(0).isNonReference() )
throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele");
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
}

View File

@ -1194,6 +1194,39 @@ public class MathUtils {
return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
}
/**
* Check that the log10 prob vector vector is well formed
*
* @param vector
* @param expectedSize
* @param shouldSumToOne
*
* @return true if vector is well-formed, false otherwise
*/
public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
if ( vector.length != expectedSize ) return false;
for ( final double pr : vector ) {
if ( ! goodLog10Probability(pr) )
return false;
}
if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 )
return false;
return true; // everything is good
}
/**
* Checks that the result is a well-formed log10 probability
*
* @param result a supposedly well-formed log10 probability value
* @return true if result is really well formed
*/
public static boolean goodLog10Probability(final double result) {
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
}
/**
* A utility class that computes on the fly average and standard deviation for a stream of numbers.
* The number of observations does not have to be known in advance, and can be also very big (so that