Original EXACT model implementation lives, providing another reference (bi-allelic only) EXACT model

-- Potentially a very fast implementation (it's very clean) but restricted to the biallelic case
-- A starting point for future bi-allelic only optimized (logless) or generalized (bi-allelic general ploidy) implementations
-- Added systematic unit tests covering this implementation, and comparing it to others
-- Uncovered a nasty normalization bug in StateTracker that was capping our likelihoods at 0, even after summing up multiple likelihoods, which is just not safe to do and was causing us to lose likelihood in some cases
-- Removed the restriction that a likelihood be <= 0 in StateTracker, and the protection for these cases in GeneralPloidyExactAFCalc which just wasn't right
This commit is contained in:
Mark DePristo 2012-10-19 19:33:47 -04:00
parent 0fb8274507
commit 0fcd358ace
4 changed files with 149 additions and 56 deletions

View File

@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts);
getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts);
getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
return log10LofK;
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest {
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
Arrays.asList(
AFCalcFactory.Calculation.EXACT_REFERENCE,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
), 4, 2, 2, 2);
List<AFCalc> calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2);
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest {
new GetGLsTest(model, 1, genotypes, priors, priorName);
// tri-allelic
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) )
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
new GetGLsTest(model, 2, genotypes, priors, priorName);
}
@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest {
return GetGLsTest.getTests(GetGLsTest.class);
}
@DataProvider(name = "badGLs")
public Object[][] createBadGLs() {
final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
final int nSamples = genotypes.size();
// @DataProvider(name = "badGLs")
// public Object[][] createBadGLs() {
// final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
// final int nSamples = genotypes.size();
//
// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
//
// final int nPriorValues = 2*nSamples+1;
// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
// for ( AFCalc model : Arrays.asList(indCalc) ) {
// final String priorName = "flat";
// new GetGLsTest(model, 2, genotypes, priors, priorName);
// }
//
// return GetGLsTest.getTests(GetGLsTest.class);
// }
final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
final int nPriorValues = 2*nSamples+1;
final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
for ( AFCalc model : Arrays.asList(indCalc) ) {
final String priorName = "flat";
new GetGLsTest(model, 2, genotypes, priors, priorName);
}
return GetGLsTest.getTests(GetGLsTest.class);
}
//
// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
// public void testBadGLs(GetGLsTest cfg) {
// testResultSimple(cfg);
// }
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testBiallelicGLs(GetGLsTest cfg) {
@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest {
testResultSimple(cfg);
}
@Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
public void testBadGLs(GetGLsTest cfg) {
testResultSimple(cfg);
}
private static class NonInformativeData {
final Genotype nonInformative;
final List<Genotype> called;
@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest {
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
final int nSamples = samples.size();
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
Arrays.asList(
AFCalcFactory.Calculation.EXACT_REFERENCE,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
), 4, 2, 2, 2);
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2);
final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors
for ( AFCalc model : calcs ) {
if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc )
continue;
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest {
"Actual pNonRef not within tolerance " + tolerance + " of expected");
}
@DataProvider(name = "PNonRefBiallelicSystematic")
public Object[][] makePNonRefBiallelicSystematic() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Integer> bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000);
final List<List<Integer>> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true));
for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) {
if ( false ) { // for testing only
tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))});
} else {
if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK
// test all combinations of PLs for 1 sample
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) {
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
}
final List<List<Integer>> smallDiploidPLs = new LinkedList<List<Integer>>();
for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) {
for ( int i = 0; i < 2; i++ ) {
List<Integer> pls = new ArrayList<Integer>(Collections.nCopies(3, nonRefPL));
pls.set(i, 0);
smallDiploidPLs.add(pls);
}
}
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) {
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
}
}
}
return tests.toArray(new Object[][]{});
}
final List<List<Integer>> removeBadPLs(List<List<Integer>> listOfPLs) {
List<List<Integer>> clean = new LinkedList<List<Integer>>();
for ( final List<Integer> PLs : listOfPLs ) {
int x = PLs.get(0);
boolean bad = false;
for ( int pl1 : PLs )
if ( pl1 > x )
bad = true;
else
x = pl1;
if ( ! bad ) clean.add(PLs);
}
return clean;
}
private List<Genotype> toGenotypes(final List<List<Integer>> PLsPerSample) {
final List<Allele> nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
final List<Genotype> genotypes = new ArrayList<Genotype>(PLsPerSample.size());
for ( final List<Integer> PLs : PLsPerSample ) {
final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3]));
final int min = MathUtils.arrayMin(pls);
for ( int i = 0; i < pls.length; i++ ) pls[i] -= min;
genotypes.add(makePL(nocall, pls));
}
return genotypes;
}
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic")
private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List<Genotype> genotypes) {
//logger.warn("Running " + modelType + " with " + genotypes);
final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human);
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human);
final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C));
vcb.genotypes(genotypes);
final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
final double tolerance = 1e-3;
Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance,
"Actual pNonRef not within tolerance " + tolerance + " of expected");
Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(),
"Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE()));
}
// --------------------------------------------------------------------------------
//
// Test priors

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -20,15 +21,22 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
final Pair<Integer, Integer> result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
final int lastK = result.getFirst();
final int mleK = result.getSecond();
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1);
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0};
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors);
final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO;
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef);
return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(),
MathUtils.normalizeFromLog10(log10Likelihoods, true),
MathUtils.normalizeFromLog10(log10Priors, true),
log10pNonRefByAllele);
}
/**
@ -68,11 +76,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
}
}
public int linearExact(final VariantContext vc,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyLikelihoods,
double[] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
public Pair<Integer, Integer> linearExact(final VariantContext vc,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyLikelihoods,
double[] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
@ -81,7 +89,7 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
double maxLog10L = Double.NEGATIVE_INFINITY;
boolean done = false;
int lastK = -1;
int lastK = -1, mleK = -1;
for (int k=0; k <= numChr && ! done; k++ ) {
final double[] kMinus0 = logY.getkMinus0();
@ -127,7 +135,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
// can we abort early?
lastK = k;
maxLog10L = Math.max(maxLog10L, log10LofK);
if ( log10LofK > maxLog10L ) {
maxLog10L = log10LofK;
mleK = k;
}
if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
done = true;
@ -136,6 +148,6 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
logY.rotate();
}
return lastK;
return new Pair<Integer, Integer>(lastK, mleK);
}
}

View File

@ -131,14 +131,14 @@ final class StateTracker {
/**
* @return the likelihoods summed across all AC values for AC > 0
*/
private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
private double getLog10LikelihoodOfAFNotZero() {
if ( log10LikelihoodsForAFGt0Sum == null ) {
if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO;
else
log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
}
return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
return log10LikelihoodsForAFGt0Sum;
}
/**
@ -162,7 +162,7 @@ final class StateTracker {
@Requires("allelesUsedInGenotyping != null")
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
@ -210,7 +210,7 @@ final class StateTracker {
* @param log10LofK the likelihood of our current configuration state, cannot be the 0 state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"})
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
@Ensures("log10MLE == Math.max(log10LofK, log10MLE)")
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToLikelihoodsCache(log10LofK);
@ -227,7 +227,7 @@ final class StateTracker {
* @param log10PofK the posterior of our current configuration state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"})
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
@Ensures("log10MAP == Math.max(log10PofK, log10MAP)")
protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) {
if ( log10PofK > log10MAP ) {
@ -236,7 +236,6 @@ final class StateTracker {
}
}
@Requires({"MathUtils.goodLog10Probability(log10LofK)"})
private void addToLikelihoodsCache(final double log10LofK) {
// add to the cache
log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK;
@ -250,7 +249,6 @@ final class StateTracker {
}
}
@Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"})
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
@ -259,7 +257,7 @@ final class StateTracker {
}
}
@Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"})
@Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"})
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;