Continuing work on IndependentAllelesDiploidExactAFCalc

-- Continuing to get IndependentAllelesDiploidExactAFCalc working correctly.  A long way towards the right answer now, but still not there
-- Restored (but not tested) OriginalDiploidExactAFCalc, the clean diploid O(N) version for Ryan
-- MathUtils.normalizeFromLog10 no longer returns -Infinity when kept in log space, enforces the min log10 value there
-- New convenience method in VariantContext that looks up the allele index in the alleles
This commit is contained in:
Mark DePristo 2012-10-10 20:22:23 -04:00
parent 176b74095d
commit 6bbe750e03
8 changed files with 408 additions and 101 deletions

View File

@ -154,7 +154,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@DataProvider(name = "badGLs")
public Object[][] createBadGLs() {
final List<Genotype> genotypes = Arrays.asList(AA2, AB2, AC2);
final List<Genotype> genotypes = Arrays.asList(AB2, CC2, CC2, CC2);
final int nSamples = genotypes.size();
final ExactAFCalc indCalc = new IndependentAllelesDiploidExactAFCalc(nSamples, 4);
@ -169,13 +169,13 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return GetGLsTest.getTests(GetGLsTest.class);
}
@Test(enabled = false, dataProvider = "wellFormedGLs")
@Test(enabled = true, dataProvider = "wellFormedGLs")
public void testBiallelicGLs(GetGLsTest cfg) {
if ( cfg.getAlleles().size() == 2 )
testResultSimple(cfg);
}
@Test(enabled = false, dataProvider = "wellFormedGLs")
@Test(enabled = true, dataProvider = "wellFormedGLs")
public void testTriallelicGLs(GetGLsTest cfg) {
if ( cfg.getAlleles().size() > 2 )
testResultSimple(cfg);
@ -236,7 +236,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs")
@Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
final AFCalcResult expected = onlyInformative.execute();
final AFCalcResult actual = withNonInformative.execute();
@ -251,9 +251,6 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);
// final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount();
// Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations,
// "Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations);
Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
@ -264,20 +261,10 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
}
// TODO
// TODO -- enable when we understand the contract between AC_MAP and pNonRef
// TODO
// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP());
// if ( AC_MAP > 0 ) {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// } else {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// }
}
private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final ExactAFCalc calc, final boolean onlyPosteriorsShouldBeEqual) {
final double TOLERANCE = 1; // TODO -- tighten up tolerances
final double TOLERANCE = 2; // TODO -- tighten up tolerances -- cannot be tightened up until we finalize the independent alleles model
if ( ! onlyPosteriorsShouldBeEqual ) {
Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
@ -293,6 +280,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
if ( ! a.isReference() ) {
Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
// TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
// if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
// // TODO -- delete when general ploidy works properly with multi-allelics
// Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
@ -300,7 +288,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
}
}
@Test(enabled = false, dataProvider = "Models")
@Test(enabled = true, dataProvider = "Models")
public void testLargeGLs(final ExactAFCalc calc) {
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
@ -311,7 +299,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
Assert.assertEquals(calculatedAlleleCount, 6);
}
@Test(enabled = false, dataProvider = "Models")
@Test(enabled = true, dataProvider = "Models")
public void testMismatchedGLs(final ExactAFCalc calc) {
final Genotype AB = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000);
final Genotype AC = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100);
@ -415,7 +403,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "PNonRef")
@Test(enabled = true, dataProvider = "PNonRef")
private void testPNonRef(final VariantContext vcRoot,
ExactAFCalculationTestBuilder.ModelType modelType,
ExactAFCalculationTestBuilder.PriorType priorType,
@ -452,7 +440,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "Models")
@Test(enabled = true, dataProvider = "Models")
public void testBiallelicPriors(final ExactAFCalc model) {
final int REF_PL = 10;
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
@ -539,7 +527,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "MaxACsToVisit")
@Test(enabled = true, dataProvider = "MaxACsToVisit")
public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final ExactAFCalculationTestBuilder.ModelType modelType) {
final int nAlts = requestedACs.size();
final ExactAFCalculationTestBuilder testBuilder
@ -604,7 +592,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "MaxACsGenotypes")
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();

View File

@ -4,13 +4,13 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
// SEE private/R/pls.R if you want the truth output for these tests
@ -54,16 +54,101 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
/**
 * Data provider for {@link #testCombineGLsWithDrops}: each row supplies the alt allele
 * index to keep, the number of alt alleles, the input genotype PLs, the expected
 * combined bi-allelic PLs, and the set of allele indices to drop entirely.
 */
@DataProvider(name = "TestCombineGLsWithDrops")
public Object[][] makeTestCombineGLsWithDrops() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    final Set<Integer> keepAll = Collections.emptySet();
    final Set<Integer> dropB = Collections.singleton(1);
    final Set<Integer> dropC = Collections.singleton(2);

    // PL ordering for 3 alleles: AA AB BB AC BC CC
    // dropping B (index 1) leaves: AA AC CC
    // dropping C (index 2) leaves: AA AB BB
    cases.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), keepAll});
    cases.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), keepAll});
    cases.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), dropC});
    cases.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), dropB});

    cases.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), keepAll});
    cases.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), keepAll});
    cases.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), dropC});
    cases.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), dropB});

    cases.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), keepAll});
    cases.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), keepAll});
    cases.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), dropC});
    cases.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), dropB});

    return cases.toArray(new Object[cases.size()][]);
}
/** Builds a diploid no-call Genotype that carries exactly the given PL values. */
private Genotype makePL(final int ... PLs) {
    final List<Allele> noCallPair = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
    return ExactAFCalculationModelUnitTest.makePL(noCallPair, PLs);
}
/** Combining with no dropped alleles must behave exactly like the drop-aware variant. */
@Test(enabled = true, dataProvider = "TestCombineGLs")
private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
    final Set<Integer> noDrops = Collections.<Integer>emptySet();
    testCombineGLsWithDrops(altIndex, nAlts, testg, expected, noDrops);
}
// Exercises IndependentAllelesDiploidExactAFCalc.combineGLs: collapses a multi-allelic
// genotype's PLs down to a bi-allelic {XX, XB, BB} genotype, optionally discarding the
// PL entries for any allele index in allelesToDrop before combining.
// NOTE(review): this scraped diff shows both the old and the new combineGLs call on
// consecutive lines; the second form (with allelesToDrop) is the one this commit adds.
@Test(enabled = true, dataProvider = "TestCombineGLsWithDrops")
private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set<Integer> allelesToDrop) {
final IndependentAllelesDiploidExactAFCalc calc = new IndependentAllelesDiploidExactAFCalc(1, 4);
final Genotype combined = calc.combineGLs(testg, altIndex, nAlts);
final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts);
Assert.assertEquals(combined.getPL(), expected.getPL(),
"Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
}
// Shared test alleles: A is the reference; C and G are alternate alleles
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
/**
 * Data provider for {@link #testMakeAlleleConditionalContexts}: pairs an input
 * VariantContext with the ordered list of bi-allelic contexts we expect it to
 * be decomposed into.
 */
@DataProvider(name = "TestMakeAlleleConditionalContexts")
public Object[][] makeTestMakeAlleleConditionalContexts() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    // Builders sharing the same position; only the allele lists differ
    final VariantContextBuilder root  = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
    final VariantContextBuilder vcAC  = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
    final VariantContextBuilder vcAG  = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
    final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
    final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));

    final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
    final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
    final Genotype gACcombined = makePL(0, 2, 5);
    final Genotype gAGcombined = makePL(0, 4, 9);
    final Genotype gACdropped = makePL(0, 1, 2);
    final Genotype gAGdropped = makePL(0, 3, 5);

    // bi-allelic input: expected to pass through as a single context
    cases.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});

    // tri-allelic inputs: first conditional context combines the other alt, the second drops it
    cases.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())});
    cases.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())});

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * Verifies that makeAlleleConditionalContexts splits a VariantContext into the
 * expected ordered bi-allelic contexts, comparing both the allele lists and the
 * per-sample PL vectors.
 */
@Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts")
private void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
    final IndependentAllelesDiploidExactAFCalc calc = new IndependentAllelesDiploidExactAFCalc(1, 4);
    final List<VariantContext> actualVCs = calc.makeAlleleConditionalContexts(vc);

    Assert.assertEquals(actualVCs.size(), expectedVCs.size());

    for ( int i = 0; i < actualVCs.size(); i++ ) {
        final VariantContext actualVC = actualVCs.get(i);
        final VariantContext expectedVC = expectedVCs.get(i);

        Assert.assertEquals(actualVC.getAlleles(), expectedVC.getAlleles());

        for ( int sample = 0; sample < actualVC.getNSamples(); sample++ ) {
            Assert.assertEquals(actualVC.getGenotype(sample).getPL(), expectedVC.getGenotype(sample).getPL());
        }
    }
}
}

View File

@ -121,6 +121,10 @@ class AFCalcResultTracker {
return log10LikelihoodsMatrixSum;
}
public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
@ -141,7 +145,7 @@ class AFCalcResultTracker {
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()};
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
final double[] log10Priors = new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)};
// TODO -- replace with more meaningful computation
@ -153,8 +157,7 @@ class AFCalcResultTracker {
log10pNonRefByAllele.put(allele, log10PNonRef);
}
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping,
MathUtils.normalizeFromLog10(log10Likelihoods, true, true), log10Priors, log10pNonRefByAllele);
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
// --------------------------------------------------------------------------------
@ -178,6 +181,7 @@ class AFCalcResultTracker {
log10LikelihoodsMatrixSum = null;
allelesUsedInGenotyping = null;
nEvaluations = 0;
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
}
/**
@ -212,6 +216,7 @@ class AFCalcResultTracker {
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
log10LikelihoodsMatrixValues[0] = temporarySum;
currentLikelihoodsCacheIndex = 1;
}

View File

@ -67,7 +67,7 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
public AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc);
final List<AFCalcResult> independentResultTrackers = computeLog10PNonRefForEachAllele(vc, log10AlleleFrequencyPriors);
final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, independentResultTrackers, log10AlleleFrequencyPriors);
}
@ -79,47 +79,105 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
// TODO -- can be easily optimized (currently looks at all GLs via getGLs)
for ( int i = 0; i < allGLs.size(); i++ ) {
final double[] GLs = allGLs.get(i);
log10LikelihoodOfHomRef += GLs[0];
log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0];
}
return log10LikelihoodOfHomRef;
// // this value just the likelihood of AF == 0 in the special constrained multi-allelic calculation
// final List<double[]> allGLs = getGLs(vc.getGenotypes(), false);
// final double[] log10LikelihoodOfHomRefs = new double[allGLs.size()];
//
// // TODO -- can be easily optimized (currently looks at all GLs via getGLs)
// for ( int i = 0; i < allGLs.size(); i++ ) {
// final double[] GLs = allGLs.get(i);
// log10LikelihoodOfHomRefs[i] = GLs[0];
// }
//
// return MathUtils.log10sumLog10(log10LikelihoodOfHomRefs);
}
protected List<AFCalcResult> computeLog10PNonRefForEachAllele(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final int nAltAlleles = vc.getNAlleles() - 1;
final List<AFCalcResult> resultTrackers = new ArrayList<AFCalcResult>(nAltAlleles);
/**
* Computes the conditional bi-allelic exact results
*
* Suppose vc contains 2 alt alleles: A* with C and T. This function first computes:
*
* (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
*
* it then computes the conditional probability on AF_c == 0:
*
* (2) P(D | AF_t > 0 && AF_c == 0)
*
* Thinking about this visually, we have the following likelihood matrix where each cell is
* the P(D | AF_c == i && AF_t == j):
*
* 0 AF_c > 0
* -----------------
* 0 | |
* |--|-------------
* a | |
* f | |
* _ | |
* t | |
* > | |
* 0 | |
*
* What we really want to know is how
*
* (3) P(D | AF_c == 0 & AF_t == 0)
*
* compares with
*
* (4) P(D | AF_c > 0 || AF_t > 0)
*
* This is effectively asking for the value in the upper left vs. the sum of all cells.
*
* The quantity (1) is the sum of all cells except those with AF_c == 0, while (2) is the
* band at the top where AF_t > 0 and AF_c == 0
*
* So (4) is actually (1) + (2).
*
* (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating
*
* (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything]
* (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use]
*
* This function implements the conditional likelihoods summation for any number of alt
* alleles (not just the tri-allelic case), where each subsequent variant context is
* further constrained such that each already considered allele x has AF_x == 0 in the
* compute.
*
* @param vc
* @param log10AlleleFrequencyPriors
* @return
*/
protected List<AFCalcResult> computeAlleleConditionalExact(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final List<AFCalcResult> results = new LinkedList<AFCalcResult>();
for ( int altI = 0; altI < nAltAlleles; altI++ ) {
final List<Allele> biallelic = Arrays.asList(vc.getReference(), vc.getAlternateAllele(altI));
final VariantContext subvc = biallelicCombinedGLs(vc, biallelic, altI + 1);
for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) {
final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
resultTrackers.add(resultTracker);
results.add(resultTracker);
}
return resultTrackers;
return results;
}
protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List<Allele> biallelic, final int allele2) {
if ( rootVC.isBiallelic() )
protected List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
final int nAltAlleles = vc.getNAlleles() - 1;
final List<VariantContext> vcs = new LinkedList<VariantContext>();
final List<Allele> afZeroAlleles = new LinkedList<Allele>();
for ( int altI = 0; altI < nAltAlleles; altI++ ) {
final Allele altAllele = vc.getAlternateAllele(altI);
final List<Allele> biallelic = Arrays.asList(vc.getReference(), altAllele);
vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1));
// TODO -- WE NEED TO TRUNCATE THE ALLELES TO COMPUTE THE TRUE POSTERIOR BUT MUST INCLUDE IT TO GET THE TRUE MLE
// afZeroAlleles.add(altAllele);
}
return vcs;
}
protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List<Allele> biallelic, final List<Allele> afZeroAlleles, final int allele2) {
if ( rootVC.isBiallelic() ) {
if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles);
return rootVC;
else {
} else {
final Set<Integer> allelesToDiscard = new HashSet<Integer>(rootVC.getAlleleIndices(afZeroAlleles));
final int nAlts = rootVC.getNAlleles() - 1;
final List<Genotype> biallelicGenotypes = new ArrayList<Genotype>(rootVC.getNSamples());
for ( final Genotype g : rootVC.getGenotypes() )
biallelicGenotypes.add(combineGLs(g, allele2, nAlts));
biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts));
final VariantContextBuilder vcb = new VariantContextBuilder(rootVC);
vcb.alleles(biallelic);
@ -143,14 +201,28 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
* XB = AB + BC
* BB = BB
*
* Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is
* useful in the case where you want to drop alleles (not combine them), such as above:
*
* AA AB BB AC BC CC
*
* and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2)
*
* XX = AA (since X = A and C is dropped)
* XB = AB
* BB = BB
*
* This allows us to recover partial GLs that correspond to any allele in allelesToDiscard having strictly
* AF == 0.
*
* @param original the original multi-allelic genotype
* @param altIndex the index of the alt allele we wish to keep in the biallelic case -- with ref == 0
* @param nAlts the total number of alt alleles
* @return a new biallelic genotype with appropriate PLs
*/
@Requires("original.hasLikelihoods()")
@Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"})
@Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) {
protected Genotype combineGLs(final Genotype original, final int altIndex, final Set<Integer> allelesToDiscard, final int nAlts ) {
if ( original.isNonInformative() )
return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make();
@ -161,6 +233,11 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
for ( int index = 0; index < normalizedPr.length; index++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);
// just continue if we shouldn't include the pair because it's in the discard set
if ( discardAllelePair(pair, allelesToDiscard) )
continue;
if ( pair.alleleIndex1 == altIndex ) {
if ( pair.alleleIndex2 == altIndex )
// hom-alt case
@ -184,46 +261,33 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
}
protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set<Integer> allelesToDiscard) {
return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2);
}
/**
* Take the independent estimates of pNonRef for each alt allele and combine them into a single result
*
* Takes each independent result and merges it into the final result object
*
* Suppose you have L_af=0_1 = -1 and L_af=0_1 = -2 and L_af=1_1 = -3 and L_af=1_2 = 0. What does this mean?
* If says that along dimension 1, the AF is more likely to be ref (-1 vs. -3) while along dimension 2
* you are more likely to be alt (-2 vs. 0). The question is how to combine these into a meaningful
* composite likelihood. What we are interested in is:
*
* L(AF == 0 for all dimensions) vs. L(AF > 0 for any dimension)
*
* So what are these quantities? The problem is that the likelihoods aren't normalized, so we really cannot
* just add them together. What we really need are normalized probabilities so that we can compute:
*
* P(AF == 0 for all dimensions) => product_i for P(AF == 0, i)
* P(AF > 0 for any dimension) => sum_i for P(AF > 0, i)
*
* These probabilities can be computed straight off the likelihoods without a prior. It's just the prior-free
* normalization of the two likelihoods.
*
* @param independentPNonRefs the pNonRef result for each allele independently
* @param conditionalPNonRefResults the pNonRef result for each allele independently
*/
protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
final double log10LikelihoodsOfACEq0,
final List<AFCalcResult> independentPNonRefs,
final List<AFCalcResult> conditionalPNonRefResults,
final double[] log10AlleleFrequencyPriors) {
int nEvaluations = 0;
final int nAltAlleles = independentPNonRefs.size();
final int nAltAlleles = conditionalPNonRefResults.size();
final int[] alleleCountsOfMLE = new int[nAltAlleles];
final double[] log10PriorsOfAC = new double[2];
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
// this value is a sum in real space so we need to store values to sum up later
final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles];
//double log10LikelihoodsOfACEq0 = 0.0;
// TODO -- need to apply theta^alt prior after sorting by MLE
int altI = 0;
for ( final AFCalcResult independentPNonRef : independentPNonRefs ) {
for ( final AFCalcResult independentPNonRef : conditionalPNonRefResults ) {
final Allele altAllele = vc.getAlternateAllele(altI);
// MLE of altI allele is simply the MLE of this allele in altAlleles
@ -235,15 +299,9 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
log10PriorsOfAC[1] = independentPNonRef.getLog10PriorOfAFGT0();
}
// now we effectively have flat prior'd posteriors
final double[] log10NormalizedLikelihoods = MathUtils.normalizeFromLog10(
new double[]{
independentPNonRef.getLog10LikelihoodOfAFEq0(),
independentPNonRef.getLog10LikelihoodOfAFGT0() },
true);
// the AF > 0 case requires us to store the normalized likelihood for later summation
log10LikelihoodsOfACGt0[altI] = log10NormalizedLikelihoods[1];
//log10LikelihoodsOfACEq0 += independentPNonRef.getLog10LikelihoodOfAFEq0();
log10LikelihoodsOfACGt0[altI] = independentPNonRef.getLog10LikelihoodOfAFGT0();
// bind pNonRef for allele to the posterior value of the AF > 0
// TODO -- should incorporate the theta^alt prior here from the likelihood itself
@ -261,6 +319,6 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0
log10PriorsOfAC, log10pNonRefByAllele, independentPNonRefs);
log10PriorsOfAC, log10pNonRefByAllele, conditionalPNonRefResults);
}
}

View File

@ -0,0 +1,152 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
/**
 * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference
 *
 * Computes the exact AF likelihoods by dynamic programming over the alt allele
 * count k (0..2*numSamples), aborting early once the likelihood has dropped more
 * than StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY below the running maximum.
 */
public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
public OriginalDiploidExactAFCalc(final int nSamples, final int maxAltAlleles) {
super(nSamples, maxAltAlleles);
}
public OriginalDiploidExactAFCalc(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
}
// Returns a fresh StateTracker; the resultTracker argument is not consulted here.
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
return new StateTracker();
}
@Override
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
// Run the linear exact calculation, filling the per-AC likelihood and posterior vectors
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
// Collapse to the two-state form [AF == 0, AF > 0] for both likelihoods and priors
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
// Coarse pNonRef placeholder: 0.0 (log10) when any AC > 0 was reached, else -1000
final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
/**
 * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
 * for the exact model calculation
 */
private final static class ExactACCache {
// DP rows for allele counts k-2, k-1, and the current k
double[] kMinus2, kMinus1, kMinus0;
private static double[] create(int n) {
return new double[n];
}
public ExactACCache(int n) {
kMinus2 = create(n);
kMinus1 = create(n);
kMinus0 = create(n);
}
// Advance one k step: k-1 becomes k-2, k becomes k-1, and the old k-2 array
// is recycled as the new current row (its contents get overwritten)
final public void rotate() {
double[] tmp = kMinus2;
kMinus2 = kMinus1;
kMinus1 = kMinus0;
kMinus0 = tmp;
}
final public double[] getkMinus2() {
return kMinus2;
}
final public double[] getkMinus1() {
return kMinus1;
}
final public double[] getkMinus0() {
return kMinus0;
}
}
/**
 * Linear exact AF computation over alt allele count k = 0..2*numSamples.
 *
 * Fills log10AlleleFrequencyLikelihoods[k] and log10AlleleFrequencyPosteriors[k]
 * for each k visited, stopping early once the likelihood falls more than
 * StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY below the best seen so far.
 *
 * @param vc the variant context whose genotype likelihoods are summed
 * @param log10AlleleFrequencyPriors log10 priors indexed by AC
 * @param log10AlleleFrequencyLikelihoods output: log10 likelihoods indexed by AC
 * @param log10AlleleFrequencyPosteriors output: log10 posteriors indexed by AC
 * @return the last k value visited before stopping
 */
public int linearExact(final VariantContext vc,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyLikelihoods,
double[] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
double maxLog10L = Double.NEGATIVE_INFINITY;
boolean done = false;
int lastK = -1;
for (int k=0; k <= numChr && ! done; k++ ) {
final double[] kMinus0 = logY.getkMinus0();
if ( k == 0 ) { // special case for k = 0
// AC == 0: every sample contributes its hom-ref likelihood
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
// hom-ref term: only valid while k can still be carried by the remaining samples
if (k < 2*j-1)
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
// het term
if (k < 2*j)
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
double log10Max;
if (k > 1) {
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
log10Max = MathUtils.approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
log10Max = MathUtils.approximateLog10SumLog10(aa, ab);
}
// finally, update the L(j,k) value
kMinus0[j] = log10Max - logDenominator;
}
}
// update the posteriors vector
final double log10LofK = kMinus0[numSamples];
log10AlleleFrequencyLikelihoods[k] = log10LofK;
log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
// can we abort early?
lastK = k;
maxLog10L = Math.max(maxLog10L, log10LofK);
if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
done = true;
}
logY.rotate();
}
return lastK;
}
}

View File

@ -5,7 +5,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
* allowing us to abort the search before we visit the entire matrix of AC x samples
*/
final class StateTracker {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
final private int[] maxACsToConsider;

View File

@ -594,8 +594,10 @@ public class MathUtils {
// we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
for (int i = 0; i < array.length; i++)
for (int i = 0; i < array.length; i++) {
array[i] -= maxValue;
array[i] = Math.max(array[i], LOG10_P_OF_ZERO);
}
return array;
}

View File

@ -1517,15 +1517,32 @@ public class VariantContext implements Feature { // to enable tribble integratio
return best;
}
/**
 * Finds the position of the given allele within this context's allele list.
 *
 * @param allele the allele to locate
 * @return the index of allele within getAlleles(), or -1 when it is not present
 */
public int getAlleleIndex(final Allele allele) {
    final List<Allele> allAlleles = getAlleles();
    return allAlleles.indexOf(allele);
}
/**
 * Maps each allele in the input collection to its index in this context.
 *
 * @param alleles the alleles to look up
 * @return the indices (see #getAlleleIndex) in the same order as the input;
 *         entries are -1 for alleles not found in this context
 */
public List<Integer> getAlleleIndices(final Collection<Allele> alleles) {
    final List<Integer> result = new LinkedList<Integer>();
    for ( final Allele query : alleles ) {
        result.add(getAlleleIndex(query));
    }
    return result;
}
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
int index = 1;
for ( Allele allele : getAlternateAlleles() ) {
if ( allele.equals(targetAllele) )
break;
index++;
}
final int index = getAlleleIndex(targetAllele);
if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this);
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
}
}