Proper likelihoods and posterior probability of the joint allele frequency in IndependentAllelesDiploidExactAFCalc

-- Fixed minor numerical stability issue in AFCalcResult
-- the posterior of AF > 0 for the joint A/B/C is 1 - P(AF_b == 0 | D) x P(AF_c == 0 | D), and likewise for any number of alleles.  Now computes the joint posterior like this, and then back-calculates likelihoods that generate these posteriors given the priors.  It's not pretty but it's the best thing to do
This commit is contained in:
Mark DePristo 2012-10-15 20:23:07 -04:00
parent d1511e38ad
commit 6bd0ec8de4
2 changed files with 81 additions and 78 deletions

View File

@ -275,11 +275,11 @@ public class AFCalcResult {
// necessary because the posteriors may be so skewed that the log-space normalized value isn't // necessary because the posteriors may be so skewed that the log-space normalized value isn't
// good, so we have to try both log-space normalization as well as the real-space normalization if the // good, so we have to try both log-space normalization as well as the real-space normalization if the
// result isn't good // result isn't good
final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) ) if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) )
return logNormalized; return logNormalized;
else else
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true);
} }
/** /**

View File

@ -32,64 +32,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*; import java.util.*;
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { /**
private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
@Override
public int compare(AFCalcResult o1, AFCalcResult o2) {
return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0());
}
}
private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();
final ReferenceDiploidExactAFCalc refModel;
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
}
@Override
protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
return refModel.makeMaxLikelihood(vc, resultTracker);
}
private static class MyAFCalcResult extends AFCalcResult {
final List<AFCalcResult> supporting;
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
this.supporting = supporting;
}
}
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc);
final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors);
}
protected final double computelog10LikelihoodOfRef(final VariantContext vc) {
// this value is just the likelihood of AF == 0 in the special constrained multi-allelic calculation
final List<double[]> allGLs = getGLs(vc.getGenotypes(), false);
double log10LikelihoodOfHomRef = 0.0;
// TODO -- can be easily optimized (currently looks at all GLs via getGLs)
for ( int i = 0; i < allGLs.size(); i++ ) {
final double[] GLs = allGLs.get(i);
log10LikelihoodOfHomRef += GLs[0];
//log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0];
}
return log10LikelihoodOfHomRef;
}
/**
* Computes the conditional bi-allelic exact results * Computes the conditional bi-allelic exact results
* *
* Suppose vc contains 2 alt allele: A* with C and T. This function first computes: * Suppose vc contains 2 alt allele: A* with C and T. This function first computes:
@ -124,20 +67,75 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
* *
* This is effectively asking for the value in the upper left vs. the sum of all cells. * This is effectively asking for the value in the upper left vs. the sum of all cells.
* *
* The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the * This class implements the conditional likelihoods summation for any number of alt
* band at the top where AF_t > 0 and AF_c == 0 * alleles, where each alt allele has its EXACT probability of segregating calculated by
* reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows:
* *
* So (4) is actually (1) + (2). * Suppose we have for a A/B/C site the following GLs:
* *
* (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating * AA AB BB AC BC CC
* *
* (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything] * and we want to get the bi-allelic GLs for X/B, where X is everything not B
* (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use]
* *
* This function implements the conditional likelihoods summation for any number of alt * XX = AA + AC + CC (since X = A or C)
* alleles (not just the tri-allelic case), where each subsequent variant context is * XB = AB + BC
* further constrained such that each already considered allele x has AF_x == 0 in the * BB = BB
* compute. *
* After each allele has its probability calculated we compute the joint posterior
* as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i
* prior for the ith least likely allele.
*/
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
/**
* The min. confidence of an allele to be included in the joint posterior.
*/
private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20);
private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Compares AFCalcResults by their log10 posteriors of AF > 0, so that the result with the smaller posterior orders first
*/
private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
@Override
public int compare(AFCalcResult o1, AFCalcResult o2) {
return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0());
}
}
private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();
final ReferenceDiploidExactAFCalc refModel;
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
}
@Override
protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
return refModel.makeMaxLikelihood(vc, resultTracker);
}
private static class MyAFCalcResult extends AFCalcResult {
final List<AFCalcResult> supporting;
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
this.supporting = supporting;
}
}
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
return combineIndependentPNonRefs(vc, withMultiAllelicPriors);
}
/**
* *
* @param vc * @param vc
* @param log10AlleleFrequencyPriors * @param log10AlleleFrequencyPriors
@ -294,7 +292,6 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
* @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
*/ */
protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
final double log10LikelihoodsOfACEq0,
final List<AFCalcResult> sortedResultsWithThetaNPriors) { final List<AFCalcResult> sortedResultsWithThetaNPriors) {
int nEvaluations = 0; int nEvaluations = 0;
final int nAltAlleles = sortedResultsWithThetaNPriors.size(); final int nAltAlleles = sortedResultsWithThetaNPriors.size();
@ -302,8 +299,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
final double[] log10PriorsOfAC = new double[2]; final double[] log10PriorsOfAC = new double[2];
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
// this value is a sum in real space so we need to store values to sum up later // this value is a sum in log space
final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles]; double log10PosteriorOfACEq0Sum = 0.0;
for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
@ -316,7 +313,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
// the AF > 0 case requires us to store the normalized likelihood for later summation // the AF > 0 case requires us to store the normalized likelihood for later summation
log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0(); if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR )
log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0();
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());
@ -325,13 +323,18 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
nEvaluations += sortedResultWithThetaNPriors.nEvaluations; nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
} }
// the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C,
final double[] log10LikelihoodsOfAC = new double[]{ // the probability of neither allele being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently
log10LikelihoodsOfACEq0, // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0Sum
MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)}; final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
final double[] log10LikelihoodsOfAC = new double[] {
// L + prior = posterior => L = poster - prior
log10PosteriorOfACEq0Sum - log10PriorsOfAC[0],
log10PosteriorOfACGt0 - log10PriorsOfAC[1]
};
return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0 MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0
MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized
log10pNonRefByAllele, sortedResultsWithThetaNPriors); log10pNonRefByAllele, sortedResultsWithThetaNPriors);
} }