The Exact model now computes both the likelihoods and posteriors (in separate arrays); likelihoods are used for assigning genotypes, not the posteriors.

This commit is contained in:
Eric Banks 2011-12-09 10:16:44 -05:00
parent aa4a8c5303
commit 442ceb6ad9
4 changed files with 47 additions and 26 deletions

View File

@ -62,24 +62,26 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
/**
* Must be overridden by concrete subclasses
* @param GLs genotype likelihoods
* @param Alleles Alleles corresponding to GLs
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
* @param GLs genotype likelihoods
* @param Alleles Alleles corresponding to GLs
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results
*/
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors);
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param log10AlleleFrequencyLikelihoods allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
protected abstract GenotypesContext assignGenotypes(VariantContext vc,
double[][] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood);
double[][] log10AlleleFrequencyLikelihoods,
int AFofMaxLikelihood);
}

View File

@ -52,15 +52,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
USE_MULTI_ALLELIC_CALCULATION = UAC.MULTI_ALLELIC;
}
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors) {
public void getLog10PNonRef(final GenotypesContext GLs,
final List<Allele> alleles,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
final int numAlleles = alleles.size();
if ( USE_MULTI_ALLELIC_CALCULATION )
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false);
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
else
linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyPosteriors);
linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
}
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
@ -125,6 +127,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(GenotypesContext GLs,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
@ -176,6 +179,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// update the posteriors vector
final double log10LofK = kMinus0[numSamples];
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
// can we abort early?
@ -320,6 +324,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors,
final boolean preserveData) {
@ -344,7 +349,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
// adjust max likelihood seen if needed
maxLog10L = Math.max(maxLog10L, log10LofKs);
@ -359,13 +364,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
if ( DEBUG )
System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
// clean up memory
if ( !preserveData ) {
@ -471,6 +477,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final ArrayList<double[]> genotypeLikelihoods,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
set.log10Likelihoods[0] = 0.0; // the zero case
@ -517,7 +524,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// update the posteriors vector
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// determine the power of theta to use
@ -527,11 +533,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
nonRefAlleles++;
}
// update the posteriors vector which is a collapsed view of each of the various ACs
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
// for k=0 we still want to use theta
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][set.ACcounts.getCounts()[i]];
log10AlleleFrequencyPosteriors[i][set.ACcounts.getCounts()[i]] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][set.ACcounts.getCounts()[i]], log10LofK + prior);
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
}
}

View File

@ -77,6 +77,7 @@ public class UnifiedGenotyperEngine {
private final double[][] log10AlleleFrequencyPriorsIndels;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[][]> log10AlleleFrequencyLikelihoods = new ThreadLocal<double[][]>();
private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>();
// the priors object
@ -294,6 +295,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
@ -311,8 +313,9 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[0]);
@ -350,7 +353,7 @@ public class UnifiedGenotyperEngine {
}
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess);
// print out stats if we have a writer
if ( verboseWriter != null )
@ -369,16 +372,18 @@ public class UnifiedGenotyperEngine {
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
@ -386,8 +391,9 @@ public class UnifiedGenotyperEngine {
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
@ -445,6 +451,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
@ -460,8 +467,9 @@ public class UnifiedGenotyperEngine {
return null;
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[0]);
@ -499,7 +507,7 @@ public class UnifiedGenotyperEngine {
}
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();

View File

@ -83,14 +83,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1];
final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1];
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyPosteriors, false);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {