Implemented the proper QUAL calculation for multi-allelic calls. Integration tests pass except for the ones making multi-allelic calls (duh) and one of the SLOD tests (which used to print 0 when one of the LODs was NaN but now we just don't print the SB annotation for that record).

This commit is contained in:
Eric Banks 2011-12-18 00:01:42 -05:00
parent 7c58d8e37d
commit 6dc52d42bf
4 changed files with 64 additions and 52 deletions

View File

@@ -34,9 +34,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
*/
public class AlleleFrequencyCalculationResult {
// note that the cell at position zero in the likelihoods/posteriors array is actually probability of non-ref (since it's marginalized over all alleles)
final double[][] log10AlleleFrequencyLikelihoods;
final double[][] log10AlleleFrequencyPosteriors;
double log10LikelihoodOfAFzero = 0.0;
double log10PosteriorOfAFzero = 0.0;
AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];

View File

@@ -407,14 +407,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
nonRefAlleles++;
}
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
// for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object
if ( nonRefAlleles == 0 ) {
result.log10LikelihoodOfAFzero = log10LofK;
result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0];
} else {
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
// for k=0 we still want to use theta
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
}
}
}

View File

@@ -75,8 +75,9 @@ public class UnifiedGenotyperEngine {
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// the allele frequency likelihoods (allocated once as an optimization)
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[][] log10AlleleFrequencyPriorsSNPs;
@@ -289,7 +290,9 @@ public class UnifiedGenotyperEngine {
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N));
posteriorsArray.set(new double[N + 2]);
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) {
@@ -307,24 +310,20 @@ public class UnifiedGenotyperEngine {
}
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
// which alternate allele has the highest MLE AC?
int indexOfHighestAlt = -1;
int alleleCountOfHighestAlt = -1;
// determine which alternate alleles have AF>0
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]);
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]);
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
if ( indexOfBestAC != 0 && (AFresult.log10AlleleFrequencyPosteriors[i][0] != AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED || AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero) ) {
altAllelesToUse[i] = true;
bestGuessIsRef = false;
}
@@ -332,19 +331,14 @@ public class UnifiedGenotyperEngine {
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
altAllelesToUse[i] = true;
}
// keep track of the "best" alternate allele to use
if ( indexOfBestAC > alleleCountOfHighestAlt) {
alleleCountOfHighestAlt = indexOfBestAC;
indexOfHighestAlt = i;
}
}
// calculate p(f>0)
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]);
// calculate p(f>0):
// because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them
double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
// TODO -- why not just 1.0 - normalizedPosteriors[0]?
double sum = 0.0;
for (int i = 1; i <= N; i++)
for (int i = 1; i <= N+1; i++) // N+1 because the cell at position zero in the original posteriors array is actually probability of non-ref (since it's marginalized over all alleles)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
@@ -352,15 +346,17 @@ public class UnifiedGenotyperEngine {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero;
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
sum = AFresult.log10AlleleFrequencyPosteriors[0][0];
if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i];
sum += AFresult.log10AlleleFrequencyPosteriors[0][i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
@@ -418,31 +414,31 @@ public class UnifiedGenotyperEngine {
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero;
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero;
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@@ -455,7 +451,8 @@ public class UnifiedGenotyperEngine {
strandScore *= 10.0;
//logger.debug(String.format("SLOD=%f", strandScore));
attributes.put("SB", strandScore);
if ( !Double.isNaN(strandScore) )
attributes.put("SB", strandScore);
}
// finish constructing the resulting VC
@@ -478,6 +475,12 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
// Builds a single posteriors vector combining the AF=0 posterior with the marginalized
// non-ref posteriors, then normalizes it out of log10 space.
// Cell 0 holds log10PosteriorOfAFzero; cells 1..length-1 hold the collapsed non-ref
// posteriors for the first alternate allele (row 0 of the matrix, which is marginalized
// over all alleles — see the note on AlleleFrequencyCalculationResult).
// NOTE(review): 'normalizedPosteriors' is a preallocated scratch buffer (posteriorsArray,
// sized N+2) that is overwritten on every call — callers must not retain the returned
// array across calls on the same thread. Assumes the buffer is exactly one element
// longer than a posteriors row — TODO confirm against AlleleFrequencyCalculationResult sizing.
private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero;
System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1);
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
Map<String, AlignmentContext> stratifiedContexts = null;

View File

@@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCallingParameters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" );
e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" );
e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead"));
Arrays.asList("8bd5028cf294850b8a95b3c0af23d728"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51"));
Arrays.asList("814dcd66950635a870602d90a1618cce"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}