Implemented the proper QUAL calculation for multi-allelic calls. Integration tests pass except for the ones making multi-allelic calls (duh) and one of the SLOD tests (which used to print 0 when one of the LODs was NaN but now we just don't print the SB annotation for that record).
This commit is contained in:
parent
7c58d8e37d
commit
6dc52d42bf
|
|
@ -34,9 +34,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
// note that the cell at position zero in the likelihoods/posteriors array is actually probability of non-ref (since it's marginalized over all alleles)
|
||||
final double[][] log10AlleleFrequencyLikelihoods;
|
||||
final double[][] log10AlleleFrequencyPosteriors;
|
||||
|
||||
double log10LikelihoodOfAFzero = 0.0;
|
||||
double log10PosteriorOfAFzero = 0.0;
|
||||
|
||||
AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
|
||||
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
|
||||
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
|
||||
|
|
|
|||
|
|
@ -407,14 +407,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
nonRefAlleles++;
|
||||
}
|
||||
|
||||
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
|
||||
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
|
||||
int AC = set.ACcounts.getCounts()[i];
|
||||
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
|
||||
// for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object
|
||||
if ( nonRefAlleles == 0 ) {
|
||||
result.log10LikelihoodOfAFzero = log10LofK;
|
||||
result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0];
|
||||
} else {
|
||||
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
|
||||
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
|
||||
int AC = set.ACcounts.getCounts()[i];
|
||||
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
|
||||
|
||||
// for k=0 we still want to use theta
|
||||
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
|
||||
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
|
||||
final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
|
||||
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,8 +75,9 @@ public class UnifiedGenotyperEngine {
|
|||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
|
||||
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
|
||||
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private final double[][] log10AlleleFrequencyPriorsSNPs;
|
||||
|
|
@ -289,7 +290,9 @@ public class UnifiedGenotyperEngine {
|
|||
if ( afcm.get() == null ) {
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N));
|
||||
posteriorsArray.set(new double[N + 2]);
|
||||
}
|
||||
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) {
|
||||
|
|
@ -307,24 +310,20 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||
boolean bestGuessIsRef = true;
|
||||
|
||||
// which alternate allele has the highest MLE AC?
|
||||
int indexOfHighestAlt = -1;
|
||||
int alleleCountOfHighestAlt = -1;
|
||||
|
||||
// determine which alternate alleles have AF>0
|
||||
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]);
|
||||
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]);
|
||||
|
||||
// if the most likely AC is not 0, then this is a good alternate allele to use
|
||||
if ( indexOfBestAC != 0 ) {
|
||||
if ( indexOfBestAC != 0 && (AFresult.log10AlleleFrequencyPosteriors[i][0] != AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED || AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero) ) {
|
||||
altAllelesToUse[i] = true;
|
||||
bestGuessIsRef = false;
|
||||
}
|
||||
|
|
@ -332,19 +331,14 @@ public class UnifiedGenotyperEngine {
|
|||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
altAllelesToUse[i] = true;
|
||||
}
|
||||
|
||||
// keep track of the "best" alternate allele to use
|
||||
if ( indexOfBestAC > alleleCountOfHighestAlt) {
|
||||
alleleCountOfHighestAlt = indexOfBestAC;
|
||||
indexOfHighestAlt = i;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate p(f>0)
|
||||
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]);
|
||||
// calculate p(f>0):
|
||||
// because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them
|
||||
double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
|
||||
// TODO -- why not just 1.0 - normalizedPosteriors[0]?
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
for (int i = 1; i <= N+1; i++) // N+1 because the cell at position zero in the original posteriors array is actually probability of non-ref (since it's marginalized over all alleles)
|
||||
sum += normalizedPosteriors[i];
|
||||
double PofF = Math.min(sum, 1.0); // deal with precision errors
|
||||
|
||||
|
|
@ -352,15 +346,17 @@ public class UnifiedGenotyperEngine {
|
|||
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero;
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
sum = 0.0;
|
||||
sum = AFresult.log10AlleleFrequencyPosteriors[0][0];
|
||||
if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
break;
|
||||
sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i];
|
||||
sum += AFresult.log10AlleleFrequencyPosteriors[0][i];
|
||||
}
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
|
|
@ -418,31 +414,31 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the overall lod
|
||||
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
|
||||
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
|
||||
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
|
||||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero;
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
|
||||
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
|
||||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero;
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
|
||||
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
|
||||
|
||||
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
|
||||
|
|
@ -455,7 +451,8 @@ public class UnifiedGenotyperEngine {
|
|||
strandScore *= 10.0;
|
||||
//logger.debug(String.format("SLOD=%f", strandScore));
|
||||
|
||||
attributes.put("SB", strandScore);
|
||||
if ( !Double.isNaN(strandScore) )
|
||||
attributes.put("SB", strandScore);
|
||||
}
|
||||
|
||||
// finish constructing the resulting VC
|
||||
|
|
@ -478,6 +475,12 @@ public class UnifiedGenotyperEngine {
|
|||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||
}
|
||||
|
||||
private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) {
|
||||
normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero;
|
||||
System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1);
|
||||
return MathUtils.normalizeFromLog10(normalizedPosteriors);
|
||||
}
|
||||
|
||||
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
||||
Map<String, AlignmentContext> stratifiedContexts = null;
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testCallingParameters() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" );
|
||||
e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" );
|
||||
e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
||||
Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead"));
|
||||
Arrays.asList("8bd5028cf294850b8a95b3c0af23d728"));
|
||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
||||
}
|
||||
|
||||
|
|
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||
Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51"));
|
||||
Arrays.asList("814dcd66950635a870602d90a1618cce"));
|
||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue