From 6dc52d42bfbc8777dcc9307cb68463a461d811f7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 18 Dec 2011 00:01:42 -0500 Subject: [PATCH] Implemented the proper QUAL calculation for multi-allelic calls. Integration tests pass except for the ones making multi-allelic calls (duh) and one of the SLOD tests (which used to print 0 when one of the LODs was NaN but now we just don't print the SB annotation for that record). --- .../AlleleFrequencyCalculationResult.java | 4 + .../genotyper/ExactAFCalculationModel.java | 19 ++-- .../genotyper/UnifiedGenotyperEngine.java | 87 ++++++++++--------- .../UnifiedGenotyperIntegrationTest.java | 6 +- 4 files changed, 64 insertions(+), 52 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 66106f658..cc72fde11 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -34,9 +34,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; */ public class AlleleFrequencyCalculationResult { + // note that the cell at position zero in the likelihoods/posteriors array is actually probability of non-ref (since it's marginalized over all alleles) final double[][] log10AlleleFrequencyLikelihoods; final double[][] log10AlleleFrequencyPosteriors; + double log10LikelihoodOfAFzero = 0.0; + double log10PosteriorOfAFzero = 0.0; + AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index bc27916dd..aa743f86f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -407,14 +407,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { nonRefAlleles++; } - // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - int AC = set.ACcounts.getCounts()[i]; - result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + // for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object + if ( nonRefAlleles == 0 ) { + result.log10LikelihoodOfAFzero = log10LofK; + result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0]; + } else { + // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs + for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { + int AC = set.ACcounts.getCounts()[i]; + result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); - // for k=0 we still want to use theta - final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; + result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 7e5f388b2..533bf0c68 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -75,8 +75,9 @@ public class UnifiedGenotyperEngine { // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); - // the allele frequency likelihoods (allocated once as an optimization) + // the allele frequency likelihoods and posteriors (allocated once as an optimization) private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); + private ThreadLocal posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[][] log10AlleleFrequencyPriorsSNPs; @@ -289,7 +290,9 @@ public class UnifiedGenotyperEngine { if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); + posteriorsArray.set(new double[N + 2]); } + AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); // don't try to genotype too many alternate alleles if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) { @@ -307,24 +310,20 @@ public class UnifiedGenotyperEngine { } // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; - // which alternate allele has the highest MLE AC? - int indexOfHighestAlt = -1; - int alleleCountOfHighestAlt = -1; - // determine which alternate alleles have AF>0 boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]); + int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]); // if the most likely AC is not 0, then this is a good alternate allele to use - if ( indexOfBestAC != 0 ) { + if ( indexOfBestAC != 0 && (AFresult.log10AlleleFrequencyPosteriors[i][0] != AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED || AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero) ) { altAllelesToUse[i] = true; bestGuessIsRef = false; } @@ -332,19 +331,14 @@ public class UnifiedGenotyperEngine { else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { altAllelesToUse[i] = true; } - - // keep track of the "best" alternate allele to use - if ( indexOfBestAC > alleleCountOfHighestAlt) { - alleleCountOfHighestAlt = indexOfBestAC; - indexOfHighestAlt = i; - } } - // calculate p(f>0) - // TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]); + // calculate p(f>0): + // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them + double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); + // TODO -- why not just 1.0 - normalizedPosteriors[0]? double sum = 0.0; - for (int i = 1; i <= N; i++) + for (int i = 1; i <= N+1; i++) // N+1 because the cell at position zero in the original posteriors array is actually probability of non-ref (since it's marginalized over all alleles) sum += normalizedPosteriors[i]; double PofF = Math.min(sum, 1.0); // deal with precision errors @@ -352,15 +346,17 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; + phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - sum = 0.0; + sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; + if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + sum = 0.0; for (int i = 1; i <= N; i++) { - if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) break; - sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i]; + sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; } phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } @@ -418,31 +414,31 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); - //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; + double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; - double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; + double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); - clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); - //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; - double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; + double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -455,7 +451,8 @@ public class UnifiedGenotyperEngine { strandScore *= 10.0; //logger.debug(String.format("SLOD=%f", strandScore)); - attributes.put("SB", strandScore); + if ( !Double.isNaN(strandScore) ) + attributes.put("SB", strandScore); } // finish constructing the resulting VC @@ -478,6 +475,12 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } + private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; + System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + return MathUtils.normalizeFromLog10(normalizedPosteriors); + } + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { Map stratifiedContexts = null; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 6041f80b8..7fde73f0a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCallingParameters() { HashMap e = new HashMap(); e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); - e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" ); + e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead")); + Arrays.asList("8bd5028cf294850b8a95b3c0af23d728")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51")); + Arrays.asList("814dcd66950635a870602d90a1618cce")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); }