Merge pull request #735 from broadinstitute/vrr_fix_qual_calculation_independent_exact_af_calculator

Fixed the QUAL calculation of the EXACT_INDEPENDENT
This commit is contained in:
rpoplin 2014-09-29 14:57:04 -04:00
commit d21ea126c4
9 changed files with 101 additions and 72 deletions

View File

@ -48,8 +48,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
@ -167,9 +169,54 @@ import java.util.*;
// fast path for the very common bi-allelic use case
return independentResultTrackers.get(0);
} else {
final AFCalculationResult combinedAltAllelesResult = combineAltAlleleIndependentExact(vc,defaultPloidy,log10AlleleFrequencyPriors);
// we are a multi-allelic, so we need to actually combine the results
final List<AFCalculationResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
return combineIndependentPNonRefs(vc, withMultiAllelicPriors);
return combineIndependentPNonRefs(vc, withMultiAllelicPriors, combinedAltAllelesResult);
}
}
private AFCalculationResult combineAltAlleleIndependentExact(final VariantContext vc, int defaultPloidy, double[] log10AlleleFrequencyPriors) {
final VariantContext combinedAltAllelesVariantContext = makeCombinedAltAllelesVariantContext(vc);
final AFCalculationResult resultTracker = biAlleleExactModel.getLog10PNonRef(combinedAltAllelesVariantContext, defaultPloidy, vc.getNAlleles() - 1, log10AlleleFrequencyPriors);
return resultTracker;
}
private VariantContext makeCombinedAltAllelesVariantContext(final VariantContext vc) {
final int nAltAlleles = vc.getNAlleles() - 1;
if ( nAltAlleles == 1 )
return vc;
else {
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
final Allele reference = vcb.getAlleles().get(0);
vcb.alleles(Arrays.asList(reference, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE));
final int genotypeCount = GenotypeLikelihoodCalculators.genotypeCount(2, vc.getNAlleles());
final double[] hetLikelihoods = new double[vc.getNAlleles() - 1];
final double[] homAltLikelihoods = new double[genotypeCount - hetLikelihoods.length - 1];
final double[] newLikelihoods = new double[3];
final List<Genotype> newGenotypes = new ArrayList<>(vc.getNSamples());
for (final Genotype oldGenotype : vc.getGenotypes()) {
final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype);
final List<Allele> oldAlleles = oldGenotype.getAlleles();
if (oldAlleles != null) {
final List<Allele> newAlleles = new ArrayList<>(oldAlleles.size());
for (int i = 0; i < oldAlleles.size(); i++) {
final Allele oldAllele = oldAlleles.get(i);
if (oldAllele.isReference())
newAlleles.add(reference);
else if (oldAllele.isNoCall())
newAlleles.add(Allele.NO_CALL);
else
newAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
}
gb.alleles(newAlleles);
}
if (combineAltAlleleLikelihoods(oldGenotype, genotypeCount, newLikelihoods, hetLikelihoods, homAltLikelihoods))
gb.PL(newLikelihoods);
newGenotypes.add(gb.make());
}
return vcb.genotypes(newGenotypes).make();
}
}
@ -427,18 +474,17 @@ import java.util.*;
* @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
*/
protected AFCalculationResult combineIndependentPNonRefs(final VariantContext vc,
final List<AFCalculationResult> sortedResultsWithThetaNPriors) {
final List<AFCalculationResult> sortedResultsWithThetaNPriors,
final AFCalculationResult combinedAltAllelesResult) {
int nEvaluations = 0;
final int nAltAlleles = sortedResultsWithThetaNPriors.size();
final int[] alleleCountsOfMLE = new int[nAltAlleles];
final double[] log10PriorsOfAC = new double[2];
final Map<Allele, Double> log10pRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
// the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs
double log10PosteriorOfACEq0Sum = 0.0;
double log10PosteriorOfACGt0Sum = 0.0;
boolean anyPoly = false;
for ( final AFCalculationResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
final int altI = vc.getAlleles().indexOf(altAllele) - 1;
@ -446,16 +492,6 @@ import java.util.*;
// MLE of altI allele is simply the MLE of this allele in altAlleles
alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);
// the AF > 0 case requires us to store the normalized likelihood for later summation
if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) {
anyPoly = true;
log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0();
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
}
log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0();
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0());
@ -463,36 +499,32 @@ import java.util.*;
nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
}
// If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation
if ( ! anyPoly ) {
log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0();
log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0();
}
// In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C,
// the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently
// log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0
//
// note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we
// use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where
// AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO
final double log10PosteriorOfACGt0;
if ( log10PosteriorOfACEq0Sum == 0.0 )
log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum;
else
log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
final double[] log10LikelihoodsOfAC = new double[] {
// L + prior = posterior => L = poster - prior
log10PosteriorOfACEq0Sum - log10PriorsOfAC[0],
log10PosteriorOfACGt0 - log10PriorsOfAC[1]
};
return new MyAFCalculationResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
// necessary to ensure all values < 0
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true),
MathUtils.normalizeFromLog10(new double[] { combinedAltAllelesResult.getLog10LikelihoodOfAFEq0(), combinedAltAllelesResult.getLog10LikelihoodOfAFGT0() }, true),
// priors incorporate multiple alt alleles, must be normalized
MathUtils.normalizeFromLog10(log10PriorsOfAC, true),
MathUtils.normalizeFromLog10(new double[] { combinedAltAllelesResult.getLog10PriorOfAFEq0(), combinedAltAllelesResult.getLog10PriorOfAFGT0() }, true),
log10pRefByAllele, sortedResultsWithThetaNPriors);
}
private boolean combineAltAlleleLikelihoods(final Genotype g, final int plMaxIndex, final double[] dest,
final double[] hetLikelihoods, final double[] homAltLikelihoods) {
final int[] pls = g.getPL();
if (pls == null)
return false;
int hetNextIndex = 0;
int homAltNextIndex = 0;
for (int plIndex = 1; plIndex < plMaxIndex; plIndex++) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(plIndex);
if (alleles.alleleIndex1 == 0 || alleles.alleleIndex2 == 0)
hetLikelihoods[hetNextIndex++] = pls[plIndex] * PHRED_2_LOG10_COEFF;
else
homAltLikelihoods[homAltNextIndex++] = pls[plIndex] * PHRED_2_LOG10_COEFF;
}
dest[0] = pls[0] * PHRED_2_LOG10_COEFF;
dest[1] = MathUtils.approximateLog10SumLog10(hetLikelihoods);
dest[2] = MathUtils.approximateLog10SumLog10(homAltLikelihoods);
return true;
}
}

View File

@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("34a92dd832bbb8ed53abf21ba88c6faa"));
Arrays.asList("053913cb29fee481158e1f497a4fffdc"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

View File

@ -65,7 +65,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("a15a28d854789c71ef2879dd4c606b1a"));
Arrays.asList("b43bf9147e30cc68068a91a5e8405767"));
executeTest("test MultiSample Pilot1", spec);
}
@ -97,7 +97,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("1a36b5c036e0452f526dc1a5fdd60929"));
Arrays.asList("37594ce48695bf443c9251f70006f2f0"));
executeTest("test Multiple SNP alleles", spec);
}
@ -113,7 +113,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("451a6a39d74bd3cc367da80f76b2f98d"));
Arrays.asList("ad2be9f69ae8c6776b3bfba069735f50"));
executeTest("test reverse trim", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("a3b299d1dd256fec5425e4175f747bfe"));
Arrays.asList("1cc9c3e45e0296bb33042b409db18ca4"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "29f216779f0db9a08f4907ea82f0c7fb");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f4018f734d64f1f88b3ac4b712311567");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -85,9 +85,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "f8bc02b296d89369fe7ddf3923743c26"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ad0b6274991d451a6491f3118f0e7b2b"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "956938fd1b5c7bf1b717326a7dd46d91"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "0a982ba98be666d56452791df32109d7"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "5edcfa5ab96bc327783484c2bbe1c06f"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b60c70fac56f129af78eaff9ad769557"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "1c3570461e96ad6d66c6abb0fd6ee865"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});
@ -218,7 +218,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("187e43b1034478edced1eb5c664bac34"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("7ef1f30d92178f75e5220b16508b47cd"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
}

View File

@ -85,12 +85,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "7bc718c6a604405e6d33c3073630cc66");
HCTest(CEUTRIO_BAM, "", "5468f50b4ed198e6e9b05a67c3103f72");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "07125724eb465a739df9c6ab191216b0");
HCTest(NA12878_BAM, "", "e85ada8486a4ed7231918187a100e1c3");
}
@Test
@ -111,7 +111,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMinBaseQuality() {
HCTest(NA12878_BAM, "-mbq 15", "07125724eb465a739df9c6ab191216b0");
HCTest(NA12878_BAM, "-mbq 15", "e85ada8486a4ed7231918187a100e1c3");
}
@Test
@ -126,7 +126,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "79b0f8fa5e42ef23f3d166d84d92fa23");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "129965453a3ad9a22aa241f8c4afcbbf");
}
@Test
@ -136,19 +136,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "10119fbb494e966a6f1b54307cfbeb8b");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "987c0bb684fc03bcc46cb619e8269fe4");
}
@Test
public void testHaplotypeCallerSingleSampleWithDbsnp() {
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "fded195a0436242673718e6dc083c172");
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "0f9f45384669cde243731ca803fa3a0b");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf" +
" -isr INTERSECTION -L " + GGA_INTERVALS_FILE,
"5bc8892b68e281a3ceb4f2d141f8c723");
"86a060e9514eaf90c14ddaa7e6d07259");
}
@Test

View File

@ -60,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "faaf2217a920225b6937d87e949d1a68"});
tests.add(new Object[]{nct, "8bcf149228e8845915733d6fd889a141"});
}
return tests.toArray(new Object[][]{});

View File

@ -66,7 +66,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("bc3d3eff337836af245b81f52b94d70c"));
Arrays.asList("27bc40f7cc46bdc347284d7522b2aa6c"));
executeTest("testUpdatePGT", spec);
}
@ -78,7 +78,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("9d9ddeb831e5512c5b1084ee22e65459"));
Arrays.asList("bb7775a555ee9859e18a28cbc044a160"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("aa0f9604bb496be143a6dde775e157fe"));
Arrays.asList("9b7f2ba1bde2e0a0eb3ebc0afb6bc513"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -139,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("49f8ff728246d08cd20cd1c1521651f9"));
Arrays.asList("8201cee7120dfdb3fdeace0ec511c7b1"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -186,7 +186,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-nda"),
1,
Arrays.asList("27cddeb2287827a58d79fc7d3ddad080"));
Arrays.asList("d50e5035488f63c574dcb8485ff61fcb"));
executeTest("testNDA", spec);
}
@ -204,7 +204,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"),
1,
Arrays.asList("c841b408e41596529009bf7f07de9b3f"));
Arrays.asList("bd58c026e9c8df4d4166f22cd0f0ce65"));
executeTest("testStandardConf", spec);
}
}

View File

@ -65,9 +65,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nt : Arrays.asList(1, 2) )
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "392dc99dc279082fc6e729b249adfa2b", nt, nct });
tests.add(new Object[]{ "BOTH", "18418ddc2bdbe20c38ece6dd18535be7", nt, nct });
}
return tests.toArray(new Object[][]{});
@ -79,7 +77,6 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
buildCommandLine(
"-T UnifiedGenotyper -R " + b37KGReference,
"--no_cmdline_in_header -G none",
//"--dbsnp " + b37dbSNP132,
"-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam",
"-L 20:10,000,000-10,100,000",
"-glm " + glm,