Merge pull request #473 from broadinstitute/eb_fix_qd_indel_normalization

The QD normalization for indels was busted and is now fixed.
This commit is contained in:
Ryan Poplin 2014-01-14 08:56:19 -08:00
commit 201ad398ac
11 changed files with 57 additions and 43 deletions

View File

@ -108,7 +108,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
continue;
depth += perReadAlleleLikelihoods.getNumberOfStoredElements();
} else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed
} else if ( genotype.hasDP() ) {
depth += genotype.getDP();
}
}
@ -117,13 +117,29 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
return null;
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
double QD = -10.0 * vc.getLog10PError() / ((double)depth * indelNormalizationFactor(altAlleleLength, ref != null));
// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);
Map<String, Object> map = new HashMap<>();
final Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.2f", QD));
return map;
}
/**
* Generate the indel normalization factor.
*
* @param altAlleleLength the average alternate allele length for the call
* @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length?
* @return a possitive double
*/
private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
}
/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,

View File

@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249"));
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("0c331915b07b42d726bc3d623aa9997b"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);
@ -384,10 +384,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
Assert.assertFalse(lineAnn == null);
final VariantContext vcAnn = codecAnn.decode(lineAnn);
if( vc.isBiallelic() ) {
Assert.assertTrue(vc.hasAttribute("QD"));
Assert.assertTrue(vcAnn.hasAttribute("QD"));
}
Assert.assertTrue(vc.hasAttribute("QD"));
Assert.assertTrue(vcAnn.hasAttribute("QD"));
}
Assert.assertFalse(lineIterator.hasNext());

View File

@ -69,16 +69,16 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "dac2d7969e109aee9ad2dad573759f58");
executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "0eec36459cf1f1e3e8739ab5b1cedb39");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "ceb105e3db0f2b993e3d725b0d60b6a3");
executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "73229442a8fe558e58dd5dd305eb2315");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "2787064918c7b391071a6ad4e5b0aba8");
}
}

View File

@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","50ebb7f74e5618acdd014dd87f2363fc");
}
@Test(enabled = true)

View File

@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92"));
Arrays.asList("1ad3943ae27a0062c52a19abe1c0d32c"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("0cbe889e03bab6512680ecaebd52c536"));
Arrays.asList("9b4ead3da021763704fcb9d80a5ee6ff"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("e10c49fcf9a128745c2b050a52798e58"));
Arrays.asList("8a0a751afdb2a8166432d9822e4d814c"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("475f8148123792064130faf9f9030fec"));
Arrays.asList("422a114943a9e3e9bf5872b82cbc6340"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("a7e4e1bd128424d46cffdd538b220074"));
Arrays.asList("01fec03933816e8d82aabe6e5b276dd5"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("903af514f70db9238064da311c4ea0de"));
Arrays.asList("e3c95f745ebf2d4f26759878966c5280"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3"));
Arrays.asList("4a45d5bd459565ec35c726894430e8df"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("a5b6d7b32953500d936d3dff512a6254"));
Arrays.asList("a78c663eff00b28b44f368f03b2acf1b"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" );
testHeterozosity( 0.01, "6053106407e09a6aefb78395a0e22ec4" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" );
testHeterozosity( 1.0 / 1850, "37666375278259c4d7dc800a0f73c1ca" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -203,7 +203,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207";
private final static String COMPRESSED_OUTPUT_MD5 = "c5c6af421cffa12fe6bdaced6cd41dd2";
@Test
public void testCompressedOutput() {
@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("c4248f02103e37e89b0f22c0d9c98492"));
Arrays.asList("630d1dcfb7650a9287d6723c38b0746a"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("96c7862d55e933b274cabe45c9c443d9"));
Arrays.asList("976e88e4accb4436ad9ac97df9477648"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -88,7 +88,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("02b521fe88a6606a29c12c0885c3debd"));
Arrays.asList("75503fce7521378f8c2170094aff29df"));
executeTest("test SingleSample Pilot2", spec);
}
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("a973298b2801b80057bea88507e2858d"));
Arrays.asList("02c7804c8013ba1ead8e02b956b5e454"));
executeTest("test reverse trim", spec);
}

View File

@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6");
testReducedCalling("INDEL", "d593628b2bc144e987a9e75e5eee0001");
}

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88c10027c21712b1fe475c06cadd503c");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "ff19ae39b0695680ea670d53f6f9ce47");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -65,9 +65,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "96328c91cf9b06de231b37a22a7a7639"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ac25e9a78b89655197513bb0eb7a6845"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "dc0dde72131d562587acae967cf2031f"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"});

View File

@ -84,22 +84,22 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b");
HCTest(CEUTRIO_BAM, "", "f2ad35b5e0d181fb18da86a8971ce4f4");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295");
HCTest(NA12878_BAM, "", "06abde3268336a7cdb21970f12e819ba");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "3d1cb9acdf66547f88ad1742e8178044");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "af6f1f504ad771201aedc0157de8830a");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"b09437f11db40abd49195110e50692c2");
"fd43de437bbaf960499f67daedc6ef63");
}
@Test
@ -126,7 +126,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a3bb5f0bcec603287520841c559638f");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -173,7 +173,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("170896ddcfe06ec47e08aefefd99cf78"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("a43d6226a51eb525f0774f88e3778189"));
Arrays.asList("6ab05a77d2e79d21ba85fadf844a13ba"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -261,7 +261,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f"));
Arrays.asList("903af86b396ce88a6c8e4f4016fbe769"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -293,7 +293,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("19c2992541ede7407192660fdc1fadbf"));
Arrays.asList("824188743703bc09225c5b9c6b404ac1"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -301,7 +301,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16"));
Arrays.asList("14de866430f49c0026aafc1e34ed8250"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
}