From 2d350652387ae4751ac68cb621f5da5c5e0f8c95 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:44:48 -0500 Subject: [PATCH] QualityByDepth remaps QD values > 40 to a gaussian around 30 -- This is a temporarily fix / hack to deal with the very high QD values that are generated by the haplotype caller when nearby events occur within reads. In that case, the QUAL field can be many fold higher than normal, and results in an inflated QD value. This hack projects such high QD values back into the good range (as these are good variants in general) so they aren't filtered away by VQSR. -- The long-term solution to this problem is to move the HaplotypeCaller to the full bubble calling algorithm -- Update md5s --- .../gatk/walkers/annotator/QualByDepth.java | 33 ++++++++++++++++--- ...dGenotyperIndelCallingIntegrationTest.java | 4 +-- .../UnifiedGenotyperIntegrationTest.java | 4 +-- ...GenotyperNormalCallingIntegrationTest.java | 4 +-- ...dGenotyperReducedReadsIntegrationTest.java | 6 ++-- .../HaplotypeCallerIntegrationTest.java | 6 ++-- 6 files changed, 40 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 6f875b23c..a3fbcc439 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -61,10 +62,7 @@ import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. @@ -73,6 +71,7 @@ import java.util.Map; * reads associated with the samples with polymorphic genotypes. */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +// private final static Logger logger = Logger.getLogger(QualByDepth.class); public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -114,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); + QD = fixTooHighQD(QD); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * // TODO -- remove me when HaplotypeCaller bubble caller is live + * + * @param QD the raw QD score + * @return a QD value + */ + private double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + private final static double MAX_QD_BEFORE_FIXING = 35; + private final static double IDEAL_HIGH_QD = 30; + private final static double JITTER_SIGMA = 3; + public List getKeyNames() { return Arrays.asList("QD"); } public List getDescriptions() { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 670666fe2..8d0c1f04f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c")); + Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf")); + Arrays.asList("939da0bb73b706badd8a0def7446b384")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a0440aaed..15655622e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("68961b19a29ae224059c33ef41cdcb58")); + Arrays.asList("3a805f5b823ccac19aaec01a3016100e")); executeTest(String.format("test multiple technologies"), spec); } @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("9fcb234f7573209dec4dae86db091efd")); + Arrays.asList("25aa0259876692dc3c848a37369bac6a")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 49083e45b..2512dd5c2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("9fac00485419878749b03706ae6b852f")); + Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e")); executeTest("test Multiple SNP alleles", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("de2c5707c1805d17d70acaecd36b7372")); + Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index d65020dcc..b5fe79993 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304")); + Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f"); + testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21"); + testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8ed589c63..42eb09e6e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449"); + HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10"); } @Test(enabled = false) @@ -95,7 +95,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5af4782a0e1bc9b966b9e3ae76245919"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("cf0a1bfded656153578df6cf68aa68a2")); + Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a")); executeTest("HC calling on a ReducedRead BAM", spec); }