QualityByDepth remaps QD values > 40 to a gaussian around 30
-- This is a temporarily fix / hack to deal with the very high QD values that are generated by the haplotype caller when nearby events occur within reads. In that case, the QUAL field can be many fold higher than normal, and results in an inflated QD value. This hack projects such high QD values back into the good range (as these are good variants in general) so they aren't filtered away by VQSR. -- The long-term solution to this problem is to move the HaplotypeCaller to the full bubble calling algorithm -- Update md5s
This commit is contained in:
parent
0fd9f0e77c
commit
2d35065238
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -61,10 +62,7 @@ import org.broadinstitute.variant.variantcontext.Genotype;
|
|||
import org.broadinstitute.variant.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
|
||||
|
|
@ -73,6 +71,7 @@ import java.util.Map;
|
|||
* reads associated with the samples with polymorphic genotypes.
|
||||
*/
|
||||
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
// private final static Logger logger = Logger.getLogger(QualByDepth.class);
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -114,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( depth == 0 )
|
||||
return null;
|
||||
|
||||
double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
|
||||
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
|
||||
double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
|
||||
QD = fixTooHighQD(QD);
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* The haplotype caller generates very high quality scores when multiple events are on the
|
||||
* same haplotype. This causes some very good variants to have unusually high QD values,
|
||||
* and VQSR will filter these out. This code looks at the QD value, and if it is above
|
||||
* threshold we map it down to the mean high QD value, with some jittering
|
||||
*
|
||||
* // TODO -- remove me when HaplotypeCaller bubble caller is live
|
||||
*
|
||||
* @param QD the raw QD score
|
||||
* @return a QD value
|
||||
*/
|
||||
private double fixTooHighQD(final double QD) {
|
||||
if ( QD < MAX_QD_BEFORE_FIXING ) {
|
||||
return QD;
|
||||
} else {
|
||||
return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
|
||||
}
|
||||
}
|
||||
|
||||
private final static double MAX_QD_BEFORE_FIXING = 35;
|
||||
private final static double IDEAL_HIGH_QD = 30;
|
||||
private final static double JITTER_SIGMA = 3;
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("QD"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
|
|
|
|||
|
|
@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c"));
|
||||
Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
|
||||
Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf"));
|
||||
Arrays.asList("939da0bb73b706badd8a0def7446b384"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("68961b19a29ae224059c33ef41cdcb58"));
|
||||
Arrays.asList("3a805f5b823ccac19aaec01a3016100e"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("9fcb234f7573209dec4dae86db091efd"));
|
||||
Arrays.asList("25aa0259876692dc3c848a37369bac6a"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("9fac00485419878749b03706ae6b852f"));
|
||||
Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("de2c5707c1805d17d70acaecd36b7372"));
|
||||
Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
|
||||
Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
|
||||
testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21");
|
||||
testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942");
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449");
|
||||
HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
|
|
@ -95,7 +95,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5af4782a0e1bc9b966b9e3ae76245919");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53");
|
||||
}
|
||||
|
||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("cf0a1bfded656153578df6cf68aa68a2"));
|
||||
Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue