Merge pull request #109 from broadinstitute/md_qd_fix_for_high_depth

QualityByDepth remaps QD values > 40 to a gaussian around 30
This commit is contained in:
Ryan Poplin 2013-03-15 07:05:32 -07:00
commit daa0f8b551
6 changed files with 40 additions and 17 deletions

View File

@ -46,6 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -61,10 +62,7 @@ import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypesContext;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
@ -73,6 +71,7 @@ import java.util.Map;
* reads associated with the samples with polymorphic genotypes.
*/
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
// private final static Logger logger = Logger.getLogger(QualByDepth.class);
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -114,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( depth == 0 )
return null;
double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
QD = fixTooHighQD(QD);
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", QD));
return map;
}
/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,
* and VQSR will filter these out. This code looks at the QD value, and if it is above
* threshold we map it down to the mean high QD value, with some jittering
*
* // TODO -- remove me when HaplotypeCaller bubble caller is live
*
* @param QD the raw QD score
* @return a QD value
*/
private double fixTooHighQD(final double QD) {
if ( QD < MAX_QD_BEFORE_FIXING ) {
return QD;
} else {
return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
}
}
private final static double MAX_QD_BEFORE_FIXING = 35;
private final static double IDEAL_HIGH_QD = 30;
private final static double JITTER_SIGMA = 3;
public List<String> getKeyNames() { return Arrays.asList("QD"); }
public List<VCFInfoHeaderLine> getDescriptions() {

View File

@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c"));
Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf"));
Arrays.asList("939da0bb73b706badd8a0def7446b384"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

View File

@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("68961b19a29ae224059c33ef41cdcb58"));
Arrays.asList("3a805f5b823ccac19aaec01a3016100e"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("9fcb234f7573209dec4dae86db091efd"));
Arrays.asList("25aa0259876692dc3c848a37369bac6a"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("9fac00485419878749b03706ae6b852f"));
Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e"));
executeTest("test Multiple SNP alleles", spec);
}
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("de2c5707c1805d17d70acaecd36b7372"));
Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c");
}
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21");
testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942");
}

View File

@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449");
HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10");
}
@Test(enabled = false)
@ -95,7 +95,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5af4782a0e1bc9b966b9e3ae76245919");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("cf0a1bfded656153578df6cf68aa68a2"));
Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a"));
executeTest("HC calling on a ReducedRead BAM", spec);
}