Pulled out some hard-coded values from the read-threading and isActive code of the HC, and made them into a single argument.

In unifying the arguments it was clear that the values were inconsistent throughout the code, so now there's a
single value that is intended to be more liberal in what it allows in (in an attempt to increase sensitivity).

Very little code actually changes here, but just about every md5 in the HC integration tests are different (as
expected).  Added another integration test for the new argument.

To be used by David R to test his per-branch QC framework: does this commit make the HC look better against the KB?
This commit is contained in:
Eric Banks 2013-12-11 10:20:07 -05:00
parent abd4f552ba
commit 64d5bf650e
6 changed files with 45 additions and 40 deletions

View File

@ -341,6 +341,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
/**
* The minimum confidence needed for a given base for it to be used in variant calling.
*/
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public byte MIN_BASE_QUALTY_SCORE = 10;
/**
* Users should be aware that this argument can really affect the results of the variant calling and should exercise caution.
* Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the
@ -440,10 +446,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
protected boolean debugGraphTransformations = false;
@Hidden // TODO -- not currently useful
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
protected boolean useLowQualityBasesForAssembly = false;
@Hidden
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
@ -536,10 +538,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
// bases with quality less than or equal to this value are trimmed off the tails of the reads
private static final byte MIN_TAIL_QUALITY = 20;
private byte MIN_TAIL_QUALITY;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
// the minimum length of a read we'd consider using for genotyping
private final static int MIN_READ_LENGTH = 10;
@ -651,9 +652,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths);
assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails);
assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE);
MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1);
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
// setup the likelihood calculation engine
if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
@ -758,12 +761,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
for( final Map.Entry<String, AlignmentContext> sample : splitContexts.entrySet() ) {
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods;
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final List<Allele> alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
@ -1113,8 +1116,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
else if (useLowQualityBasesForAssembly)
clippedRead = myRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );

View File

@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("0c331915b07b42d726bc3d623aa9997b"));
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("b171258ed3ef5ae0d6c21fe04e5940fc"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "ff19ae39b0695680ea670d53f6f9ce47");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b5cd204b9dd6a5210b49d91186cf2b1d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"b787be740423b950f8529ccc838fabdd");
"cdf6d200324949a3484668774d2289d7");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"f74d68cbc1ecb66a7128258e111cd030");
"ccc3864207d700c00238066ec5ae33c5");
}
}

View File

@ -65,13 +65,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "96328c91cf9b06de231b37a22a7a7639"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ac25e9a78b89655197513bb0eb7a6845"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "dc0dde72131d562587acae967cf2031f"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "53aa13711a1ceec1453f21c705723f04"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "4bb8e44b2d04757f8b11ca6400828357"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8ddc291584f56e27d125b6a0523f2703"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "e53e164cc3f5cbd5fba083f2cdb98a88"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "a258dbbfabe88dad11d57151cd256831"});
return tests.toArray(new Object[][]{});
}

View File

@ -84,22 +84,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "f2ad35b5e0d181fb18da86a8971ce4f4");
HCTest(CEUTRIO_BAM, "", "abbfdcbf4bfed7547a48121091a7e16f");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "06abde3268336a7cdb21970f12e819ba");
HCTest(NA12878_BAM, "", "96f299a5cf411900b8eda3845c3ce465");
}
@Test
public void testHaplotypeCallerMinBaseQuality() {
HCTest(NA12878_BAM, "-mbq 15", "6509cfd0554ecbb92a1b303fbcc0fcca");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "3d1cb9acdf66547f88ad1742e8178044");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "83fe0694621bc1e0240f6f79eb6d6999");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "af6f1f504ad771201aedc0157de8830a");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "8b75034f9aa8435962da98eb521c8f0b");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -110,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"fd43de437bbaf960499f67daedc6ef63");
"fc271f21c0693e4fa6394e27d722fb52");
}
@Test
@ -126,7 +131,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a3bb5f0bcec603287520841c559638f");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "d3fc49d3d3c8b6439548133e03faa540");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -163,7 +168,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2");
HCTestNearbySmallIntervals(NA12878_BAM, "", "a415bc76231a04dc38412ff38aa0dc49");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -173,7 +178,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("170896ddcfe06ec47e08aefefd99cf78"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("763d4d8d84a4080db18235a413478660"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -222,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
Arrays.asList("12c56262ed30db1249b8d722e324357c"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -230,7 +235,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
Arrays.asList("1627cf5f3a97e8b73b3c095db46aef1b"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -244,7 +249,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("6ab05a77d2e79d21ba85fadf844a13ba"));
Arrays.asList("51e63c0431817ca1824b01e56341a8ae"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -253,7 +258,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
Arrays.asList("e39c73bbaf22b4751755d9f5bb2a8d3d"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -261,7 +266,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("903af86b396ce88a6c8e4f4016fbe769"));
Arrays.asList("a2ada5984fe835f7f2169f8393d122a6"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -270,7 +275,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("69db1045b5445a4f90843f368bd62814"));
Arrays.asList("c14d7f23dedea7e7ec99a90843320c1a"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -293,7 +298,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("824188743703bc09225c5b9c6b404ac1"));
Arrays.asList("ee73759f4372df678e7aa97346d87a70"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -301,7 +306,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("14de866430f49c0026aafc1e34ed8250"));
Arrays.asList("a9fa660910bf5e35267475f3b2d75766"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"});
tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"});
}
return tests.toArray(new Object[][]{});