From 7ae15dadbe21fdd8364da2c33d7e850e91a413c8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 21 Mar 2013 11:27:00 -0400 Subject: [PATCH] HC now by default only uses reads with MAPQ >= 20 for assembly and calling -- Previously we tried to include lots of these low mapping quality reads in the assembly and calling, but we effectively were just filtering them out anyway while generating an enormous amount of computational expense to handle them, as well as much larger memory requirements. The new version simply uses a read filter to remove them upfront. This causes no major problems -- at least, none that don't have other underlying causes -- compared to 10-11mb of the KB -- Update MD5s to reflect changes due to no longer including mmq < 20 by default --- .../haplotypecaller/HaplotypeCaller.java | 4 +- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 12 ++--- .../HCMappingQualityFilter.java | 44 +++++++++++++++++++ 4 files changed, 55 insertions(+), 11 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 31751d8f0..81ff3dfbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -56,7 +56,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -134,9 +134,9 @@ import java.util.*; @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionTraversalParameters(extension=85, maxRegion=300) +@ReadFilters({HCMappingQualityFilter.class}) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { - /** * A raw, unfiltered, highly sensitive callset in VCF format. */ diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 12dc71799..830152903 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "91f4880910e436bf5aca0abbebd58948"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6dd29d6fec056419ab0fa03a7d43d85e"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "f2add041ba1692db576ae9763a14b8a6"); + "84616464aed68f4d9bc9e08472eff9c0"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "383320e81a1a3bee880fcc6cd0564452"); + "e2d1023b846bfac31b4f7a3a4b90d931"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5ee0a6b81..1b98b2239 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75dbef605b28f02616b13bb5d8bf2fbd"); + HCTest(CEUTRIO_BAM, "", "9859b136d05085b5ec0833035289106a"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "fa8705a5d3ada66470019fa7ddcb9b2c"); + HCTest(NA12878_BAM, "", "27f660bf1c9a6ed7167d77022d401b73"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "d41a886f69a67e01af2ba1a6b4a681d9"); + "e25fc2196401a16347e0c730dbcbe828"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a38f6fade253577d205a00db3e67828"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "325d7d73e0bd86b6cb146b249eda959a"); } @Test @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("1e7b1bda6be5d3835ae318f2977cfbdd")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6d63f558259883262ea84f339acb767")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec97a0a65890169358842e765ff8dd15")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java new file mode 100644 index 000000000..3892ffe27 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.filters.ReadFilter; + +/** + * Filter out reads with low mapping qualities. + * + * @author mdepristo + */ +public class HCMappingQualityFilter extends ReadFilter { + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 20; + + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); + } +} \ No newline at end of file