From 33720b83ebd093dea536b43428f937994cfc9bc4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 10 Jun 2013 14:52:41 -0400 Subject: [PATCH] No longer merge overlapping fragments from HaplotypeCaller -- Merging overlapping fragments turns out to be a bad idea. In the case where you can safely merge the reads you only gain a small about of overlapping kmers, so the potential gains are relatively small. That's in contrast to the very large danger of merging reads inappropriately, such as when the reads only overlap in a repetitive region, and you artificially construct reads that look like the reference but actually may carry a larger true insertion w.r.t. the reference. Because this problem isn't limited to repetitive sequeuence, but in principle could occur in any sequence, it's just not safe to do this merging. Best to leave haplotype construction to the assembly graph. --- .../walkers/haplotypecaller/HaplotypeCaller.java | 14 +++----------- ...omplexAndSymbolicVariantsIntegrationTest.java | 4 ++-- .../HaplotypeCallerIntegrationTest.java | 16 ++++++++-------- .../HaplotypeCallerParallelIntegrationTest.java | 2 +- 4 files changed, 14 insertions(+), 22 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index f3f54060f..b94b74748 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -919,19 +919,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } - final List finalizedReadList = new ArrayList<>(); - final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); - activeRegion.clearReads(); - - // Join overlapping paired reads to create a single longer read - finalizedReadList.addAll( fragmentCollection.getSingletonReads() ); - for( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) { - finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) ); - } // Loop through the reads hard clipping the adaptor and low quality tails - final List readsToUse = new ArrayList<>(finalizedReadList.size()); - for( final GATKSAMRecord myRead : finalizedReadList ) { + final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); + for( final GATKSAMRecord myRead : activeRegion.getReads() ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { GATKSAMRecord clippedRead; @@ -962,6 +953,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } + activeRegion.clearReads(); activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 8394baa72..c1b8f8a70 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "e7b28ea087e8624f1e596c9d65381fea"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "03944bbedb012e2ac2026a84baa0560c"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -94,6 +94,6 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "2a72a9b5c6778b99bf155a7c5e90d11e"); + "7e9f99d4cba8087dac66ea871b910d7e"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index f9bab8ea7..da92f39fc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "f25b9cfc85995cbe8eb6ba5a126d713d"); + HCTest(CEUTRIO_BAM, "", "09d84bc1aef2dd9c185934752172b794"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "19d685727ec60b3568f313bc44f79b49"); + HCTest(NA12878_BAM, "", "5c074930b27d1f5c942fe755c2a8be27"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "6da65f1d396b9c709eb6246cf3f615c1"); + "005a6d1933913a5d96fc56d01303fa95"); } @Test @@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e3db7d56154e36eeb887259bea4b241d"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "9b6f667ad87e19c38d16fefe63c37484"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("40416433baf96f4e84a058459717060b")); + Arrays.asList("a47ef09a8701128cfb301a83b7bb0728")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("cf1461ce829023ea9920fbfeb534eb97")); + Arrays.asList("0cb99f6bb3e630add4b3486c496fa508")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("45ca324be3917655e645d6c290c9280f")); + Arrays.asList("92f947cc89e4f50cf2ef3121d2fe308d")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("b7037770b7953cdf858764b99fa243ed")); + Arrays.asList("91877c8ea3eb0e0316d9ad11fdcc1a87")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 857d0fc9e..d009550f4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "bd2a57e6b0cffb4cbdba609a6c1683dc"}); + tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"}); } return tests.toArray(new Object[][]{});