No longer merge overlapping fragments from HaplotypeCaller

-- Merging overlapping fragments turns out to be a bad idea.  In the case where you can safely merge the reads you only gain a small about of overlapping kmers, so the potential gains are relatively small.  That's in contrast to the very large danger of merging reads inappropriately, such as when the reads only overlap in a repetitive region, and you artificially construct reads that look like the reference but actually may carry a larger true insertion w.r.t. the reference.  Because this problem isn't limited to repetitive sequeuence, but in principle could occur in any sequence, it's just not safe to do this merging.  Best to leave haplotype construction to the assembly graph.
This commit is contained in:
Mark DePristo 2013-06-10 14:52:41 -04:00
parent c837d67b2f
commit 33720b83eb
4 changed files with 14 additions and 22 deletions

View File

@ -919,19 +919,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
final List<GATKSAMRecord> finalizedReadList = new ArrayList<>();
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
activeRegion.clearReads();
// Join overlapping paired reads to create a single longer read
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
}
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKSAMRecord> readsToUse = new ArrayList<>(finalizedReadList.size());
for( final GATKSAMRecord myRead : finalizedReadList ) {
final List<GATKSAMRecord> readsToUse = new ArrayList<>(activeRegion.getReads().size());
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead;
@ -962,6 +953,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
activeRegion.clearReads();
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
}

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "e7b28ea087e8624f1e596c9d65381fea");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "03944bbedb012e2ac2026a84baa0560c");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -94,6 +94,6 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"2a72a9b5c6778b99bf155a7c5e90d11e");
"7e9f99d4cba8087dac66ea871b910d7e");
}
}

View File

@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "f25b9cfc85995cbe8eb6ba5a126d713d");
HCTest(CEUTRIO_BAM, "", "09d84bc1aef2dd9c185934752172b794");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "19d685727ec60b3568f313bc44f79b49");
HCTest(NA12878_BAM, "", "5c074930b27d1f5c942fe755c2a8be27");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"6da65f1d396b9c709eb6246cf3f615c1");
"005a6d1933913a5d96fc56d01303fa95");
}
@Test
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e3db7d56154e36eeb887259bea4b241d");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "9b6f667ad87e19c38d16fefe63c37484");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("40416433baf96f4e84a058459717060b"));
Arrays.asList("a47ef09a8701128cfb301a83b7bb0728"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("cf1461ce829023ea9920fbfeb534eb97"));
Arrays.asList("0cb99f6bb3e630add4b3486c496fa508"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("45ca324be3917655e645d6c290c9280f"));
Arrays.asList("92f947cc89e4f50cf2ef3121d2fe308d"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("b7037770b7953cdf858764b99fa243ed"));
Arrays.asList("91877c8ea3eb0e0316d9ad11fdcc1a87"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "bd2a57e6b0cffb4cbdba609a6c1683dc"});
tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"});
}
return tests.toArray(new Object[][]{});