From 03e5184741945e5ffc89348dc2ba8c976858b11c Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 13 Feb 2012 13:40:58 -0500 Subject: [PATCH 1/2] Fix serious engine bug that could cause reads to be dropped under certain circumstances When aggregating raw BAM file spans into shards, the IntervalSharder tries to combine file spans when it can. Unfortunately, the method that combines two BAM file spans was seriously flawed, and would produce a truncated union if the file spans overlapped in certain ways. This could cause entire regions of the BAM file containing reads within the requested intervals to be dropped. Modified GATKBAMFileSpan.union() to correct this problem, and added unit tests to verify that the correct union is produced regardless of how the file spans happen to overlap. Thanks to Khalid, who did at least as much work on this bug as I did. --- .../src/net/sf/samtools/GATKBAMFileSpan.java | 10 ++- .../java/src/net/sf/samtools/GATKChunk.java | 12 ++++ .../sf/samtools/GATKBAMFileSpanUnitTest.java | 70 ++++++++++++++++++- 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index 4692c6671..ffc40067a 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -143,10 +143,14 @@ public class GATKBAMFileSpan extends BAMFileSpan { List mergedUnion = new ArrayList(); GATKChunk currentChunk = unmergedUnion.remove(); while(!unmergedUnion.isEmpty()) { - // Find the end of this range of chunks. - while(!unmergedUnion.isEmpty() && currentChunk.getChunkEnd() >= unmergedUnion.peek().getChunkStart()) { + + // While the current chunk can be merged with the next chunk: + while( ! unmergedUnion.isEmpty() && + (currentChunk.overlaps(unmergedUnion.peek()) || currentChunk.isAdjacentTo(unmergedUnion.peek())) ) { + + // Merge the current chunk with the next chunk: GATKChunk nextChunk = unmergedUnion.remove(); - currentChunk = new GATKChunk(currentChunk.getChunkStart(),nextChunk.getChunkEnd()); + currentChunk = currentChunk.merge(nextChunk); } // Add the accumulated range. mergedUnion.add(currentChunk); diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index 5d349e72e..c48567f6e 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -96,4 +96,16 @@ public class GATKChunk extends Chunk { final int offsetSpan = (int)((getChunkEnd()&0xFFFF)-(getChunkStart()&0xFFFF)); return chunkSpan + offsetSpan; } + + /** + * Merges two chunks together. The caller is responsible for testing whether the + * chunks overlap/are adjacent before calling this method! + * + * @param other the chunk to merge with this chunk + * @return a new chunk representing the union of the two chunks (provided the chunks were + * overlapping/adjacent) + */ + public GATKChunk merge ( GATKChunk other ) { + return new GATKChunk(Math.min(getChunkStart(), other.getChunkStart()), Math.max(getChunkEnd(), other.getChunkEnd())); + } } diff --git a/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java b/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java index a9586c3c8..dbb4ea225 100644 --- a/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java +++ b/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java @@ -50,7 +50,10 @@ public class GATKBAMFileSpanUnitTest { } @Test - public void testUnionOfOverlappingFileSpans() { + public void testUnionOfContiguousFileSpans() { + // Region 1 ends at position adjacent to Region 2 start: + // |---1----|---2----| + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,1<<16)); GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); GATKBAMFileSpan union = regionOne.union(regionTwo); @@ -58,6 +61,71 @@ public class GATKBAMFileSpanUnitTest { Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535)); } + @Test + public void testUnionOfFileSpansFirstRegionEndsWithinSecondRegion() { + // Region 1 ends within Region 2: + // |---2----| + // |---1----| + + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|32767)); + GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); + GATKBAMFileSpan union = regionOne.union(regionTwo); + Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not."); + Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535)); + } + + @Test + public void testUnionOfFileSpansFirstRegionEndsAtSecondRegionEnd() { + // Region 1 ends at Region 2 end: + // |---2----| + // |---1-----------| + + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535)); + GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); + GATKBAMFileSpan union = regionOne.union(regionTwo); + Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not."); + Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535)); + } + + @Test + public void testUnionOfFileSpansFirstRegionEndsAfterSecondRegionEnd() { + // Region 1 ends after Region 2 end: + // |---2----| + // |---1---------------| + + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535)); + GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767)); + GATKBAMFileSpan union = regionOne.union(regionTwo); + Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not."); + Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535)); + } + + @Test + public void testUnionOfFileSpansFirstRegionStartsAtSecondRegionStart() { + // Region 1 starts at Region 2 start, but ends before Region 2: + // |---2--------| + // |---1----| + + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767)); + GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); + GATKBAMFileSpan union = regionOne.union(regionTwo); + Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not."); + Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535)); + } + + @Test + public void testUnionOfFileSpansFirstRegionEqualToSecondRegion() { + // Region 1 and Region 2 represent the same region: + // |---2----| + // |---1----| + + GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); + GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535)); + GATKBAMFileSpan union = regionOne.union(regionTwo); + Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not."); + Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535)); + } + @Test public void testUnionOfStringOfFileSpans() { GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk[] { new GATKChunk(0,1<<16), new GATKChunk(2<<16,3<<16) });