Fix serious engine bug that could cause reads to be dropped under certain circumstances

When aggregating raw BAM file spans into shards, the IntervalSharder tries to combine
file spans when it can. Unfortunately, the method that combines two BAM file
spans was seriously flawed, and would produce a truncated union if the file spans
overlapped in certain ways. This could cause entire regions of the BAM file containing
reads within the requested intervals to be dropped.

Modified GATKBAMFileSpan.union() to correct this problem, and added unit tests
to verify that the correct union is produced regardless of how the file spans
happen to overlap.

Thanks to Khalid, who did at least as much work on this bug as I did.
This commit is contained in:
David Roazen 2012-02-13 13:40:58 -05:00
parent 23e7f1bed9
commit 03e5184741
3 changed files with 88 additions and 4 deletions

View File

@ -143,10 +143,14 @@ public class GATKBAMFileSpan extends BAMFileSpan {
List<GATKChunk> mergedUnion = new ArrayList<GATKChunk>();
GATKChunk currentChunk = unmergedUnion.remove();
while(!unmergedUnion.isEmpty()) {
// Find the end of this range of chunks.
while(!unmergedUnion.isEmpty() && currentChunk.getChunkEnd() >= unmergedUnion.peek().getChunkStart()) {
// While the current chunk can be merged with the next chunk:
while( ! unmergedUnion.isEmpty() &&
(currentChunk.overlaps(unmergedUnion.peek()) || currentChunk.isAdjacentTo(unmergedUnion.peek())) ) {
// Merge the current chunk with the next chunk:
GATKChunk nextChunk = unmergedUnion.remove();
currentChunk = new GATKChunk(currentChunk.getChunkStart(),nextChunk.getChunkEnd());
currentChunk = currentChunk.merge(nextChunk);
}
// Add the accumulated range.
mergedUnion.add(currentChunk);

View File

@ -96,4 +96,16 @@ public class GATKChunk extends Chunk {
final int offsetSpan = (int)((getChunkEnd()&0xFFFF)-(getChunkStart()&0xFFFF));
return chunkSpan + offsetSpan;
}
/**
* Merges two chunks together. The caller is responsible for testing whether the
* chunks overlap/are adjacent before calling this method!
*
* @param other the chunk to merge with this chunk
* @return a new chunk representing the union of the two chunks (provided the chunks were
* overlapping/adjacent)
*/
public GATKChunk merge ( GATKChunk other ) {
return new GATKChunk(Math.min(getChunkStart(), other.getChunkStart()), Math.max(getChunkEnd(), other.getChunkEnd()));
}
}

View File

@ -50,7 +50,10 @@ public class GATKBAMFileSpanUnitTest {
}
@Test
public void testUnionOfOverlappingFileSpans() {
public void testUnionOfContiguousFileSpans() {
// Region 1 ends at position adjacent to Region 2 start:
// |---1----|---2----|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,1<<16));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan union = regionOne.union(regionTwo);
@ -58,6 +61,71 @@ public class GATKBAMFileSpanUnitTest {
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
}
@Test
public void testUnionOfFileSpansFirstRegionEndsWithinSecondRegion() {
// Region 1 ends within Region 2:
// |---2----|
// |---1----|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|32767));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan union = regionOne.union(regionTwo);
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
}
@Test
public void testUnionOfFileSpansFirstRegionEndsAtSecondRegionEnd() {
// Region 1 ends at Region 2 end:
// |---2----|
// |---1-----------|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan union = regionOne.union(regionTwo);
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
}
@Test
public void testUnionOfFileSpansFirstRegionEndsAfterSecondRegionEnd() {
// Region 1 ends after Region 2 end:
// |---2----|
// |---1---------------|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767));
GATKBAMFileSpan union = regionOne.union(regionTwo);
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
}
@Test
public void testUnionOfFileSpansFirstRegionStartsAtSecondRegionStart() {
// Region 1 starts at Region 2 start, but ends before Region 2:
// |---2--------|
// |---1----|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan union = regionOne.union(regionTwo);
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535));
}
@Test
public void testUnionOfFileSpansFirstRegionEqualToSecondRegion() {
// Region 1 and Region 2 represent the same region:
// |---2----|
// |---1----|
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
GATKBAMFileSpan union = regionOne.union(regionTwo);
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535));
}
@Test
public void testUnionOfStringOfFileSpans() {
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk[] { new GATKChunk(0,1<<16), new GATKChunk(2<<16,3<<16) });