Fix serious engine bug that could cause reads to be dropped under certain circumstances
When aggregating raw BAM file spans into shards, the IntervalSharder tries to combine file spans when it can. Unfortunately, the method that combines two BAM file spans was seriously flawed, and would produce a truncated union if the file spans overlapped in certain ways. This could cause entire regions of the BAM file containing reads within the requested intervals to be dropped. Modified GATKBAMFileSpan.union() to correct this problem, and added unit tests to verify that the correct union is produced regardless of how the file spans happen to overlap. Thanks to Khalid, who did at least as much work on this bug as I did.
This commit is contained in:
parent
23e7f1bed9
commit
03e5184741
|
|
@ -143,10 +143,14 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
List<GATKChunk> mergedUnion = new ArrayList<GATKChunk>();
|
||||
GATKChunk currentChunk = unmergedUnion.remove();
|
||||
while(!unmergedUnion.isEmpty()) {
|
||||
// Find the end of this range of chunks.
|
||||
while(!unmergedUnion.isEmpty() && currentChunk.getChunkEnd() >= unmergedUnion.peek().getChunkStart()) {
|
||||
|
||||
// While the current chunk can be merged with the next chunk:
|
||||
while( ! unmergedUnion.isEmpty() &&
|
||||
(currentChunk.overlaps(unmergedUnion.peek()) || currentChunk.isAdjacentTo(unmergedUnion.peek())) ) {
|
||||
|
||||
// Merge the current chunk with the next chunk:
|
||||
GATKChunk nextChunk = unmergedUnion.remove();
|
||||
currentChunk = new GATKChunk(currentChunk.getChunkStart(),nextChunk.getChunkEnd());
|
||||
currentChunk = currentChunk.merge(nextChunk);
|
||||
}
|
||||
// Add the accumulated range.
|
||||
mergedUnion.add(currentChunk);
|
||||
|
|
|
|||
|
|
@ -96,4 +96,16 @@ public class GATKChunk extends Chunk {
|
|||
final int offsetSpan = (int)((getChunkEnd()&0xFFFF)-(getChunkStart()&0xFFFF));
|
||||
return chunkSpan + offsetSpan;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges two chunks together. The caller is responsible for testing whether the
|
||||
* chunks overlap/are adjacent before calling this method!
|
||||
*
|
||||
* @param other the chunk to merge with this chunk
|
||||
* @return a new chunk representing the union of the two chunks (provided the chunks were
|
||||
* overlapping/adjacent)
|
||||
*/
|
||||
public GATKChunk merge ( GATKChunk other ) {
|
||||
return new GATKChunk(Math.min(getChunkStart(), other.getChunkStart()), Math.max(getChunkEnd(), other.getChunkEnd()));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,10 @@ public class GATKBAMFileSpanUnitTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfOverlappingFileSpans() {
|
||||
public void testUnionOfContiguousFileSpans() {
|
||||
// Region 1 ends at position adjacent to Region 2 start:
|
||||
// |---1----|---2----|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,1<<16));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
|
|
@ -58,6 +61,71 @@ public class GATKBAMFileSpanUnitTest {
|
|||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfFileSpansFirstRegionEndsWithinSecondRegion() {
|
||||
// Region 1 ends within Region 2:
|
||||
// |---2----|
|
||||
// |---1----|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|32767));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
|
||||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfFileSpansFirstRegionEndsAtSecondRegionEnd() {
|
||||
// Region 1 ends at Region 2 end:
|
||||
// |---2----|
|
||||
// |---1-----------|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
|
||||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfFileSpansFirstRegionEndsAfterSecondRegionEnd() {
|
||||
// Region 1 ends after Region 2 end:
|
||||
// |---2----|
|
||||
// |---1---------------|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(0,(1<<16)|65535));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
|
||||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(0,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfFileSpansFirstRegionStartsAtSecondRegionStart() {
|
||||
// Region 1 starts at Region 2 start, but ends before Region 2:
|
||||
// |---2--------|
|
||||
// |---1----|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|32767));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
|
||||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfFileSpansFirstRegionEqualToSecondRegion() {
|
||||
// Region 1 and Region 2 represent the same region:
|
||||
// |---2----|
|
||||
// |---1----|
|
||||
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan regionTwo = new GATKBAMFileSpan(new GATKChunk(1<<16,(1<<16)|65535));
|
||||
GATKBAMFileSpan union = regionOne.union(regionTwo);
|
||||
Assert.assertEquals(union.getGATKChunks().size(),1,"Elements to be merged were not.");
|
||||
Assert.assertEquals(union.getGATKChunks().get(0),new GATKChunk(1<<16,(1<<16)|65535));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnionOfStringOfFileSpans() {
|
||||
GATKBAMFileSpan regionOne = new GATKBAMFileSpan(new GATKChunk[] { new GATKChunk(0,1<<16), new GATKChunk(2<<16,3<<16) });
|
||||
|
|
|
|||
Loading…
Reference in New Issue