Fix a nasty little bug in the sharding system: if the last shard in contig n
overlaps exactly on disk with the first shard in contig n+1, the shards would be merged together to avoid duplicate extraction. Unfortunately, the interval overlap filter couldn't handle shards spanning contigs, and was choosing to filter out reads from contig n+1 which should have been included. I'm not completely sure why the BAM indexing code would ever specify that the end of one chromosome had the same on-disk location as the start of the next one. I suspect that this is a indexer performance bug.
This commit is contained in:
parent
088fc39308
commit
fec495e292
|
|
@ -893,6 +893,7 @@ public class SAMDataSource {
|
|||
* Custom representation of interval bounds.
|
||||
* Makes it simpler to track current position.
|
||||
*/
|
||||
private int[] intervalContigIndices;
|
||||
private int[] intervalStarts;
|
||||
private int[] intervalEnds;
|
||||
|
||||
|
|
@ -917,12 +918,14 @@ public class SAMDataSource {
|
|||
if(foundMappedIntervals) {
|
||||
if(keepOnlyUnmappedReads)
|
||||
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
|
||||
this.intervalContigIndices = new int[intervals.size()];
|
||||
this.intervalStarts = new int[intervals.size()];
|
||||
this.intervalEnds = new int[intervals.size()];
|
||||
int i = 0;
|
||||
for(GenomeLoc interval: intervals) {
|
||||
intervalStarts[i] = (int)interval.getStart();
|
||||
intervalEnds[i] = (int)interval.getStop();
|
||||
intervalContigIndices[i] = interval.getContigIndex();
|
||||
intervalStarts[i] = interval.getStart();
|
||||
intervalEnds[i] = interval.getStop();
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
|
@ -961,11 +964,10 @@ public class SAMDataSource {
|
|||
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
|
||||
if(!keepOnlyUnmappedReads) {
|
||||
// Mapped read filter; check against GenomeLoc-derived bounds.
|
||||
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
|
||||
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
|
||||
if(readEndsOnOrAfterStartingBound(candidateRead)) {
|
||||
// This read ends after the current interval begins.
|
||||
// Promising, but this read must be checked against the ending bound.
|
||||
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
|
||||
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
|
||||
// Yes, this read is within both bounds. This must be our next read.
|
||||
nextRead = candidateRead;
|
||||
break;
|
||||
|
|
@ -993,6 +995,37 @@ public class SAMDataSource {
|
|||
candidateRead = iterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
|
||||
* end will be distorted, so rely only on the alignment start.
|
||||
* @param read The read to position-check.
|
||||
* @return True if the read starts after the current bounds. False otherwise.
|
||||
*/
|
||||
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
|
||||
return
|
||||
// Read ends on a later contig, or...
|
||||
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
|
||||
// Read ends of this contig...
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||
// either after this location, or...
|
||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether the read lies before the end of the current bound.
|
||||
* @param read The read to position-check.
|
||||
* @return True if the read starts after the current bounds. False otherwise.
|
||||
*/
|
||||
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
|
||||
return
|
||||
// Read starts on a prior contig, or...
|
||||
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
|
||||
// Read starts on this contig and the alignment start is registered before this end point.
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue