diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java index a6fa87fd3..6029ea065 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java @@ -53,10 +53,4 @@ public interface BAMFormatAwareShard extends Shard { * @return An iterator over the reads stored in the shard. */ public StingSAMIterator iterator(); - - /** - * Gets any filter associated with this shard. Useful for filtering out overlaps, etc. - * @return A filter if one exists. Null if not. - */ - public SamRecordFilter getFilter(); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java index 46f62877e..ba8eebb68 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java @@ -109,15 +109,6 @@ public class LocusShard implements BAMFormatAwareShard { @Override public StingSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } - /** - * Gets a filter testing for overlap of this read with the given shard. - * @return A filter capable of filtering out reads outside a given shard. - */ - @Override - public SamRecordFilter getFilter() { - return new ReadOverlapFilter(loci); - } - /** * returns the type of shard. */ diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java deleted file mode 100644 index d4544a8bd..000000000 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.shards; - -import net.sf.picard.filter.SamRecordFilter; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.List; - -/** - * Filters reads out of a data stream that don't overlap with the given list of locations. - * - * @author mhanna - * @version 0.1 - */ -public class ReadOverlapFilter implements SamRecordFilter { - /** - * The list of locations containing reads to keep. - */ - private final List intervals; - - public ReadOverlapFilter(List intervals) { - this.intervals = intervals; - } - - /** - * Filter out this record if it doesn't appear in the interval list. - * @param read The read to examine. - * @return True to filter the read out. False otherwise. - */ - public boolean filterOut(SAMRecord read) { - for(GenomeLoc interval: intervals) { - if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) || - (read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) || - (read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop())) - return false; - } - return true; - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java index 9ebe52e3a..9245faa96 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java @@ -49,20 +49,20 @@ public class ReadShard implements BAMFormatAwareShard { */ private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS); + /** + * currently our location + */ + private final List loci; + /** * Statistics about which reads in this shards were used and which were filtered away. */ private final ReadMetrics readMetrics = new ReadMetrics(); - /** - * The filter to be applied to all reads meeting this criteria. - */ - private final SamRecordFilter filter; - - public ReadShard(SAMDataSource readsDataSource, Map fileSpans, SamRecordFilter filter) { + public ReadShard(SAMDataSource readsDataSource, Map fileSpans, List loci) { this.readsDataSource = readsDataSource; this.fileSpans = fileSpans; - this.filter = filter; + this.loci = loci; } /** @@ -85,7 +85,7 @@ public class ReadShard implements BAMFormatAwareShard { /** @return the genome location represented by this shard */ @Override public List getGenomeLocs() { - throw new UnsupportedOperationException("ReadShard isn't genome loc aware"); + return loci; } /** @@ -136,11 +136,6 @@ public class ReadShard implements BAMFormatAwareShard { return StingSAMIteratorAdapter.adapt(reads.iterator()); } - @Override - public SamRecordFilter getFilter() { - return filter; - } - /** * what kind of shard do we return * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java index 4fc928ccc..18517d1cd 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java @@ -124,7 +124,6 @@ public class ReadShardStrategy implements ShardStrategy { public void advance() { Map shardPosition = new HashMap(); nextShard = null; - SamRecordFilter filter = null; if(locations != null) { Map selectedReaders = new HashMap(); @@ -137,8 +136,7 @@ public class ReadShardStrategy implements ShardStrategy { } if(selectedReaders.size() > 0) { - filter = new ReadOverlapFilter(currentFilePointer.locations); - BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,filter); + BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations); dataSource.fillShard(shard); if(!shard.isBufferEmpty()) { @@ -152,7 +150,7 @@ public class ReadShardStrategy implements ShardStrategy { } } else { - BAMFormatAwareShard shard = new ReadShard(dataSource,position,filter); + BAMFormatAwareShard shard = new ReadShard(dataSource,position,null); dataSource.fillShard(shard); nextShard = !shard.isBufferEmpty() ? shard : null; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 64b2340de..4e567ed54 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -434,14 +435,15 @@ public class SAMDataSource implements SimpleDataSource { // Set up merging to dynamically merge together multiple BAMs. MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); + for(SAMReaderID id: getReaderIDs()) { if(shard.getFileSpans().get(id) == null) continue; - CloseableIterator iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + CloseableIterator iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(readProperties.getReadBufferSize() != null) iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); - if(shard.getFilter() != null) - iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads + if(shard.getGenomeLocs() != null) + iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); mergingIterator.addIterator(readers.getReader(id),iterator); } @@ -733,6 +735,100 @@ public class SAMDataSource implements SimpleDataSource { * Maps read groups in the original SAMFileReaders to read groups in */ private class ReadGroupMapping extends HashMap {} + + /** + * Filters out reads that do not overlap the current GenomeLoc. + * Note the custom implementation: BAM index querying returns all reads that could + * possibly overlap the given region (and quite a few extras). In order not to drag + * down performance, this implementation is highly customized to its task. + */ + private class IntervalOverlapFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private CloseableIterator iterator; + + /** + * The next read, queued up and ready to go. + */ + private SAMRecord nextRead; + + /** + * Custom representation of interval bounds. + * Makes it simpler to track current position. + */ + private int[] intervalStarts; + private int[] intervalEnds; + + /** + * Position within the interval list. + */ + private int currentBound = 0; + + public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { + this.iterator = iterator; + this.intervalStarts = new int[intervals.size()]; + this.intervalEnds = new int[intervals.size()]; + int i = 0; + for(GenomeLoc interval: intervals) { + intervalStarts[i] = (int)interval.getStart(); + intervalEnds[i] = (int)interval.getStop(); + i++; + } + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(nextRead == null) + throw new NoSuchElementException("No more reads left in this iterator."); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); + } + + + public void close() { + iterator.close(); + } + + private void advance() { + nextRead = null; + + if(!iterator.hasNext()) + return; + + SAMRecord candidateRead = iterator.next(); + while(nextRead == null && currentBound < intervalStarts.length) { + if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || + (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { + // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. + // Promising, but this read must be checked against the ending bound. + if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { + // Yes, this read is within both bounds. This must be our next read. + nextRead = candidateRead; + break; + } + else { + // Oops, we're past the end bound. Increment the current bound and try again. + currentBound++; + continue; + } + } + + // No reasonable read found; advance the iterator. + if(iterator.hasNext()) + candidateRead = iterator.next(); + } + } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 6ccfe1396..fb13a8dab 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger;