Fix for GATK slowdowns at the ends of intervals.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4514 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-10-17 23:21:23 +00:00
parent 5889138f4a
commit 6af9532090
7 changed files with 133 additions and 75 deletions

View File

@ -53,10 +53,4 @@ public interface BAMFormatAwareShard extends Shard {
* @return An iterator over the reads stored in the shard.
*/
public StingSAMIterator iterator();
/**
* Gets any filter associated with this shard. Useful for filtering out overlaps, etc.
* @return A filter if one exists. Null if not.
*/
public SamRecordFilter getFilter();
}

View File

@ -109,15 +109,6 @@ public class LocusShard implements BAMFormatAwareShard {
@Override
public StingSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); }
/**
* Gets a filter testing for overlap of this read with the given shard.
* @return A filter capable of filtering out reads outside a given shard.
*/
@Override
public SamRecordFilter getFilter() {
return new ReadOverlapFilter(loci);
}
/**
* returns the type of shard.
*/

View File

@ -1,40 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
* Filters reads out of a data stream that don't overlap with the given list of locations.
*
* @author mhanna
* @version 0.1
*/
public class ReadOverlapFilter implements SamRecordFilter {
/**
* The list of locations containing reads to keep.
*/
private final List<GenomeLoc> intervals;
public ReadOverlapFilter(List<GenomeLoc> intervals) {
this.intervals = intervals;
}
/**
* Filter out this record if it doesn't appear in the interval list.
* @param read The read to examine.
* @return True to filter the read out. False otherwise.
*/
public boolean filterOut(SAMRecord read) {
for(GenomeLoc interval: intervals) {
if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) ||
(read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) ||
(read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop()))
return false;
}
return true;
}
}

View File

@ -49,20 +49,20 @@ public class ReadShard implements BAMFormatAwareShard {
*/
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
/**
* currently our location
*/
private final List<GenomeLoc> loci;
/**
* Statistics about which reads in this shards were used and which were filtered away.
*/
private final ReadMetrics readMetrics = new ReadMetrics();
/**
* The filter to be applied to all reads meeting this criteria.
*/
private final SamRecordFilter filter;
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, SamRecordFilter filter) {
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci) {
this.readsDataSource = readsDataSource;
this.fileSpans = fileSpans;
this.filter = filter;
this.loci = loci;
}
/**
@ -85,7 +85,7 @@ public class ReadShard implements BAMFormatAwareShard {
/** @return the genome location represented by this shard */
@Override
public List<GenomeLoc> getGenomeLocs() {
throw new UnsupportedOperationException("ReadShard isn't genome loc aware");
return loci;
}
/**
@ -136,11 +136,6 @@ public class ReadShard implements BAMFormatAwareShard {
return StingSAMIteratorAdapter.adapt(reads.iterator());
}
@Override
public SamRecordFilter getFilter() {
return filter;
}
/**
* what kind of shard do we return
*

View File

@ -124,7 +124,6 @@ public class ReadShardStrategy implements ShardStrategy {
public void advance() {
Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
nextShard = null;
SamRecordFilter filter = null;
if(locations != null) {
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
@ -137,8 +136,7 @@ public class ReadShardStrategy implements ShardStrategy {
}
if(selectedReaders.size() > 0) {
filter = new ReadOverlapFilter(currentFilePointer.locations);
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,filter);
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations);
dataSource.fillShard(shard);
if(!shard.isBufferEmpty()) {
@ -152,7 +150,7 @@ public class ReadShardStrategy implements ShardStrategy {
}
}
else {
BAMFormatAwareShard shard = new ReadShard(dataSource,position,filter);
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null);
dataSource.fillShard(shard);
nextShard = !shard.isBufferEmpty() ? shard : null;
}

View File

@ -42,6 +42,7 @@ import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -434,14 +435,15 @@ public class SAMDataSource implements SimpleDataSource {
// Set up merging to dynamically merge together multiple BAMs.
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
for(SAMReaderID id: getReaderIDs()) {
if(shard.getFileSpans().get(id) == null)
continue;
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
if(shard.getFilter() != null)
iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads
if(shard.getGenomeLocs() != null)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
mergingIterator.addIterator(readers.getReader(id),iterator);
}
@ -733,6 +735,100 @@ public class SAMDataSource implements SimpleDataSource {
* Maps read groups in the original SAMFileReaders to read groups in
*/
private class ReadGroupMapping extends HashMap<String,String> {}
/**
* Filters out reads that do not overlap the current GenomeLoc.
* Note the custom implementation: BAM index querying returns all reads that could
* possibly overlap the given region (and quite a few extras). In order not to drag
* down performance, this implementation is highly customized to its task.
*/
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private CloseableIterator<SAMRecord> iterator;
/**
* The next read, queued up and ready to go.
*/
private SAMRecord nextRead;
/**
* Custom representation of interval bounds.
* Makes it simpler to track current position.
*/
private int[] intervalStarts;
private int[] intervalEnds;
/**
* Position within the interval list.
*/
private int currentBound = 0;
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator;
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalStarts[i] = (int)interval.getStart();
intervalEnds[i] = (int)interval.getStop();
i++;
}
advance();
}
public boolean hasNext() {
return nextRead != null;
}
public SAMRecord next() {
if(nextRead == null)
throw new NoSuchElementException("No more reads left in this iterator.");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
}
public void close() {
iterator.close();
}
private void advance() {
nextRead = null;
if(!iterator.hasNext())
return;
SAMRecord candidateRead = iterator.next();
while(nextRead == null && currentBound < intervalStarts.length) {
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
// Promising, but this read must be checked against the ending bound.
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
}
// No reasonable read found; advance the iterator.
if(iterator.hasNext())
candidateRead = iterator.next();
}
}
}
}

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;