Fix for GATK slowdowns at the ends of intervals.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4514 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5889138f4a
commit
6af9532090
|
|
@ -53,10 +53,4 @@ public interface BAMFormatAwareShard extends Shard {
|
|||
* @return An iterator over the reads stored in the shard.
|
||||
*/
|
||||
public StingSAMIterator iterator();
|
||||
|
||||
/**
|
||||
* Gets any filter associated with this shard. Useful for filtering out overlaps, etc.
|
||||
* @return A filter if one exists. Null if not.
|
||||
*/
|
||||
public SamRecordFilter getFilter();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -109,15 +109,6 @@ public class LocusShard implements BAMFormatAwareShard {
|
|||
@Override
|
||||
public StingSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); }
|
||||
|
||||
/**
|
||||
* Gets a filter testing for overlap of this read with the given shard.
|
||||
* @return A filter capable of filtering out reads outside a given shard.
|
||||
*/
|
||||
@Override
|
||||
public SamRecordFilter getFilter() {
|
||||
return new ReadOverlapFilter(loci);
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the type of shard.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Filters reads out of a data stream that don't overlap with the given list of locations.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ReadOverlapFilter implements SamRecordFilter {
|
||||
/**
|
||||
* The list of locations containing reads to keep.
|
||||
*/
|
||||
private final List<GenomeLoc> intervals;
|
||||
|
||||
public ReadOverlapFilter(List<GenomeLoc> intervals) {
|
||||
this.intervals = intervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter out this record if it doesn't appear in the interval list.
|
||||
* @param read The read to examine.
|
||||
* @return True to filter the read out. False otherwise.
|
||||
*/
|
||||
public boolean filterOut(SAMRecord read) {
|
||||
for(GenomeLoc interval: intervals) {
|
||||
if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) ||
|
||||
(read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) ||
|
||||
(read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop()))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -49,20 +49,20 @@ public class ReadShard implements BAMFormatAwareShard {
|
|||
*/
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
|
||||
|
||||
/**
|
||||
* currently our location
|
||||
*/
|
||||
private final List<GenomeLoc> loci;
|
||||
|
||||
/**
|
||||
* Statistics about which reads in this shards were used and which were filtered away.
|
||||
*/
|
||||
private final ReadMetrics readMetrics = new ReadMetrics();
|
||||
|
||||
/**
|
||||
* The filter to be applied to all reads meeting this criteria.
|
||||
*/
|
||||
private final SamRecordFilter filter;
|
||||
|
||||
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, SamRecordFilter filter) {
|
||||
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci) {
|
||||
this.readsDataSource = readsDataSource;
|
||||
this.fileSpans = fileSpans;
|
||||
this.filter = filter;
|
||||
this.loci = loci;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -85,7 +85,7 @@ public class ReadShard implements BAMFormatAwareShard {
|
|||
/** @return the genome location represented by this shard */
|
||||
@Override
|
||||
public List<GenomeLoc> getGenomeLocs() {
|
||||
throw new UnsupportedOperationException("ReadShard isn't genome loc aware");
|
||||
return loci;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -136,11 +136,6 @@ public class ReadShard implements BAMFormatAwareShard {
|
|||
return StingSAMIteratorAdapter.adapt(reads.iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
public SamRecordFilter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* what kind of shard do we return
|
||||
*
|
||||
|
|
|
|||
|
|
@ -124,7 +124,6 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
public void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
nextShard = null;
|
||||
SamRecordFilter filter = null;
|
||||
|
||||
if(locations != null) {
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
|
|
@ -137,8 +136,7 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
filter = new ReadOverlapFilter(currentFilePointer.locations);
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,filter);
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations);
|
||||
dataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
|
|
@ -152,7 +150,7 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
}
|
||||
}
|
||||
else {
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,position,filter);
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null);
|
||||
dataSource.fillShard(shard);
|
||||
nextShard = !shard.isBufferEmpty() ? shard : null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import org.broadinstitute.sting.gatk.ReadProperties;
|
|||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -434,14 +435,15 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
// Set up merging to dynamically merge together multiple BAMs.
|
||||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
|
||||
for(SAMReaderID id: getReaderIDs()) {
|
||||
if(shard.getFileSpans().get(id) == null)
|
||||
continue;
|
||||
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
if(readProperties.getReadBufferSize() != null)
|
||||
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
||||
if(shard.getFilter() != null)
|
||||
iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads
|
||||
if(shard.getGenomeLocs() != null)
|
||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||
mergingIterator.addIterator(readers.getReader(id),iterator);
|
||||
}
|
||||
|
||||
|
|
@ -733,6 +735,100 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* Maps read groups in the original SAMFileReaders to read groups in
|
||||
*/
|
||||
private class ReadGroupMapping extends HashMap<String,String> {}
|
||||
|
||||
/**
|
||||
* Filters out reads that do not overlap the current GenomeLoc.
|
||||
* Note the custom implementation: BAM index querying returns all reads that could
|
||||
* possibly overlap the given region (and quite a few extras). In order not to drag
|
||||
* down performance, this implementation is highly customized to its task.
|
||||
*/
|
||||
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
* The wrapped iterator.
|
||||
*/
|
||||
private CloseableIterator<SAMRecord> iterator;
|
||||
|
||||
/**
|
||||
* The next read, queued up and ready to go.
|
||||
*/
|
||||
private SAMRecord nextRead;
|
||||
|
||||
/**
|
||||
* Custom representation of interval bounds.
|
||||
* Makes it simpler to track current position.
|
||||
*/
|
||||
private int[] intervalStarts;
|
||||
private int[] intervalEnds;
|
||||
|
||||
/**
|
||||
* Position within the interval list.
|
||||
*/
|
||||
private int currentBound = 0;
|
||||
|
||||
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
|
||||
this.iterator = iterator;
|
||||
this.intervalStarts = new int[intervals.size()];
|
||||
this.intervalEnds = new int[intervals.size()];
|
||||
int i = 0;
|
||||
for(GenomeLoc interval: intervals) {
|
||||
intervalStarts[i] = (int)interval.getStart();
|
||||
intervalEnds[i] = (int)interval.getStop();
|
||||
i++;
|
||||
}
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextRead != null;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if(nextRead == null)
|
||||
throw new NoSuchElementException("No more reads left in this iterator.");
|
||||
SAMRecord currentRead = nextRead;
|
||||
advance();
|
||||
return currentRead;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
|
||||
}
|
||||
|
||||
|
||||
public void close() {
|
||||
iterator.close();
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
nextRead = null;
|
||||
|
||||
if(!iterator.hasNext())
|
||||
return;
|
||||
|
||||
SAMRecord candidateRead = iterator.next();
|
||||
while(nextRead == null && currentBound < intervalStarts.length) {
|
||||
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
|
||||
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
|
||||
// Promising, but this read must be checked against the ending bound.
|
||||
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
|
||||
// Yes, this read is within both bounds. This must be our next read.
|
||||
nextRead = candidateRead;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
// Oops, we're past the end bound. Increment the current bound and try again.
|
||||
currentBound++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// No reasonable read found; advance the iterator.
|
||||
if(iterator.hasNext())
|
||||
candidateRead = iterator.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
|
|||
Loading…
Reference in New Issue