Fix for some correctness bugs found during early performance testing, phase 1.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2822 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-10 22:32:25 +00:00
parent 60f05379a7
commit dc885ba386
5 changed files with 107 additions and 29 deletions

View File

@ -36,7 +36,7 @@ import net.sf.samtools.util.CloseableIterator;
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator implements Iterator<SAMRecord> {
public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
private final PriorityQueue<ComparableSamRecordIterator> pq;
private final SamFileHeaderMerger samHeaderMerger;
private final SAMFileHeader.SortOrder sortOrder;
@ -44,7 +44,7 @@ public class MergingSamRecordIterator implements Iterator<SAMRecord> {
/**
* Maps iterators back to the readers from which they are derived.
*/
private final Map<Iterator<SAMRecord>,SAMFileReader> iteratorToSourceMap = new HashMap<Iterator<SAMRecord>,SAMFileReader>();
private final Map<CloseableIterator<SAMRecord>,SAMFileReader> iteratorToSourceMap = new HashMap<CloseableIterator<SAMRecord>,SAMFileReader>();
/**
* Constructs a new merging iterator with the same set of readers and sort order as
@ -94,6 +94,15 @@ public class MergingSamRecordIterator implements Iterator<SAMRecord> {
return readerToIteratorMap;
}
/**
* Close down all open iterators.
*/
public void close() {
// Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue.
for(CloseableIterator<SAMRecord> iterator: pq)
iterator.close();
}
/** Returns true if any of the underlying iterators has more records, otherwise false. */
public boolean hasNext() {
return !this.pq.isEmpty();

View File

@ -31,10 +31,7 @@ import net.sf.samtools.util.StringLineReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.*;
import java.net.URL;
/**
@ -185,7 +182,7 @@ class BAMFileReader2
}
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
List<Bin> bins = null;
List<Bin> bins = Collections.emptyList();
final SAMFileHeader fileHeader = getFileHeader();
int referenceIndex = fileHeader.getSequenceIndex(sequence);
@ -402,6 +399,8 @@ class BAMFileReader2
public SAMRecord next() {
final SAMRecord result = mNextRecord;
if(result.getAlignmentStart() <= 11632602 && result.getAlignmentEnd() >= 11632602)
System.out.printf("11632602: %s%n", result.getReadName());
advance();
return result;
}

View File

@ -112,8 +112,7 @@ public abstract class LocusView extends LocusIterator implements View {
* @return True if another locus context is bounded by this shard.
*/
protected boolean hasNextLocus() {
GenomeLoc lastLocus = !shard.getGenomeLocs().isEmpty() ? shard.getGenomeLocs().get(shard.getGenomeLocs().size()-1) : null;
return nextLocus != null && (lastLocus == null || !nextLocus.getLocation().isPast(lastLocus));
return nextLocus != null;
}
/**
@ -122,25 +121,17 @@ public abstract class LocusView extends LocusIterator implements View {
* @throw NoSuchElementException if the next element is missing.
*/
protected AlignmentContext nextLocus() {
GenomeLoc lastLocus = !shard.getGenomeLocs().isEmpty() ? shard.getGenomeLocs().get(shard.getGenomeLocs().size()-1) : null;
if( nextLocus == null || (lastLocus != null && nextLocus.getLocation().isPast(lastLocus)) )
if(nextLocus == null)
throw new NoSuchElementException("No more elements remain in locus context queue.");
// Cache the current and apply filtering.
AlignmentContext current = nextLocus;
// Find the next.
if( loci.hasNext() ) {
nextLocus = loci.next();
if( sourceInfo.getDownsampleToCoverage() != null )
current.downsampleToCoverage( sourceInfo.getDownsampleToCoverage() );
if( lastLocus != null && nextLocus.getLocation().isPast(lastLocus) )
nextLocus = null;
}
else
nextLocus = null;
seedNextLocus();
if( sourceInfo.getDownsampleToCoverage() != null )
current.downsampleToCoverage( sourceInfo.getDownsampleToCoverage() );
// if the current loci isn't null, get the overflow tracker and pass it to the alignment context
if ((this.loci != null))
current.setLocusOverflowTracker(loci.getLocusOverflowTracker());
@ -152,21 +143,56 @@ public abstract class LocusView extends LocusIterator implements View {
*/
private void seedNextLocus() {
//System.out.printf("loci is %s%n", loci);
if( loci.hasNext() )
nextLocus = loci.next();
if( !loci.hasNext() ) {
nextLocus = null;
return;
}
nextLocus = loci.next();
// If the location of this shard is available, trim the data stream to match the shard.
if(!shard.getGenomeLocs().isEmpty()) {
// Iterate past cruft at the beginning to the first locus in the shard.
while( nextLocus != null && nextLocus.getLocation().isBefore(shard.getGenomeLocs().get(0)) && loci.hasNext() )
// Iterate through any elements not contained within this shard.
while( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) && loci.hasNext() )
nextLocus = loci.next();
// If nothing in the shard was found, indicate that by setting nextAlignmentContext to null.
if( nextLocus != null && nextLocus.getLocation().isBefore(shard.getGenomeLocs().get(0)) )
if( nextLocus != null && (isBeforeShard(nextLocus.getLocation()) || isAfterShard(nextLocus.getLocation())) )
nextLocus = null;
}
}
/**
* Is this location before the given shard.
* @param location Location to check.
* @return True if the given location is before the start of the shard. False otherwise.
*/
private boolean isBeforeShard(GenomeLoc location) {
return location.isBefore(shard.getGenomeLocs().get(0));
}
/**
* Is this location after the given shard.
* @param location Location to check.
* @return True if the given location is after the end of the shard. False otherwise.
*/
private boolean isAfterShard(GenomeLoc location) {
return location.isPast(shard.getGenomeLocs().get(shard.getGenomeLocs().size()-1));
}
/**
* Is this location contained in the given shard.
* @param location Location to check.
* @return True if the given location is contained within the shard. False otherwise.
*/
private boolean isContainedInShard(GenomeLoc location) {
for(GenomeLoc shardLocation: shard.getGenomeLocs()) {
if(shardLocation.containsP(location))
return true;
}
return false;
}
/**
* Class to filter out un-handle-able reads from the stream. We currently are skipping
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.

View File

@ -11,6 +11,7 @@ import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.filter.FilteringIterator;
import java.util.*;
import java.io.File;
@ -127,9 +128,12 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
readerToIteratorMap.put(reader,reader.iterator(chunks));
}
MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger,readerToIteratorMap,true);
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readerToIteratorMap,true);
FilteringIterator filteringIterator = new FilteringIterator(mergingIterator,new IntervalOverlappingFilter(shard.getGenomeLocs()));
return applyDecoratingIterators(enableVerification,
StingSAMIteratorAdapter.adapt(reads,iterator),
StingSAMIteratorAdapter.adapt(reads,filteringIterator),
reads.getDownsamplingFraction(),
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters());

View File

@ -0,0 +1,40 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.List;
/**
* Filters reads out of a data stream that don't overlap with the given list of locations.
*
* @author mhanna
* @version 0.1
*/
public class IntervalOverlappingFilter implements SamRecordFilter {
/**
* The list of locations containing reads to keep.
*/
private final List<GenomeLoc> intervals;
public IntervalOverlappingFilter(List<GenomeLoc> intervals) {
this.intervals = intervals;
}
/**
* Filter out this record if it doesn't appear in the interval list.
* @param read The read to examine.
* @return True to filter the read out. False otherwise.
*/
public boolean filterOut(SAMRecord read) {
GenomeLoc readLocation = GenomeLocParser.createGenomeLoc(read);
for(GenomeLoc interval: intervals) {
if(interval.overlapsP(readLocation))
return false;
}
return true;
}
}