Support for read interval sharding with proper filtering.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2902 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-27 20:26:34 +00:00
parent d8fedd59be
commit 80f5d2829d
6 changed files with 53 additions and 16 deletions

View File

@ -3,11 +3,11 @@ package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.Chunk;
import net.sf.samtools.SAMFileReader2;
import net.sf.samtools.SAMRecord;
import net.sf.picard.filter.SamRecordFilter;
import java.util.List;
import java.util.Map;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
/**
@ -48,4 +48,10 @@ public interface BAMFormatAwareShard extends Shard {
* @return An iterator over the reads stored in the shard.
*/
public StingSAMIterator iterator();
/**
* Gets any filter associated with this shard. Useful for filtering out overlaps, etc.
* Not every shard carries a filter, so callers should check the result for null before applying it.
* @return A filter if one exists. Null if not.
*/
public SamRecordFilter getFilter();
}

View File

@ -3,10 +3,10 @@ package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.Chunk;
import net.sf.samtools.SAMFileReader2;
import net.sf.samtools.SAMRecord;
import net.sf.picard.filter.SamRecordFilter;
import java.util.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
@ -34,15 +34,21 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware
*/
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(BlockDelimitedReadShardStrategy.MAX_READS);
/**
* The filter to be applied to the reads in this shard. May be null if no filter was supplied.
*/
private final SamRecordFilter filter;
/**
* A BlockDelimitedReadShard can be used for more than one shard type.
* Track which type is being used.
*/
private final Shard.ShardType shardType;
public BlockDelimitedReadShard(Reads sourceInfo, Map<SAMFileReader2,List<Chunk>> chunks, Shard.ShardType shardType) {
public BlockDelimitedReadShard(Reads sourceInfo, Map<SAMFileReader2,List<Chunk>> chunks, SamRecordFilter filter, Shard.ShardType shardType) {
this.sourceInfo = sourceInfo;
this.chunks = chunks;
this.filter = filter;
this.shardType = shardType;
}
@ -73,10 +79,18 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware
reads.add(read);
}
/**
* Creates an iterator over reads stored in this shard's read cache.
* @return An iterator over the cached reads, adapted to a StingSAMIterator using this shard's source info.
*/
public StingSAMIterator iterator() {
return StingSAMIteratorAdapter.adapt(sourceInfo,reads.iterator());
}
/**
* Gets the filter that was supplied to this shard at construction.
* @return The filter passed to the constructor. Null if the shard was created without one.
*/
public SamRecordFilter getFilter() {
return filter;
}
/**
* Get the list of chunks delimiting this shard.
* @return a list of chunks that contain data for this shard.

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.*;
import net.sf.picard.filter.SamRecordFilter;
import java.util.*;
@ -72,6 +73,8 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard.");
Map<SAMFileReader2,List<Chunk>> shardPosition = null;
SamRecordFilter filter = null;
if(!filePointers.isEmpty()) {
boolean foundData = false;
for(FilePointer filePointer: filePointers) {
@ -89,8 +92,10 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
}
}
}
if(foundData)
if(foundData) {
filter = new ReadOverlapFilter(filePointer.locations);
break;
}
}
}
else {
@ -98,9 +103,10 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
shardPosition = new HashMap<SAMFileReader2,List<Chunk>>();
for(Map.Entry<SAMFileReader2,Chunk> entry: position.entrySet())
shardPosition.put(entry.getKey(),Collections.singletonList(entry.getValue()));
filter = null;
}
BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,Shard.ShardType.READ);
BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,filter,Shard.ShardType.READ);
atEndOfStream = dataSource.fillShard(shard);
this.position = dataSource.getCurrentPosition();

View File

@ -2,11 +2,11 @@ package org.broadinstitute.sting.gatk.datasources.shards;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import net.sf.samtools.Chunk;
import net.sf.samtools.SAMFileReader2;
import net.sf.samtools.SAMRecord;
import net.sf.picard.filter.SamRecordFilter;
import java.util.List;
import java.util.Map;
@ -66,6 +66,14 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa
this.shardType = shardType;
}
/**
* Gets the chunks associated with this locus shard.
* @return A map from each SAMFileReader2 to the list of chunks to use when retrieving locus data from that reader.
*/
public Map<SAMFileReader2,List<Chunk>> getChunks() {
return chunks;
}
/**
* Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations.
@ -85,14 +93,18 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa
*/
public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
/**
* Gets the iterator over the elements cached in the shard.
* This shard does not buffer reads, so the operation is unsupported.
* @return Never returns normally.
* @throws UnsupportedOperationException always.
*/
public StingSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); }
/**
* Gets the chunks associated with this locus shard.
* @return A list of the chunks to use when retrieving locus data.
* Gets a filter testing whether a read overlaps the loci of this shard.
* @return A filter capable of filtering out reads outside a given shard.
*/
public Map<SAMFileReader2,List<Chunk>> getChunks() {
return chunks;
public SamRecordFilter getFilter() {
return new ReadOverlapFilter(loci);
}
/**

View File

@ -1,9 +1,8 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.List;
@ -13,13 +12,13 @@ import java.util.List;
* @author mhanna
* @version 0.1
*/
public class IntervalOverlappingFilter implements SamRecordFilter {
public class ReadOverlapFilter implements SamRecordFilter {
/**
* The list of locations containing reads to keep.
*/
private final List<GenomeLoc> intervals;
public IntervalOverlappingFilter(List<GenomeLoc> intervals) {
public ReadOverlapFilter(List<GenomeLoc> intervals) {
this.intervals = intervals;
}

View File

@ -198,8 +198,8 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
CloseableIterator<SAMRecord> iterator = new MergingSamRecordIterator(headerMerger,readerToIteratorMap,true);
if(addIntervalFilter)
iterator = new FilteringIterator(iterator,new IntervalOverlappingFilter(shard.getGenomeLocs()));
if(shard.getFilter() != null)
iterator = new FilteringIterator(iterator,shard.getFilter());
return applyDecoratingIterators(enableVerification,
StingSAMIteratorAdapter.adapt(reads,iterator),