From 80f5d2829de0755f15fd7f0e1d2f7b98a67ee63b Mon Sep 17 00:00:00 2001 From: hanna Date: Sat, 27 Feb 2010 20:26:34 +0000 Subject: [PATCH] Support for read interval sharding with proper filtering. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2902 348d0f76-0448-11de-a6fe-93d51630548a --- .../shards/BAMFormatAwareShard.java | 8 ++++++- .../shards/BlockDelimitedReadShard.java | 18 +++++++++++++-- .../BlockDelimitedReadShardStrategy.java | 10 +++++++-- .../shards/IndexDelimitedLocusShard.java | 22 ++++++++++++++----- .../ReadOverlapFilter.java} | 7 +++--- .../BlockDrivenSAMDataSource.java | 4 ++-- 6 files changed, 53 insertions(+), 16 deletions(-) rename java/src/org/broadinstitute/sting/gatk/datasources/{simpleDataSources/IntervalOverlappingFilter.java => shards/ReadOverlapFilter.java} (81%) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java index ca475b875..cf87aa46d 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java @@ -3,11 +3,11 @@ package org.broadinstitute.sting.gatk.datasources.shards; import net.sf.samtools.Chunk; import net.sf.samtools.SAMFileReader2; import net.sf.samtools.SAMRecord; +import net.sf.picard.filter.SamRecordFilter; import java.util.List; import java.util.Map; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; /** @@ -48,4 +48,10 @@ public interface BAMFormatAwareShard extends Shard { * @return An iterator over the reads stored in the shard. */ public StingSAMIterator iterator(); + + /** + * Gets any filter associated with this shard. Useful for filtering out overlaps, etc. + * @return A filter if one exists. Null if not. 
+ */ + public SamRecordFilter getFilter(); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java index aea3a1d32..74e066bb3 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java @@ -3,10 +3,10 @@ package org.broadinstitute.sting.gatk.datasources.shards; import net.sf.samtools.Chunk; import net.sf.samtools.SAMFileReader2; import net.sf.samtools.SAMRecord; +import net.sf.picard.filter.SamRecordFilter; import java.util.*; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; @@ -34,15 +34,21 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware */ private final Collection reads = new ArrayList(BlockDelimitedReadShardStrategy.MAX_READS); + /** + * The filter to be applied to all reads meeting this criteria. + */ + private final SamRecordFilter filter; + /** * An BlockDelimitedLocusShard can be used either for READ or READ shard types. * Track which type is being used. */ private final Shard.ShardType shardType; - public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks, Shard.ShardType shardType) { + public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks, SamRecordFilter filter, Shard.ShardType shardType) { this.sourceInfo = sourceInfo; this.chunks = chunks; + this.filter = filter; this.shardType = shardType; } @@ -73,10 +79,18 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware reads.add(read); } + /** + * Creates an iterator over reads stored in this shard's read cache. 
+ * @return An iterator over the reads buffered in this shard. + */ public StingSAMIterator iterator() { return StingSAMIteratorAdapter.adapt(sourceInfo,reads.iterator()); } + public SamRecordFilter getFilter() { + return filter; + } + /** * Get the list of chunks delimiting this shard. * @return a list of chunks that contain data for this shard. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java index 506f3cecf..6a5591913 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.shards; import net.sf.samtools.*; +import net.sf.picard.filter.SamRecordFilter; import java.util.*; @@ -72,6 +73,8 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard."); Map> shardPosition = null; + SamRecordFilter filter = null; + if(!filePointers.isEmpty()) { boolean foundData = false; for(FilePointer filePointer: filePointers) { @@ -89,8 +92,10 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { } } } - if(foundData) + if(foundData) { + filter = new ReadOverlapFilter(filePointer.locations); break; + } } } else { @@ -98,9 +103,10 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { shardPosition = new HashMap>(); for(Map.Entry entry: position.entrySet()) shardPosition.put(entry.getKey(),Collections.singletonList(entry.getValue())); + filter = null; } - BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,Shard.ShardType.READ); + BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,filter,Shard.ShardType.READ); 
atEndOfStream = dataSource.fillShard(shard); this.position = dataSource.getCurrentPosition(); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java index 3b01cd7ec..389dd7001 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java @@ -2,11 +2,11 @@ package org.broadinstitute.sting.gatk.datasources.shards; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import net.sf.samtools.Chunk; import net.sf.samtools.SAMFileReader2; import net.sf.samtools.SAMRecord; +import net.sf.picard.filter.SamRecordFilter; import java.util.List; import java.util.Map; @@ -66,6 +66,14 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa this.shardType = shardType; } + /** + * Gets the chunks associated with this locus shard. + * @return A list of the chunks to use when retrieving locus data. + */ + public Map> getChunks() { + return chunks; + } + /** * Returns true if this shard is meant to buffer reads, rather * than just holding pointers to their locations. @@ -85,14 +93,18 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa */ public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + /** + * Gets the iterator over the elements cached in the shard. + * @return Never returns normally; this shard does not buffer reads, so the call always throws. + */ public StingSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } /** - * Gets the chunks associated with this locus shard. - * @return A list of the chunks to use when retrieving locus data. 
+ * Gets a filter testing for overlap of this read with the given shard. + * @return A filter capable of filtering out reads outside a given shard. */ - public Map> getChunks() { - return chunks; + public SamRecordFilter getFilter() { + return new ReadOverlapFilter(loci); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java similarity index 81% rename from java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java rename to java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java index 827f69938..d4544a8bd 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadOverlapFilter.java @@ -1,9 +1,8 @@ -package org.broadinstitute.sting.gatk.datasources.simpleDataSources; +package org.broadinstitute.sting.gatk.datasources.shards; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.List; @@ -13,13 +12,13 @@ import java.util.List; * @author mhanna * @version 0.1 */ -public class IntervalOverlappingFilter implements SamRecordFilter { +public class ReadOverlapFilter implements SamRecordFilter { /** * The list of locations containing reads to keep. 
*/ private final List intervals; - public IntervalOverlappingFilter(List intervals) { + public ReadOverlapFilter(List intervals) { this.intervals = intervals; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index 8c89634d8..0b369c9da 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -198,8 +198,8 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. CloseableIterator iterator = new MergingSamRecordIterator(headerMerger,readerToIteratorMap,true); - if(addIntervalFilter) - iterator = new FilteringIterator(iterator,new IntervalOverlappingFilter(shard.getGenomeLocs())); + if(shard.getFilter() != null) + iterator = new FilteringIterator(iterator,shard.getFilter()); return applyDecoratingIterators(enableVerification, StingSAMIteratorAdapter.adapt(reads,iterator),