diff --git a/java/src/net/sf/samtools/BAMChunkIterator.java b/java/src/net/sf/samtools/BAMChunkIterator.java index 017ae6e61..903a067bd 100644 --- a/java/src/net/sf/samtools/BAMChunkIterator.java +++ b/java/src/net/sf/samtools/BAMChunkIterator.java @@ -36,6 +36,7 @@ public class BAMChunkIterator implements Iterator { this.blockIterator = blockIterator; this.prefetchedSegments = new LinkedList(); this.filters = new PriorityQueue(filters); + seedNextSegments(); } /** diff --git a/java/src/net/sf/samtools/BAMFileHeaderLoader.java b/java/src/net/sf/samtools/BAMFileHeaderLoader.java index d1a156874..0b39ddd33 100644 --- a/java/src/net/sf/samtools/BAMFileHeaderLoader.java +++ b/java/src/net/sf/samtools/BAMFileHeaderLoader.java @@ -8,6 +8,8 @@ import java.io.File; import java.io.IOException; import java.io.DataInputStream; import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; /** * Loads a BAM file header from an file, optionally providing its position @@ -27,6 +29,8 @@ public class BAMFileHeaderLoader { */ private final Chunk location; + public static final Chunk preambleLocation = new Chunk(0<<16 | 0, 0<<16 | 3); + /** * Load the header from the given file. * @param header the parsed haeder for the BAM file. @@ -69,9 +73,55 @@ public class BAMFileHeaderLoader { headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath()); + // directly copied from BAMFileReader... + final int sequenceCount = binaryCodec.readInt(); + if (header.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != header.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + header.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(binaryCodec,file); + final SAMSequenceRecord sequenceRecord = header.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + binaryCodec); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + binaryCodec); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(binaryCodec,file)); + } + header.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } inputStream.close(); - return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer())); + return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer()-1)); } + /** + * Reads a single binary sequence record from the file or stream + * @param binaryCodec stream to read from. + * @param file Note that this is used only for reporting errors. + * @return an individual sequence record. + */ + private static SAMSequenceRecord readSequenceRecord(final BinaryCodec binaryCodec, final File file) { + final int nameLength = binaryCodec.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file.getAbsolutePath()); + } + final String sequenceName = binaryCodec.readString(nameLength - 1); + // Skip the null terminator + binaryCodec.readByte(); + final int sequenceLength = binaryCodec.readInt(); + return new SAMSequenceRecord(sequenceName, sequenceLength); + } } diff --git a/java/src/net/sf/samtools/BAMFileReader2.java b/java/src/net/sf/samtools/BAMFileReader2.java index a1e8708e4..9489468c5 100644 --- a/java/src/net/sf/samtools/BAMFileReader2.java +++ b/java/src/net/sf/samtools/BAMFileReader2.java @@ -490,10 +490,7 @@ class BAMFileReader2 mFilePointerLimit = endOffset; } // Pull next record from stream - final SAMRecord record = super.getNextRecord(); - if (record == null) { - return null; - } + return super.getNextRecord(); } } } diff --git a/java/src/net/sf/samtools/BlockSegment.java b/java/src/net/sf/samtools/BlockSegment.java index be8fd146b..eb6082e44 100644 --- a/java/src/net/sf/samtools/BlockSegment.java +++ b/java/src/net/sf/samtools/BlockSegment.java @@ -51,7 +51,7 @@ class BlockSegment { * @return the chunk equivalent of this block. */ public Chunk toChunk() { - return new Chunk(position << 16 & blockStart,position << 16 & blockStop); + return new Chunk(position << 16 | blockStart,position << 16 | blockStop); } /** diff --git a/java/src/net/sf/samtools/Chunk.java b/java/src/net/sf/samtools/Chunk.java index 1d6abdc2a..fd7ff4ea9 100644 --- a/java/src/net/sf/samtools/Chunk.java +++ b/java/src/net/sf/samtools/Chunk.java @@ -12,7 +12,7 @@ import java.util.ArrayList; * @author mhanna * @version 0.1 */ -class Chunk implements Comparable { +public class Chunk implements Comparable { private long mChunkStart; private long mChunkEnd; diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ffc560bd9..b07d01b3d 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -26,13 +26,14 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.shards.Shard; @@ -329,11 +330,11 @@ public class GenomeAnalysisEngine { public List> getSamplesByReaders() { - SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + Collection readers = getDataSource().getReaders(); - List> sample_sets = new ArrayList>(hm.getReaders().size()); + List> sample_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : hm.getReaders()) { + for (SAMFileReader r : readers) { Set samples = new HashSet(1); sample_sets.add(samples); @@ -358,11 +359,11 @@ public class GenomeAnalysisEngine { public List> getLibrariesByReaders() { - SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + Collection readers = getDataSource().getReaders(); - List> lib_sets = new ArrayList>(hm.getReaders().size()); + List> lib_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : hm.getReaders()) { + for (SAMFileReader r : readers) { Set libs = new HashSet(2); lib_sets.add(libs); @@ -387,20 +388,20 @@ public class GenomeAnalysisEngine { public List> getMergedReadGroupsByReaders() { - SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + Collection readers = getDataSource().getReaders(); - List> rg_sets = new ArrayList>(hm.getReaders().size()); + List> rg_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : hm.getReaders()) { + for (SAMFileReader r : readers) { Set groups = new HashSet(5); rg_sets.add(groups); for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) { - if (hm.hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: + if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: // use HeaderMerger to translate original read group id from the reader into the read group id in the // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(hm.getReadGroupId(r, g.getReadGroupId())); + groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId())); } else { // otherwise, pass through the unmapped read groups since this is what Picard does as well groups.add(g.getReadGroupId()); @@ -609,26 +610,29 @@ public class GenomeAnalysisEngine { ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; - shardStrategy = ShardStrategyFactory.shatter(shardType, + shardStrategy = ShardStrategyFactory.shatter(readsDataSource, + shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, intervals, maxIterations); } else - shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, + shardStrategy = ShardStrategyFactory.shatter(readsDataSource,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); } else if (walker instanceof ReadWalker || walker instanceof DuplicateWalker) { - - shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS; + if(argCollection.experimentalSharding) + shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; + else + shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS; if (intervals != null && !intervals.isEmpty()) { - shardStrategy = ShardStrategyFactory.shatter(shardType, + shardStrategy = ShardStrategyFactory.shatter(readsDataSource,shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, intervals, maxIterations); } else { - shardStrategy = ShardStrategyFactory.shatter(shardType, + shardStrategy = ShardStrategyFactory.shatter(readsDataSource,shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); } @@ -636,7 +640,8 @@ public class GenomeAnalysisEngine { if ((intervals == null || intervals.isEmpty()) && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)) Utils.warnUser("walker is of type LocusWindow (which operates over intervals), but no intervals were provided." + "This may be unintentional, check your command-line arguments."); - shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, + shardStrategy = ShardStrategyFactory.shatter(readsDataSource, + ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, intervals, maxIterations); @@ -657,7 +662,11 @@ public class GenomeAnalysisEngine { if (reads.getReadsFiles().size() == 0) return null; - SAMDataSource dataSource = new SAMDataSource(reads); + SAMDataSource dataSource = null; + if(argCollection.experimentalSharding) + dataSource = new BlockDrivenSAMDataSource(reads); + else + dataSource = new IndexDrivenSAMDataSource(reads); return dataSource; } diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 63ce86b8c..6b6766aac 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -146,6 +146,9 @@ public class GATKArgumentCollection { @Argument(fullName = "enableRodWalkers", shortName = "erw", doc = "Enable experimental rodWalker support. TEMPORARY HACK TO ALLOW EXPERIMENTATION WITH ROD WALKERS. [default is false]}.", required = false) public boolean enableRodWalkers = false; + @Element(required = false) + @Argument(fullName = "experimental_sharding",shortName="es", doc="Use the experimental sharding strategy. Will not work for all traversal types.", required = false) + public boolean experimentalSharding = false; /** * marshal the data out to a object diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java new file mode 100644 index 000000000..8d9b1b287 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.gatk.datasources.shards; + +import net.sf.samtools.Chunk; + +import java.util.List; + +/** + * Expresses a shard of read data in block format. + * + * @author mhanna + * @version 0.1 + */ +public class BlockDelimitedReadShard extends ReadShard { + /** + * The list of chunks to retrieve when loading this shard. + */ + private final List chunks; + + public BlockDelimitedReadShard(List chunks) { + this.chunks = chunks; + } + + /** + * Get the list of chunks delimiting this shard. + * @return a list of chunks that contain data for this shard. + */ + public List getChunks() { + return chunks; + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for(Chunk chunk : chunks) { + sb.append(chunk); + sb.append(' '); + } + return sb.toString(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java new file mode 100644 index 000000000..f84c1997c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java @@ -0,0 +1,101 @@ +package org.broadinstitute.sting.gatk.datasources.shards; + +import net.sf.samtools.Chunk; +import net.sf.samtools.BAMFileHeaderLoader; +import net.sf.samtools.BAMChunkIterator; +import net.sf.samtools.BAMBlockIterator; + +import java.util.*; +import java.io.File; +import java.io.IOException; + +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; + +/** + * A read shard strategy that delimits based on the number of + * blocks in the BAM file. + * + * @author mhanna + * @version 0.1 + */ +public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { + /** + * Number of blocks in a given shard. + */ + protected int blockCount = 100; + + /** + * The actual chunks streaming into the file. + */ + private final BAMChunkIterator chunkIterator; + + /** + * The data backing the next chunks to deliver to the traversal engine. + */ + private final List nextChunks; + + /** + * Create a new read shard strategy, loading read shards from the given BAM file. + * @param dataSource Data source from which to load shards. + */ + public BlockDelimitedReadShardStrategy(SAMDataSource dataSource) { + if(dataSource.getReadsInfo().getReadsFiles().size() > 1) + throw new UnsupportedOperationException("Experimental sharding only works with a single BAM at the moment."); + File bamFile = dataSource.getReadsInfo().getReadsFiles().get(0); + try { + Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation(); + chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile),Arrays.asList(BAMFileHeaderLoader.preambleLocation,headerLocation)); + } + catch(IOException ex) { + throw new StingException("Unable to open BAM file for sharding."); + } + nextChunks = new ArrayList(); + + advance(); + } + + /** + * do we have another read shard? + * @return True if any more data is available. False otherwise. + */ + public boolean hasNext() { + return nextChunks.size() > 0; + } + + /** + * Retrieves the next shard, if available. + * @return The next shard, if available. + * @throws NoSuchElementException if no such shard is available. + */ + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard."); + Shard shard = new BlockDelimitedReadShard(Collections.unmodifiableList(new ArrayList(nextChunks))); + advance(); + return shard; + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Remove not supported"); + } + + /** + * Convenience method for using ShardStrategy in an foreach loop. + * @return A iterator over shards. + */ + public Iterator iterator() { + return this; + } + + private void advance() { + nextChunks.clear(); + int chunksCopied = 0; + + while(chunksCopied++ < blockCount && chunkIterator.hasNext()) + nextChunks.add(chunkIterator.next()); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalShard.java index 1b56727ee..73b94f120 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalShard.java @@ -62,4 +62,13 @@ public class IntervalShard implements Shard { public Shard.ShardType getShardType() { return mType; } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + return mSet.toString(); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShard.java new file mode 100644 index 000000000..03a5cc4d7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShard.java @@ -0,0 +1,51 @@ +package org.broadinstitute.sting.gatk.datasources.shards; + +/** + * A read shard delimited by an actual read count, rather than blocks or any other + * physical mapping of the BAM file. + * + * @author mhanna + * @version 0.1 + */ +public class ReadDelimitedReadShard extends ReadShard { + // the count of the reads we want to copy off + private int size = 0; + + /** + * our tie in for the shard strategy. This allows us to signal to the shard + * strategy that we've finished process, so it can indicate that we're out of reads + */ + private final ReadDelimitedReadShardStrategy strat; + + /** + * create a read shard, given a read size + * @param strat The sharding strategy used to create this shard. + * @param size Size of the shard, in reads. + */ + ReadDelimitedReadShard(ReadDelimitedReadShardStrategy strat, int size) { + this.size = size; + this.strat = strat; + } + + /** @return the genome location represented by this shard */ + public int getSize() { + return size; + } + + /** + * this method is used as a backend, to signal to the sharding strategy that we've + * finished processing. When we move to a more read-aware bam system this method could disappear. + */ + public void signalDone() { + strat.signalDone(); + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + return String.format("%d reads", size); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShardStrategy.java new file mode 100644 index 000000000..149316ff9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadDelimitedReadShardStrategy.java @@ -0,0 +1,86 @@ +package org.broadinstitute.sting.gatk.datasources.shards; + +import java.util.Iterator; + +/** + * A shard strategy that breaks up shards based on how many reads are + * in each. + * + * @author mhanna + * @version 0.1 + */ +public class ReadDelimitedReadShardStrategy extends ReadShardStrategy { + // our read bucket size, default + protected long readCount = 100000L; + + // our hasnext flag + boolean hasNext = true; + + // our limiting factor + long limitedSize = -1; + boolean stopDueToLimitingFactor = false; + + /** + * the default constructor + * @param size the read count to iterate over + * @param limitedSize limit the shard to this length + */ + ReadDelimitedReadShardStrategy(long size, long limitedSize) { + readCount = size; + this.limitedSize = limitedSize; + } + + /** + * do we have another read shard? + * @return + */ + public boolean hasNext() { + if (stopDueToLimitingFactor) { + return false; + } + return hasNext; + } + + public Shard next() { + if (limitedSize > 0) { + if (limitedSize > readCount) { + limitedSize = limitedSize - readCount; + } + else { + readCount = limitedSize; + limitedSize = 0; + stopDueToLimitingFactor = true; + } + } + + return new ReadDelimitedReadShard(this,(int)readCount); + } + + public void remove() { + throw new UnsupportedOperationException("Remove not supported"); + } + + public Iterator iterator() { + return this; + } + + /** + * set the next shards size + * + * @param size adjust the next size to this + */ + public void adjustNextShardSize(long size) { + readCount = size; + } + + + /** + * this function is a work-around for the fact that + * we don't know when we're out of reads until the SAM data source + * tells us so. + */ + public void signalDone() { + hasNext = false; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java index 413dc27a7..8eb93c72d 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java @@ -18,68 +18,16 @@ import org.broadinstitute.sting.utils.GenomeLoc; * */ - /** + * the base class for read shards. * @author aaron - *

- * ReadShard - *

- * the base class for read shards. */ -public class ReadShard implements Shard { - - // the count of the reads we want to copy off - private int size = 0; - - /** - * our tie in for the shard strategy. This allows us to signal to the shard - * strategy that we've finished process, so it can indicate that we're out of reads - */ - private final ReadShardStrategy str; - - // the reference back to our read shard strategy - private final ReadShardStrategy strat; - - /** - * create a read shard, given a read size - * - * @param size - */ - ReadShard(int size, ReadShardStrategy strat) { - this.str = null; - this.size = size; - this.strat = strat; - } - - /** - * create a read shard, given a read size - * - * @param size - */ - ReadShard(ReadShardStrategy caller, int size, ReadShardStrategy strat) { - this.str = caller; - this.size = size; - this.strat = strat; - } - +public abstract class ReadShard implements Shard { /** @return the genome location represented by this shard */ public GenomeLoc getGenomeLoc() { throw new UnsupportedOperationException("ReadShard isn't genome loc aware"); } - /** @return the genome location represented by this shard */ - public int getSize() { - return size; - } - - /** - * this method is used as a backend, to signal to the sharding strategy that we've - * finished processing. When we move to a more read-aware bam system this method could disappear. - */ - public void signalDone() { - strat.signalDone(); - } - /** * what kind of shard do we return * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java index 9d5d0bb83..82c1586f4 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java @@ -40,85 +40,12 @@ import java.util.Iterator; * The sharding strategy for reads using a simple counting mechanism. Each read shard * has a specific number of reads (default to 100K) which is configured in the constructor. */ -public class ReadShardStrategy implements ShardStrategy { +public abstract class ReadShardStrategy implements ShardStrategy { // do we use unmapped reads in the sharding strategy private boolean unMappedReads = true; - // our read bucket size, default - protected long readCount = 100000L; - - // our sequence dictionary - final private SAMSequenceDictionary dic; - - // our hasnext flag - boolean hasNext = true; - - // our limiting factor - long limitedSize = -1; - boolean stopDueToLimitingFactor = false; - - /** - * the default constructor - * @param dic the sequence dictionary to use - * @param size the read count to iterate over - */ - ReadShardStrategy(SAMSequenceDictionary dic, long size, long limitedSize) { - this.dic = dic; - readCount = size; - this.limitedSize = limitedSize; - } - - /** - * do we have another read shard? - * @return - */ - public boolean hasNext() { - if (stopDueToLimitingFactor) { - return false; - } - return hasNext; - } - - public Shard next() { - if (limitedSize > 0) { - if (limitedSize > readCount) { - limitedSize = limitedSize - readCount; - } - else { - readCount = limitedSize; - limitedSize = 0; - stopDueToLimitingFactor = true; - } - } - - return new ReadShard((int)readCount, this); - } - - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - public Iterator iterator() { return this; } - - /** - * set the next shards size - * - * @param size adjust the next size to this - */ - public void adjustNextShardSize(long size) { - readCount = size; - } - - - /** - * this function is a work-around for the fact that - * we don't know when we're out of reads until the SAM data source - * tells us so. - */ - public void signalDone() { - hasNext = false; - } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategy.java index 8a57e57bf..c454a6aca 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategy.java @@ -28,13 +28,4 @@ import java.util.Iterator; * class, but not this will be an interface to accomidate read based sharding */ public interface ShardStrategy extends Iterator, Iterable { - - /** - * set the next shards size - * - * @param size adjust the next size to this - */ - public abstract void adjustNextShardSize(long size); - - } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java index e66aceabc..dfcea239b 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java @@ -4,6 +4,9 @@ import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; + +import java.io.File; /** * @@ -37,6 +40,7 @@ public class ShardStrategyFactory { LINEAR, EXPONENTIAL, READS, + READS_EXPERIMENTAL, INTERVAL, MONOLITHIC // Put all of the available data into one shard. } @@ -48,31 +52,35 @@ public class ShardStrategyFactory { /** * get a new shatter strategy * + * @param dataSource File pointer to BAM. TODO: Eliminate this argument; pass a data source instead! * @param strat what's our strategy - SHATTER_STRATEGY type * @param dic the seq dictionary * @param startingSize the starting size - * @return + * @return a shard strategy capable of dividing input data into shards. */ - static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) { - return ShardStrategyFactory.shatter(strat, dic, startingSize, -1L); + static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) { + return ShardStrategyFactory.shatter(dataSource, strat, dic, startingSize, -1L); } /** * get a new shatter strategy * + * @param dataSource File pointer to BAM. * @param strat what's our strategy - SHATTER_STRATEGY type * @param dic the seq dictionary * @param startingSize the starting size - * @return + * @return a shard strategy capable of dividing input data into shards. */ - static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, long limitByCount) { + static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, long limitByCount) { switch (strat) { case LINEAR: return new LinearLocusShardStrategy(dic, startingSize, limitByCount); case EXPONENTIAL: return new ExpGrowthLocusShardStrategy(dic, startingSize, limitByCount); case READS: - return new ReadShardStrategy(dic, startingSize, limitByCount); + return new ReadDelimitedReadShardStrategy(startingSize, limitByCount); + case READS_EXPERIMENTAL: + return new BlockDelimitedReadShardStrategy(dataSource); case INTERVAL: throw new StingException("Requested trategy: " + strat + " doesn't work with the limiting count (-M) command line option"); default: @@ -90,8 +98,8 @@ public class ShardStrategyFactory { * @param startingSize the starting size * @return */ - static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) { - return ShardStrategyFactory.shatter(strat, dic, startingSize, lst, -1l); + static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) { + return ShardStrategyFactory.shatter(dataSource, strat, dic, startingSize, lst, -1l); } @@ -103,7 +111,7 @@ public class ShardStrategyFactory { * @param startingSize the starting size * @return */ - static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst, long limitDataCount) { + static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst, long limitDataCount) { switch (strat) { case LINEAR: return new LinearLocusShardStrategy(dic, startingSize, lst, limitDataCount); @@ -113,6 +121,8 @@ public class ShardStrategyFactory { return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.LOCUS_INTERVAL); case READS: return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.READ_INTERVAL); + case READS_EXPERIMENTAL: + throw new UnsupportedOperationException("Cannot do experimental read sharding with intervals"); default: throw new StingException("Strategy: " + strat + " isn't implemented"); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java new file mode 100644 index 000000000..cc89bbab0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import org.broadinstitute.sting.gatk.datasources.shards.Shard; +import org.broadinstitute.sting.gatk.datasources.shards.BlockDelimitedReadShard; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader2; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + +import java.util.Collection; +import java.io.File; + +/** + * An iterator that's aware of how data is stored on disk in SAM format. + * + * @author mhanna + * @version 0.1 + */ +public class BlockDrivenSAMDataSource extends SAMDataSource { + + private final SAMFileReader2 reader; + /** + * Create a new block-aware SAM data source given the supplied read metadata. + * @param reads The read metadata. + */ + public BlockDrivenSAMDataSource(Reads reads) { + super(reads); + + if(reads.getReadsFiles().size() > 1) + throw new StingException("Experimental sharding strategy cannot handle multiple BAM files at this point."); + + File readsFile = reads.getReadsFiles().get(0); + reader = new SAMFileReader2(readsFile); + } + + public boolean hasIndex() { + return reader.hasIndex(); + } + + public StingSAMIterator seek(Shard shard) { + if(!(shard instanceof BlockDelimitedReadShard)) + throw new StingException("Currently unable to operate on types other than block delimited read shards."); + CloseableIterator iterator = reader.iterator(((BlockDelimitedReadShard)shard).getChunks()); + return applyDecoratingIterators(true, + StingSAMIteratorAdapter.adapt(reads, iterator), + reads.getDownsamplingFraction(), + reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + reads.getSupplementalFilters()); + } + + /** + * Gets the merged header from the SAM file. + * @return The merged header. + */ + public SAMFileHeader getHeader() { + return reader.getFileHeader(); + } + + /** + * Currently unsupported. + * @return + */ + public Collection getReaders() { + throw new StingException("Currently unable to get readers for shard-based fields."); + } + + /** + * No read group collisions at this time because only one SAM file is currently supported. + * @return False always. + */ + public boolean hasReadGroupCollisions() { + return false; + } + + /** + * Currently unsupported. + * @return + */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + throw new UnsupportedOperationException("Getting read group ID from this experimental SAM reader is not currently supported."); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java new file mode 100644 index 000000000..b5d002ddb --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -0,0 +1,423 @@ +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.filter.FilteringIterator; +import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.sam.SamFileHeaderMerger; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; +import org.broadinstitute.sting.gatk.datasources.shards.Shard; +import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard; +import org.broadinstitute.sting.gatk.datasources.shards.ReadDelimitedReadShard; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; + +import java.io.File; +import java.util.Collection; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * Converts shards to SAM iterators over the specified region + */ +public class IndexDrivenSAMDataSource extends SAMDataSource { + // used for the reads case, the last count of reads retrieved + long readsTaken = 0; + + // our last genome loc position + protected GenomeLoc lastReadPos = null; + + // do we take unmapped reads + private boolean includeUnmappedReads = true; + + // reads based traversal variables + private boolean intoUnmappedReads = false; + private int readsSeenAtLastPos = 0; + + /** + * A histogram of exactly what reads were removed from the input stream and why. + */ + private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); + + // A pool of SAM iterators. + private SAMResourcePool resourcePool = null; + + private GenomeLoc mLastInterval = null; + + /** + * Returns a histogram of reads that were screened out, grouped by the nature of the error. + * @return Histogram of reads. Will not be null. + */ + public SAMReadViolationHistogram getViolationHistogram() { + return violations; + } + + /** + * constructor, given sam files + * + * @param reads the list of sam files + */ + public IndexDrivenSAMDataSource( Reads reads ) throws SimpleDataSourceLoadException { + super(reads); + resourcePool = new SAMResourcePool(reads); + } + + /** + * Do all BAM files backing this data source have an index? The case where hasIndex() is false + * is supported, but only in a few extreme cases. + * @return True if an index is present; false otherwise. + */ + public boolean hasIndex() { + return resourcePool.hasIndex; + } + + /** + * Gets the (potentially merged) SAM file header. + * + * @return SAM file header. + */ + public SAMFileHeader getHeader() { + return resourcePool.getHeader(); + } + + + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public Reads getReadsInfo() { return reads; } + + /** + * Returns header merger: a class that keeps the mapping between original read groups and read groups + * of the merged stream; merger also provides access to the individual file readers (and hence headers + * prior to the merging too) maintained by the system. + * @return + */ + public Collection getReaders() { return resourcePool.getHeaderMerger().getReaders(); } + + /** Returns true if there are read group duplicates within the merged headers. */ + public boolean hasReadGroupCollisions() { + return resourcePool.getHeaderMerger().hasReadGroupCollisions(); + } + + /** Returns the read group id that should be used for the input read and RG id. */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + return resourcePool.getHeaderMerger().getReadGroupId(reader,originalReadGroupId); + } + + /** + * + * @param shard the shard to get data for + * + * @return an iterator for that region + */ + public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException { + // setup the iterator pool if it's not setup + boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? false : true; + resourcePool.setQueryOverlapping(queryOverlapping); + + StingSAMIterator iterator = null; + if (shard.getShardType() == Shard.ShardType.READ) { + iterator = seekRead(shard); + iterator = applyDecoratingIterators(true, + iterator, + reads.getDownsamplingFraction(), + reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + reads.getSupplementalFilters()); + } else if (shard.getShardType() == Shard.ShardType.LOCUS) { + iterator = seekLocus(shard); + iterator = applyDecoratingIterators(false, + iterator, + reads.getDownsamplingFraction(), + reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + reads.getSupplementalFilters()); + } else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) || + (shard.getShardType() == Shard.ShardType.READ_INTERVAL)) { + iterator = seekLocus(shard); + iterator = applyDecoratingIterators(false, + iterator, + reads.getDownsamplingFraction(), + reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + reads.getSupplementalFilters()); + + // add the new overlapping detection iterator, if we have a last interval and we're a read based shard + if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) + iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false)); + mLastInterval = shard.getGenomeLoc(); + } else { + + throw new StingException("seek: Unknown shard type"); + } + + return iterator; + } + + + /** + *

+ * seekLocus + *

+ * + * @param shard the shard containing the genome location to extract data for + * + * @return an iterator for that region + */ + private StingSAMIterator seekLocus( Shard shard ) throws SimpleDataSourceLoadException { + if(shard instanceof MonolithicShard) + return createIterator(new EntireStream()); + + if( getHeader().getSequenceDictionary().getSequences().size() == 0 ) + throw new StingException("Unable to seek to the given locus; reads data source has no alignment information."); + return createIterator( new MappedStreamSegment(shard.getGenomeLoc()) ); + } + + /** + *

+ * seek + *

+ * + * @param shard the read shard to extract from + * + * @return an iterator for that region + */ + private StingSAMIterator seekRead( Shard shard ) throws SimpleDataSourceLoadException { + if(shard instanceof MonolithicShard) + return createIterator(new EntireStream()); + + ReadDelimitedReadShard readShard = (ReadDelimitedReadShard)shard; + StingSAMIterator iter = null; + + // If there are no entries in the sequence dictionary, there can't possibly be any unmapped reads. Force state to 'unmapped'. + if( isSequenceDictionaryEmpty() ) + intoUnmappedReads = true; + + if (!intoUnmappedReads) { + if (lastReadPos == null) { + lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); + iter = createIterator(new MappedStreamSegment(lastReadPos)); + return InitialReadIterator(readShard.getSize(), iter); + } else { + lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); + iter = fastMappedReadSeek(readShard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos)))); + } + + if (intoUnmappedReads && !includeUnmappedReads) + readShard.signalDone(); + } + + if (intoUnmappedReads && includeUnmappedReads) { + if (iter != null) + iter.close(); + iter = toUnmappedReads(readShard.getSize()); + if (!iter.hasNext()) + readShard.signalDone(); + } + + return iter; + } + + /** + * If we're in by-read mode, this indicates if we want + * to see unmapped reads too. Only seeing mapped reads + * is much faster, but most BAM files have significant + * unmapped read counts. + * + * @param seeUnMappedReads true to see unmapped reads, false otherwise + */ + public void viewUnmappedReads( boolean seeUnMappedReads ) { + includeUnmappedReads = seeUnMappedReads; + } + + /** + * For unit testing, add a custom iterator pool. + * + * @param resourcePool Custom mock iterator pool. + */ + void setResourcePool( SAMResourcePool resourcePool ) { + this.resourcePool = resourcePool; + } + + /** + * Retrieve unmapped reads. + * + * @param readCount how many reads to retrieve + * + * @return the bounded iterator that you can use to get the intervaled reads from + */ + StingSAMIterator toUnmappedReads( long readCount ) { + StingSAMIterator iter = createIterator(new UnmappedStreamSegment(readsTaken, readCount)); + readsTaken += readCount; + return iter; + } + + + /** + * A seek function for mapped reads. + * + * @param readCount how many reads to retrieve + * @param iter the iterator to use, seeked to the correct start location + * + * @return the bounded iterator that you can use to get the intervaled reads from. Will be a zero-length + * iterator if no reads are available. + * @throws SimpleDataSourceLoadException + */ + StingSAMIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException { + BoundedReadIterator bound; + correctForReadPileupSeek(iter); + if (readsTaken == 0) { + return InitialReadIterator(readCount, iter); + } + int x = 0; + SAMRecord rec = null; + + // Assuming that lastReadPos should never be null, because this is a mappedReadSeek + // and initial queries are handled by the previous conditional. + int lastContig = lastReadPos.getContigIndex(); + int lastPos = (int)lastReadPos.getStart(); + + while (x < readsTaken) { + if (iter.hasNext()) { + rec = iter.next(); + if (lastContig == rec.getReferenceIndex() && lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos; + else this.readsSeenAtLastPos = 1; + lastPos = rec.getAlignmentStart(); + ++x; + } else { + iter.close(); + + // jump contigs + lastReadPos = GenomeLocParser.toNextContig(lastReadPos); + if (lastReadPos == null) { + // check to see if we're using unmapped reads, if not return, we're done + readsTaken = 0; + intoUnmappedReads = true; + + // fastMappedReadSeek must return an iterator, even if that iterator iterates through nothing. + return new NullSAMIterator(reads); + } else { + readsTaken = readCount; + readsSeenAtLastPos = 0; + lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); + CloseableIterator ret = createIterator(new MappedStreamSegment(lastReadPos)); + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount); + } + } + } + + // if we're off the end of the last contig (into unmapped territory) + if (rec != null && rec.getAlignmentStart() == 0) { + readsTaken += readCount; + intoUnmappedReads = true; + } + // else we're not off the end, store our location + else if (rec != null) { + int stopPos = rec.getAlignmentStart(); + if (stopPos < lastReadPos.getStart()) { + lastReadPos = GenomeLocParser.createGenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos); + } else { + lastReadPos = GenomeLocParser.setStart(lastReadPos,rec.getAlignmentStart()); + } + } + // in case we're run out of reads, get out + else { + throw new StingException("Danger: weve run out reads in fastMappedReadSeek"); + //return null; + } + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + + + // return the iterator + return bound; + } + + /** + * Determines whether the BAM file is completely unsequenced. Requires that the resource pool be initialized. + * @return True if the sequence dictionary is completely empty. False otherwise. + */ + private boolean isSequenceDictionaryEmpty() { + return getHeader().getSequenceDictionary().isEmpty(); + } + + /** + * Even though the iterator has seeked to the correct location, there may be multiple reads at that location, + * and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable + * + * @param iter the iterator + */ + private void correctForReadPileupSeek( StingSAMIterator iter ) { + // move the number of reads we read from the last pos + boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.) + for(int i = 0; i < this.readsSeenAtLastPos && iter.hasNext(); i++,iter.next()) + atLeastOneReadSeen = true; + if (readsSeenAtLastPos > 0 && !atLeastOneReadSeen) { + throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0"); + } + } + + + /** + * set the initial iterator + * + * @param readCount the number of reads + * @param iter the merging iterator + * + * @return a bounded read iterator at the first read position in the file. + */ + private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator iter ) { + BoundedReadIterator bound; + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + this.readsTaken = readCount; + return bound; + } + + /** + * Creates an iterator over the selected segment, from a resource pulled from the pool. + * @param segment Segment over which to gather reads. + * @return An iterator over just the reads in the given segment. + */ + private StingSAMIterator createIterator( DataStreamSegment segment ) { + StingSAMIterator iterator = resourcePool.iterator(segment); + StingSAMIterator malformedWrappedIterator = new MalformedSAMFilteringIterator( getHeader(), iterator, violations ); + StingSAMIterator readWrappingIterator = new ReadWrappingIterator(malformedWrappedIterator); + return readWrappingIterator; + } +} + + diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index b8e405374..55ce30957 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -2,21 +2,17 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; import net.sf.samtools.util.CloseableIterator; import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; -import net.sf.picard.sam.SamFileHeaderMerger; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; -import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; @@ -54,36 +50,21 @@ import java.util.Collection; *

* Converts shards to SAM iterators over the specified region */ -public class SAMDataSource implements SimpleDataSource { +public abstract class SAMDataSource implements SimpleDataSource { /** Backing support for reads. */ - private final Reads reads; + protected final Reads reads; /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(SAMDataSource.class); - // used for the reads case, the last count of reads retrieved - long readsTaken = 0; - - // our last genome loc position - protected GenomeLoc lastReadPos = null; - // do we take unmapped reads - private boolean includeUnmappedReads = true; - - // reads based traversal variables - private boolean intoUnmappedReads = false; - private int readsSeenAtLastPos = 0; + protected boolean includeUnmappedReads = true; /** * A histogram of exactly what reads were removed from the input stream and why. */ - private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); - - // A pool of SAM iterators. - private SAMResourcePool resourcePool = null; - - private GenomeLoc mLastInterval = null; + protected SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); /** * Returns a histogram of reads that were screened out, grouped by the nature of the error. @@ -110,7 +91,6 @@ public class SAMDataSource implements SimpleDataSource { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); } } - resourcePool = new SAMResourcePool(reads); } /** @@ -118,18 +98,14 @@ public class SAMDataSource implements SimpleDataSource { * is supported, but only in a few extreme cases. * @return True if an index is present; false otherwise. */ - public boolean hasIndex() { - return resourcePool.hasIndex; - } + public abstract boolean hasIndex(); /** * Gets the (potentially merged) SAM file header. * * @return SAM file header. */ - public SAMFileHeader getHeader() { - return resourcePool.getHeader(); - } + public abstract SAMFileHeader getHeader(); /** @@ -140,12 +116,15 @@ public class SAMDataSource implements SimpleDataSource { public Reads getReadsInfo() { return reads; } /** - * Returns header merger: a class that keeps the mapping between original read groups and read groups - * of the merged stream; merger also provides access to the individual file readers (and hence headers - * prior to the merging too) maintained by the system. - * @return + * Returns readers used by this data source. */ - public SamFileHeaderMerger getHeaderMerger() { return resourcePool.getHeaderMerger(); } + public abstract Collection getReaders(); + + /** Returns true if there are read group duplicates within the merged headers. */ + public abstract boolean hasReadGroupCollisions(); + + /** Returns the read group id that should be used for the input read and RG id. */ + public abstract String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId); /** * @@ -153,110 +132,7 @@ public class SAMDataSource implements SimpleDataSource { * * @return an iterator for that region */ - public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException { - // setup the iterator pool if it's not setup - boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? false : true; - resourcePool.setQueryOverlapping(queryOverlapping); - - StingSAMIterator iterator = null; - if (shard.getShardType() == Shard.ShardType.READ) { - iterator = seekRead(shard); - iterator = applyDecoratingIterators(true, - iterator, - reads.getDownsamplingFraction(), - reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - reads.getSupplementalFilters()); - } else if (shard.getShardType() == Shard.ShardType.LOCUS) { - iterator = seekLocus(shard); - iterator = applyDecoratingIterators(false, - iterator, - reads.getDownsamplingFraction(), - reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - reads.getSupplementalFilters()); - } else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) || - (shard.getShardType() == Shard.ShardType.READ_INTERVAL)) { - iterator = seekLocus(shard); - iterator = applyDecoratingIterators(false, - iterator, - reads.getDownsamplingFraction(), - reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - reads.getSupplementalFilters()); - - // add the new overlapping detection iterator, if we have a last interval and we're a read based shard - if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) - iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false)); - mLastInterval = shard.getGenomeLoc(); - } else { - - throw new StingException("seek: Unknown shard type"); - } - - return iterator; - } - - - /** - *

- * seekLocus - *

- * - * @param shard the shard containing the genome location to extract data for - * - * @return an iterator for that region - */ - private StingSAMIterator seekLocus( Shard shard ) throws SimpleDataSourceLoadException { - if(shard instanceof MonolithicShard) - return createIterator(new EntireStream()); - - if( getHeader().getSequenceDictionary().getSequences().size() == 0 ) - throw new StingException("Unable to seek to the given locus; reads data source has no alignment information."); - return createIterator( new MappedStreamSegment(shard.getGenomeLoc()) ); - } - - /** - *

- * seek - *

- * - * @param shard the read shard to extract from - * - * @return an iterator for that region - */ - private StingSAMIterator seekRead( Shard shard ) throws SimpleDataSourceLoadException { - if(shard instanceof MonolithicShard) - return createIterator(new EntireStream()); - - ReadShard readShard = (ReadShard)shard; - StingSAMIterator iter = null; - - // If there are no entries in the sequence dictionary, there can't possibly be any unmapped reads. Force state to 'unmapped'. - if( isSequenceDictionaryEmpty() ) - intoUnmappedReads = true; - - if (!intoUnmappedReads) { - if (lastReadPos == null) { - lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); - iter = createIterator(new MappedStreamSegment(lastReadPos)); - return InitialReadIterator(readShard.getSize(), iter); - } else { - lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); - iter = fastMappedReadSeek(readShard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos)))); - } - - if (intoUnmappedReads && !includeUnmappedReads) - readShard.signalDone(); - } - - if (intoUnmappedReads && includeUnmappedReads) { - if (iter != null) - iter.close(); - iter = toUnmappedReads(readShard.getSize()); - if (!iter.hasNext()) - readShard.signalDone(); - } - - return iter; - } + public abstract StingSAMIterator seek(Shard shard); /** * If we're in by-read mode, this indicates if we want @@ -270,160 +146,6 @@ public class SAMDataSource implements SimpleDataSource { includeUnmappedReads = seeUnMappedReads; } - /** - * For unit testing, add a custom iterator pool. - * - * @param resourcePool Custom mock iterator pool. - */ - void setResourcePool( SAMResourcePool resourcePool ) { - this.resourcePool = resourcePool; - } - - /** - * Retrieve unmapped reads. - * - * @param readCount how many reads to retrieve - * - * @return the bounded iterator that you can use to get the intervaled reads from - */ - StingSAMIterator toUnmappedReads( long readCount ) { - StingSAMIterator iter = createIterator(new UnmappedStreamSegment(readsTaken, readCount)); - readsTaken += readCount; - return iter; - } - - - /** - * A seek function for mapped reads. - * - * @param readCount how many reads to retrieve - * @param iter the iterator to use, seeked to the correct start location - * - * @return the bounded iterator that you can use to get the intervaled reads from. Will be a zero-length - * iterator if no reads are available. - * @throws SimpleDataSourceLoadException - */ - StingSAMIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException { - BoundedReadIterator bound; - correctForReadPileupSeek(iter); - if (readsTaken == 0) { - return InitialReadIterator(readCount, iter); - } - int x = 0; - SAMRecord rec = null; - - // Assuming that lastReadPos should never be null, because this is a mappedReadSeek - // and initial queries are handled by the previous conditional. - int lastContig = lastReadPos.getContigIndex(); - int lastPos = (int)lastReadPos.getStart(); - - while (x < readsTaken) { - if (iter.hasNext()) { - rec = iter.next(); - if (lastContig == rec.getReferenceIndex() && lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos; - else this.readsSeenAtLastPos = 1; - lastPos = rec.getAlignmentStart(); - ++x; - } else { - iter.close(); - - // jump contigs - lastReadPos = GenomeLocParser.toNextContig(lastReadPos); - if (lastReadPos == null) { - // check to see if we're using unmapped reads, if not return, we're done - readsTaken = 0; - intoUnmappedReads = true; - - // fastMappedReadSeek must return an iterator, even if that iterator iterates through nothing. - return new NullSAMIterator(reads); - } else { - readsTaken = readCount; - readsSeenAtLastPos = 0; - lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); - CloseableIterator ret = createIterator(new MappedStreamSegment(lastReadPos)); - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount); - } - } - } - - // if we're off the end of the last contig (into unmapped territory) - if (rec != null && rec.getAlignmentStart() == 0) { - readsTaken += readCount; - intoUnmappedReads = true; - } - // else we're not off the end, store our location - else if (rec != null) { - int stopPos = rec.getAlignmentStart(); - if (stopPos < lastReadPos.getStart()) { - lastReadPos = GenomeLocParser.createGenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos); - } else { - lastReadPos = GenomeLocParser.setStart(lastReadPos,rec.getAlignmentStart()); - } - } - // in case we're run out of reads, get out - else { - throw new StingException("Danger: weve run out reads in fastMappedReadSeek"); - //return null; - } - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); - - - // return the iterator - return bound; - } - - /** - * Determines whether the BAM file is completely unsequenced. Requires that the resource pool be initialized. - * @return True if the sequence dictionary is completely empty. False otherwise. - */ - private boolean isSequenceDictionaryEmpty() { - return getHeader().getSequenceDictionary().isEmpty(); - } - - /** - * Even though the iterator has seeked to the correct location, there may be multiple reads at that location, - * and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable - * - * @param iter the iterator - */ - private void correctForReadPileupSeek( StingSAMIterator iter ) { - // move the number of reads we read from the last pos - boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.) - for(int i = 0; i < this.readsSeenAtLastPos && iter.hasNext(); i++,iter.next()) - atLeastOneReadSeen = true; - if (readsSeenAtLastPos > 0 && !atLeastOneReadSeen) { - throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0"); - } - } - - - /** - * set the initial iterator - * - * @param readCount the number of reads - * @param iter the merging iterator - * - * @return a bounded read iterator at the first read position in the file. - */ - private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator iter ) { - BoundedReadIterator bound; - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); - this.readsTaken = readCount; - return bound; - } - - /** - * Creates an iterator over the selected segment, from a resource pulled from the pool. - * @param segment Segment over which to gather reads. - * @return An iterator over just the reads in the given segment. - */ - private StingSAMIterator createIterator( DataStreamSegment segment ) { - StingSAMIterator iterator = resourcePool.iterator(segment); - StingSAMIterator malformedWrappedIterator = new MalformedSAMFilteringIterator( getHeader(), iterator, violations ); - StingSAMIterator readWrappingIterator = new ReadWrappingIterator(malformedWrappedIterator); - return readWrappingIterator; - } - /** * Filter reads based on user-specified criteria. * @@ -434,11 +156,11 @@ public class SAMDataSource implements SimpleDataSource { * @param supplementalFilters additional filters to apply to the reads. * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. */ - private StingSAMIterator applyDecoratingIterators(boolean enableVerification, - StingSAMIterator wrappedIterator, - Double downsamplingFraction, - Boolean noValidationOfReadOrder, - Collection supplementalFilters) { + protected StingSAMIterator applyDecoratingIterators(boolean enableVerification, + StingSAMIterator wrappedIterator, + Double downsamplingFraction, + Boolean noValidationOfReadOrder, + Collection supplementalFilters) { // NOTE: this (and other filtering) should be done before on-the-fly sorting // as there is no reason to sort something that we will end of throwing away if (downsamplingFraction != null) diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 32712412d..27654640f 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.*; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index e8a861a0c..fa2a4ff0a 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -71,12 +71,7 @@ public class TraverseReads extends TraversalEngine { ShardDataProvider dataProvider, T sum) { - if (shard instanceof ReadShard) { - logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize())); - } else if (shard instanceof IntervalShard) { - logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((IntervalShard) shard).getGenomeLoc())); - } - + logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", shard)); if (!(walker instanceof ReadWalker)) throw new IllegalArgumentException("Walker isn't a read walker!"); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactoryTest.java index 0291c50c3..1e84a70f5 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactoryTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactoryTest.java @@ -49,7 +49,7 @@ public class ShardStrategyFactoryTest extends BaseTest { @Test public void testReadNonInterval() { - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100); assertTrue(st instanceof ReadShardStrategy); } @@ -57,19 +57,19 @@ public class ShardStrategyFactoryTest extends BaseTest { public void testReadInterval() { GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100); set.add(l); - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set); assertTrue(st instanceof IntervalShardStrategy); } @Test public void testLinearNonInterval() { - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100); assertTrue(st instanceof LinearLocusShardStrategy); } @Test public void testExpNonInterval() { - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100); assertTrue(st instanceof ExpGrowthLocusShardStrategy); } @@ -77,7 +77,7 @@ public class ShardStrategyFactoryTest extends BaseTest { public void testExpInterval() { GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100); set.add(l); - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set); assertTrue(st instanceof ExpGrowthLocusShardStrategy); } @@ -85,7 +85,7 @@ public class ShardStrategyFactoryTest extends BaseTest { public void testLinearInterval() { GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100); set.add(l); - ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set); + ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set); assertTrue(st instanceof LinearLocusShardStrategy); } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceTest.java index 81b28eca2..868e3a4ae 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceTest.java @@ -81,16 +81,17 @@ public class SAMBAMDataSourceTest extends BaseTest { @Test public void testLinearBreakIterateAll() { logger.warn("Executing testLinearBreakIterateAll"); - // the sharding strat. - ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); - int count = 0; // setup the data fl.add(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam")); Reads reads = new Reads(fl); + // the sharding strat. + SAMDataSource data = new IndexDrivenSAMDataSource(reads); + ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); + int count = 0; + try { - SAMDataSource data = new SAMDataSource(reads); for (Shard sh : strat) { int readCount = 0; count++; @@ -124,13 +125,14 @@ public class SAMBAMDataSourceTest extends BaseTest { @Test public void testMergingTwoBAMFiles() { logger.warn("Executing testMergingTwoBAMFiles"); - // the sharding strat. - ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); - // setup the test files fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam")); - Reads reads = new Reads(fl); + Reads reads = new Reads(fl); + + // the sharding strat. + SAMDataSource data = new IndexDrivenSAMDataSource(reads); + ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); ArrayList readcountPerShard = new ArrayList(); ArrayList readcountPerShard2 = new ArrayList(); @@ -140,7 +142,6 @@ public class SAMBAMDataSourceTest extends BaseTest { int count = 0; try { - SAMDataSource data = new SAMDataSource(reads); for (Shard sh : strat) { int readCount = 0; count++; @@ -173,11 +174,11 @@ public class SAMBAMDataSourceTest extends BaseTest { count = 0; // the sharding strat. - strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); + data = new IndexDrivenSAMDataSource(reads); + strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); logger.debug("Pile two:"); try { - SAMDataSource data = new SAMDataSource(reads); for (Shard sh : strat) { int readCount = 0; count++; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByIntervalTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByIntervalTest.java index 00880492b..3c72961f4 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByIntervalTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByIntervalTest.java @@ -101,11 +101,11 @@ public class SAMByIntervalTest extends BaseTest { int unmappedReadsSeen = 0; int iterations = 0; - SAMDataSource data = new SAMDataSource(reads); + IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads); data.setResourcePool(gen); GenomeLocSortedSet set = new GenomeLocSortedSet(); set.add(GenomeLocParser.createGenomeLoc(0, start, stop)); - ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, gen.getHeader().getSequenceDictionary(), UNMAPPED_READ_COUNT, set); + ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, gen.getHeader().getSequenceDictionary(), UNMAPPED_READ_COUNT, set); StingSAMIterator iter = data.seek(strat.next()); int count = 0; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java index a3a726883..a60f8d8d0 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java @@ -79,7 +79,7 @@ public class SAMByReadsTest extends BaseTest { int unmappedReadsSeen = 0; int iterations = 0; - SAMDataSource data = new SAMDataSource(reads); + IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads); data.setResourcePool(gen); ++iterations; StingSAMIterator ret = data.toUnmappedReads(100); @@ -109,10 +109,10 @@ public class SAMByReadsTest extends BaseTest { targetReadCount = 5; try { int readCount = 0; - SAMDataSource data = new SAMDataSource(reads); + IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads); data.setResourcePool(gen); - shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); + shardStrategy = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); while (shardStrategy.hasNext()) { StingSAMIterator ret = data.seek(shardStrategy.next()); assertTrue(ret != null); @@ -140,11 +140,11 @@ public class SAMByReadsTest extends BaseTest { targetReadCount = 3; try { int readCount = 0; - SAMDataSource data = new SAMDataSource(reads); + IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads); data.setResourcePool(gen); - shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); + shardStrategy = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); while (shardStrategy.hasNext()) { diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java index f1c35549c..d8f32efc4 100755 --- a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java +++ b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; @@ -83,22 +84,21 @@ public class BoundedReadIteratorTest extends BaseTest { @Test public void testBounding() { logger.warn("Executing testBounding"); - // the sharding strat. - ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); - int count = 0; - // setup the test files fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam")); Reads reads = new Reads(fl); + SAMDataSource data = new IndexDrivenSAMDataSource(reads); + // the sharding strat. + ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); + int count = 0; + // our target read final long boundedReadCount = 100; long shardReadCount = 0; try { - SAMDataSource data = new SAMDataSource(reads); - // make sure we have a shard if (!strat.hasNext()) { fail("Our shatter didn't give us a single piece, this is bad"); diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java index e26630af5..e9d5f77c7 100755 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -115,13 +116,12 @@ public class TraverseReadsTest extends BaseTest { } GenomeLocParser.setupRefContigOrdering(ref); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, + SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList)); + dataSource.viewUnmappedReads(false); + ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ShardStrategyFactory.SHATTER_STRATEGY.READS, ref.getSequenceDictionary(), readSize); - SAMDataSource dataSource = new SAMDataSource(new Reads(bamList)); - dataSource.viewUnmappedReads(false); - countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); @@ -162,13 +162,12 @@ public class TraverseReadsTest extends BaseTest { } GenomeLocParser.setupRefContigOrdering(ref); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, + SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList)); + dataSource.viewUnmappedReads(true); + ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ShardStrategyFactory.SHATTER_STRATEGY.READS, ref.getSequenceDictionary(), readSize); - SAMDataSource dataSource = new SAMDataSource(new Reads(bamList)); - dataSource.viewUnmappedReads(true); - countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit();