diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java index de0855a3b..d1416f3b0 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java @@ -115,19 +115,19 @@ public abstract class ShardStrategy implements Iterator, Iterable public Shard next() { // lets get some background info on the problem long length = dic.getSequence(seqLoc).getSequenceLength(); - long proposedSize = nextShardSize() - 1; + long proposedSize = nextShardSize(); long nextStart = mLoc.getStop() + 1; // can we fit it into the current seq size? - if (nextStart + proposedSize < length) { + if (nextStart + proposedSize - 1 < length) { lastGenomeLocSize = proposedSize; - mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize); - return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize)); + mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1); + return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1)); } // else we can't make it in the current location, we have to stitch one together else { - long overflow = nextStart + proposedSize - length; + long overflow = nextStart + proposedSize -1 - length; logger.debug("Overflow = " + overflow + " length: " + length); - lastGenomeLocSize = lastGenomeLocSize - overflow; + lastGenomeLocSize = proposedSize - overflow; // move to the next contig // the next sequence should start at the begining of the next contig Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize)); diff --git 
a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java index 863edf7be..505772ac0 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java @@ -37,35 +37,31 @@ public class SAMBAMDataSource implements SimpleDataSource { // do we care that the SAM files respect the sort order. private boolean matchedSortOrders = true; - // our merged sam iterator for spliting up the files - MergingSamRecordIterator2 mergeIterator; - // are we set to locus mode or read mode for dividing private boolean locusMode = true; // How strict should we be with SAM/BAM parsing? protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; + // our list of readers + private final List samFileList = new ArrayList(); + /** * constructor, given a single sam file * * @param samFiles the list of sam files */ public SAMBAMDataSource(List samFiles) throws SimpleDataSourceLoadException { - List readers = new ArrayList(); for (String fileName : samFiles) { File smFile = new File(fileName); if (!smFile.canRead()) { throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName); } - SAMFileReader reader = initializeSAMFile(smFile); - if (reader != null) { - readers.add(reader); - } + samFileList.add(smFile); + } - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); - this.mergeIterator = new MergingSamRecordIterator2(headerMerger); + //SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER); } @@ -73,6 +69,7 @@ public class SAMBAMDataSource implements SimpleDataSource { if (samFile.toString().endsWith(".list")) { return null; } else { + System.err.println("initializeSAMFile"); SAMFileReader samReader = new 
SAMFileReader(samFile, true); samReader.setValidationStringency(strictness); @@ -96,19 +93,43 @@ public class SAMBAMDataSource implements SimpleDataSource { /** *

- * getQueryRegionIterator + * seek *

* * @param location the genome location to extract data for * @return an iterator for that region */ - public MergingSamRecordIterator2 seek(GenomeLoc location) { - MergingSamRecordIterator2 iter = null; // new MergingSamRecordIterator2(this.mergeIterator.getMergedHeader().); + public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException { + + // right now this is pretty damn heavy, it copies the file list into a reader list every time + List lst = new ArrayList(); + for (File f : this.samFileList) { + SAMFileReader reader = initializeSAMFile(f); + if (reader == null) { + throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + f); + } + lst.add(reader); + } + + // now merge the headers + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); + + // make a merging iterator for this record + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); + + + System.err.println("About to query"); + // we do different things for locus and read modes if (locusMode) { iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true); } else { iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()); } - return iter; //To change body of implemented methods use File | Settings | File Templates. 
+ + // return the iterator + return iter; } + + + } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java index 408672f38..da70db932 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java @@ -32,6 +32,6 @@ public interface SimpleDataSource extends Serializable { * @param location the genome location to extract data for * @return an iterator of the appropriate type, that is limited by the region */ - public Iterator seek(GenomeLoc location); + public Iterator seek(GenomeLoc location) throws SimpleDataSourceLoadException; } diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java index c3627880a..f560f9acb 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java @@ -10,35 +10,30 @@ */ package org.broadinstitute.sting.gatk.iterators; -import edu.mit.broad.picard.sam.SamFileHeaderMerger; -import edu.mit.broad.picard.sam.ReservedTagConstants; import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.sam.ReservedTagConstants; +import edu.mit.broad.picard.sam.SamFileHeaderMerger; import edu.mit.broad.picard.util.PeekableIterator; import net.sf.samtools.*; +import net.sf.samtools.util.CloseableIterator; -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. 
-* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or -* functionality. -*/ -import java.util.*; import java.lang.reflect.Constructor; +import java.util.Comparator; +import java.util.Iterator; +import java.util.PriorityQueue; /** * Provides an iterator interface for merging multiple underlying iterators into a single * iterable stream. The underlying iterators/files must all have the same sort order unless * the requested output format is unsorted, in which case any combination is valid. */ -public class MergingSamRecordIterator2 implements Iterator { - protected PriorityQueue pq; +public class MergingSamRecordIterator2 implements CloseableIterator, Iterable { + protected PriorityQueue pq = null; protected final SamFileHeaderMerger samHeaderMerger; protected final SAMFileHeader.SortOrder sortOrder; + protected boolean initialized = false; + /** * Constructs a new merging iterator with the same set of readers and sort order as * provided by the header merger parameter. 
@@ -46,8 +41,18 @@ public class MergingSamRecordIterator2 implements Iterator { public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) { this.samHeaderMerger = headerMerger; this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); - initializePQ(); + this.pq = new PriorityQueue(samHeaderMerger.getReaders().size()); + } + + /** + * this class MUST only be initialized once; calling this again after initialization throws UnsupportedOperationException + */ + private void lazyInitialization() { + if (initialized) { + throw new UnsupportedOperationException("You cannot double initialize a MergingSamRecordIterator2"); + } + initialized = true; final SAMRecordComparator comparator = getComparator(); for (final SAMFileReader reader : samHeaderMerger.getReaders()) { if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) { @@ -59,17 +64,15 @@ public class MergingSamRecordIterator2 implements Iterator { } } - - protected void initializePQ() { - this.pq = new PriorityQueue(samHeaderMerger.getReaders().size()); - } - - public boolean supportsSeeking() { + public boolean supportsSeeking() { return true; } public void queryOverlapping(final String contig, final int start, final int stop) { - initializePQ(); // reinitialize the system + if (initialized) { + throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); + } + initialized = true; final SAMRecordComparator comparator = getComparator(); for (final SAMFileReader reader : samHeaderMerger.getReaders()) { @@ -80,9 +83,13 @@ public class MergingSamRecordIterator2 implements Iterator { } public void query(final String contig, final int start, final int stop, final boolean contained) { - initializePQ(); // reinitialize the system + if (initialized) { + throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); + } + initialized = true; final SAMRecordComparator comparator = getComparator(); for (final SAMFileReader reader : 
samHeaderMerger.getReaders()) { + //reader.close(); Iterator recordIter = reader.query(contig, start, stop, contained); final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator); addIfNotEmpty(iterator); @@ -90,7 +97,10 @@ public class MergingSamRecordIterator2 implements Iterator { } public void queryContained(final String contig, final int start, final int stop) { - initializePQ(); // reinitialize the system + if (initialized) { + throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); + } + initialized = true; final SAMRecordComparator comparator = getComparator(); for (final SAMFileReader reader : samHeaderMerger.getReaders()) { Iterator recordIter = reader.queryContained(contig, start, stop); @@ -106,6 +116,9 @@ public class MergingSamRecordIterator2 implements Iterator { /** Returns the next record from the top most iterator during merging. */ public synchronized SAMRecord next() { + if (!initialized) { + lazyInitialization(); + } final ComparableSamRecordIterator iterator = this.pq.poll(); final SAMRecord record = iterator.next(); addIfNotEmpty(iterator); @@ -186,6 +199,26 @@ public class MergingSamRecordIterator2 implements Iterator { public SAMFileHeader getMergedHeader() { return this.samHeaderMerger.getMergedHeader(); } + + + /** + * closes all the file handles for the readers. DO THIS or you will run out of handles + * with sharding. + */ + public void close() { + for (SAMFileReader reader : samHeaderMerger.getReaders()) { + reader.close(); + } + } + + + /** + * allows this iterator to be used directly in for-each loops + * @return this iterator itself + */ + public Iterator iterator() { + return this; + } } // Should replace picard class with the same name