diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java index c42fddda6..f2fba766c 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java @@ -33,14 +33,27 @@ public class ReadShard implements Shard { // the count of the reads we want to copy off int size = 0; + // this is going to get gross + private final ReadShardStrategy str; + /** * create a read shard, given a read size * @param size */ public ReadShard(int size) { + this.str = null; this.size = size; } + /** + * create a read shard, given a read size + * @param size + */ + ReadShard(ReadShardStrategy caller, int size) { + this.str = caller; + this.size = size; + } + /** @return the genome location represented by this shard */ public GenomeLoc getGenomeLoc() { return null; //To change body of implemented methods use File | Settings | File Templates. diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 1a0643780..b1633edcb 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -3,7 +3,9 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; import edu.mit.broad.picard.sam.SamFileHeaderMerger; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; import org.broadinstitute.sting.utils.GenomeLoc; @@ -40,6 +42,20 @@ public class SAMDataSource implements SimpleDataSource { // our list of readers private final List samFileList = new ArrayList(); + // used for the reads case, the last count of reads retrieved + private long readsTaken = 0; + + // our last genome loc position + private GenomeLoc lastReadPos = null; + + // do we take unmapped reads + private boolean includeUnmappedReads = true; + + // reads based traversal variables + private boolean intoUnmappedReads = false; + private int readsAtLastPos = 0; + + /** * constructor, given sam files * @@ -48,18 +64,16 @@ public class SAMDataSource implements SimpleDataSource { public SAMDataSource(List samFiles) throws SimpleDataSourceLoadException { // check the length if (samFiles.size() < 1) { - throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0"); + throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0"); } for (Object fileName : samFiles) { File smFile; - if ( samFiles.get(0) instanceof String) { - smFile = new File((String)samFiles.get(0)); - } - else if (samFiles.get(0) instanceof File) { - smFile = (File)fileName; - } - else { - throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File"); + if (samFiles.get(0) instanceof String) { + smFile = new File((String) samFiles.get(0)); + } else if (samFiles.get(0) instanceof File) { + smFile = (File) fileName; + } else { + throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File"); } if (!smFile.canRead()) { @@ -71,11 +85,13 @@ public class SAMDataSource implements SimpleDataSource { } - - - - - protected SAMFileReader initializeSAMFile(final File samFile) { + /** + * Load up a sam file. + * + * @param samFile the file name + * @return a SAMFileReader for the file + */ + private SAMFileReader initializeSAMFile(final File samFile) { if (samFile.toString().endsWith(".list")) { return null; } else { @@ -99,6 +115,197 @@ public class SAMDataSource implements SimpleDataSource { */ public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException { + // right now this is pretty damn heavy, it copies the file list into a reader list every time + List lst = GetReaderList(); + + // now merge the headers + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); + + // make a merging iterator for this record + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); + + + // we do different things for locus and read modes + if (locusMode) { + iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1); + } else { + iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1); + } + + // return the iterator + return iter; + } + + + /** + * If we're in by-read mode, this indicates if we want + * to see unmapped reads too. Only seeing mapped reads + * is much faster, but most BAM files have significant + * unmapped read counts. + * + * @param seeUnMappedReads true to see unmapped reads, false otherwise + */ + public void viewUnmappedReads(boolean seeUnMappedReads) { + includeUnmappedReads = seeUnMappedReads; + } + + + /** + *

+ * seek + *

+ * + * @param readCount the length or reads to extract + * @return an iterator for that region + */ + public BoundedReadIterator seek(long readCount) throws SimpleDataSourceLoadException { + // TODO: make extremely less horrible + List lst = GetReaderList(); + + BoundedReadIterator bound; + + // now merge the headers + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); + + // make a merging iterator for this record + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); + if (!includeUnmappedReads) { + return fastMappedReadSeek(readCount, iter); + } else { + return unmappedReadSeek(readCount, iter); + } + + + } + + /** + * Seek, if we want unmapped reads. This method will be faster then the unmapped read method, but you cannot extract the + * unmapped reads. + * + * @param readCount how many reads to retrieve + * @param iter the iterator to use + * @return the bounded iterator that you can use to get the intervaled reads from + * @throws SimpleDataSourceLoadException + */ + private BoundedReadIterator unmappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { + BoundedReadIterator bound;// is this the first time we're doing this? + int count = 0; + + // throw away as many reads as it takes + while (count < this.readsTaken && iter.hasNext()) { + iter.next(); + ++count; + } + + // check to see what happened, did we run out of reads? + if (count != this.readsTaken) { + return null; + } + + // we're good, increment our read cout + this.readsTaken += readCount; + return new BoundedReadIterator(iter,readCount); + + } + + + /** + * Seek, if we only want mapped reads. This method will be faster then the unmapped read method, but you cannot extract the + * unmapped reads. + * + * @param readCount how many reads to retrieve + * @param iter the iterator to use + * @return the bounded iterator that you can use to get the intervaled reads from + * @throws SimpleDataSourceLoadException + */ + private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { + BoundedReadIterator bound;// is this the first time we're doing this? + if (lastReadPos == null) { + lastReadPos = new GenomeLoc(iter.getMergedHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0); + bound = new BoundedReadIterator(iter, readCount); + this.readsTaken = readCount; + } + + // we're not at the beginning, not at the end, so we move forward with our ghastly plan... + else { + + iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1); + + // move the number of reads we read from the last pos + while (iter.hasNext() && this.readsAtLastPos > 0) { + iter.next(); + --readsAtLastPos; + } + if (readsAtLastPos != 0) { + throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0"); + } + + int x = 0; + SAMRecord rec = null; + int lastPos = 0; + + while (x < readsTaken) { + if (iter.hasNext()) { + rec = iter.next(); + if (lastPos == rec.getAlignmentStart()) { + this.readsAtLastPos++; + } else { + this.readsAtLastPos = 1; + } + lastPos = rec.getAlignmentStart(); + //System.out.print("t" + rec.getAlignmentStart() + " "); + ++x; + } else { + // jump contigs + if (lastReadPos.toNextContig() == false) { + return null; + } + iter.close(); + // now merge the headers + // right now this is pretty damn heavy, it copies the file list into a reader list every time + List lst2 = GetReaderList(); + SamFileHeaderMerger mg = new SamFileHeaderMerger(lst2, SORT_ORDER); + iter = new MergingSamRecordIterator2(mg); + iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1); + } + } + + + // if we're off the end of the last contig (into unmapped territory) + if (rec != null && rec.getAlignmentStart() == 0) { + readsTaken += readCount; + intoUnmappedReads = true; + } + // else we're not off the end, store our location + else if (rec != null) { + int stopPos = rec.getAlignmentStart(); + if (stopPos < lastReadPos.getStart()) { + lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos); + } else { + lastReadPos.setStop(rec.getAlignmentStart()); + } + } + // in case we're run out of reads, get out + else { + return null; + } + bound = new BoundedReadIterator(iter, readCount); + } + + + // return the iterator + return bound; + } + + + /** + * A private function that, given the internal file list, generates a SamFileReader + * list of validated files. + * + * @return a list of SAMFileReaders that represent the stored file names + * @throws SimpleDataSourceLoadException if there's a problem loading the files + */ + private List GetReaderList() throws SimpleDataSourceLoadException { // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); for (File f : this.samFileList) { @@ -108,25 +315,7 @@ public class SAMDataSource implements SimpleDataSource { } lst.add(reader); } - - // now merge the headers - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); - - // make a merging iterator for this record - MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); - - - // we do different things for locus and read modes - if (locusMode) { - iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop()+1); - } else { - iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()+1); - } - - // return the iterator - return iter; + return lst; } - - } diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java index 561ed04bf..17729d20c 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java @@ -37,11 +37,19 @@ public class BoundedReadIterator implements CloseableIterator, Iterab // the genome loc we're bounding final private long readCount; private long currentCount = 0; + // the iterator we want to decorate - private CloseableIterator iterator; + private final CloseableIterator iterator; + + // our unmapped read flag + private boolean doNotUseThatUnmappedReadPile = false; // are we open private boolean isOpen = false; + + // the next read we've buffered + private SAMRecord record = null; + /** * constructor * @param iter @@ -50,11 +58,16 @@ public class BoundedReadIterator implements CloseableIterator, Iterab public BoundedReadIterator(CloseableIterator iter, long readCount) { if (iter != null) { isOpen = true; - this.iterator = iter; + } + this.iterator = iter; this.readCount = readCount; } + public void useUnmappedReads(boolean useThem) { + this.doNotUseThatUnmappedReadPile = useThem; + } + /** * Do we have a next? If the iterator has a read and we're not over the read @@ -63,6 +76,11 @@ public class BoundedReadIterator implements CloseableIterator, Iterab */ public boolean hasNext() { if (isOpen && iterator.hasNext() && currentCount < readCount) { + record = iterator.next(); + ++currentCount; + if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { + return false; + } return true; } else { if (isOpen) { @@ -77,11 +95,7 @@ public class BoundedReadIterator implements CloseableIterator, Iterab * @return SAMRecord representing the next read */ public SAMRecord next() { - if (!isOpen) { - throw new UnsupportedOperationException("You cannot call next on a closed iterator"); - } - ++currentCount; - return iterator.next(); + return record; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java index 9980d05a9..8dff73791 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java @@ -110,7 +110,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator, } /** Returns true if any of the underlying iterators has more records, otherwise false. */ - public synchronized boolean hasNext() { + public boolean hasNext() { if (!initialized) { lazyInitialization(); } @@ -118,7 +118,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator, } /** Returns the next record from the top most iterator during merging. */ - public synchronized SAMRecord next() { + public SAMRecord next() { if (!initialized) { lazyInitialization(); } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index e2d92d48e..5f9ab1d92 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -284,6 +284,24 @@ public class GenomeLoc implements Comparable { } } + + /** + * Move this Genome loc to the next contig, with a start + * and stop of 1. + * @return true if we are not out of contigs, otherwise false if we're + * at the end of the genome (no more contigs to jump to). + */ + public boolean toNextContig() { + if (contigIndex < GenomeLoc.contigInfo.size()) { + this.contigIndex++; + this.start = 1; + this.stop = 1; + return true; + } + return false; + } + + /** * Returns true iff we have a specified series of locations to process AND we are past the last * location in the list. It means that, in a serial processing of the genome, that we are done.