diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java index 4d79c0e96..600a38234 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java @@ -1,121 +1,122 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.datasources.simpleDataSources; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.Reads; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; -import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; +import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.utils.StingException; import net.sf.picard.sam.SamFileHeaderMerger; -import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.CloseableIterator; - -import java.util.List; -import java.util.ArrayList; -import java.io.File; -/** - * User: hanna - * Date: Jun 23, 2009 - * Time: 6:49:04 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ /** - * Maintains a pointer into a stream of reads. Tracks state between mapped and unmapped. - * For mapped, assumes that the user will query directly to where they want; closes the iterator after each use. - * For unmapped, assumes that the user will walk through the entire stream. Keeps the iterator open permanently. + * Abstract class that models a current state in some category of reads. 
+ * @author hanna + * @version 0.1 */ -enum MappingType { MAPPED, UNMAPPED } - -class ReadStreamPointer { - /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(ReadStreamPointer.class); - +abstract class ReadStreamPointer { /** * Describes the source of reads data. */ - private final Reads sourceInfo; + protected final Reads sourceInfo; /** * Open handles to the reads info. */ - private final SamFileHeaderMerger headerMerger; + protected final SamFileHeaderMerger headerMerger; - /** - * The (possibly merged) header for the input fileset. - */ - private final SAMFileHeader header; - - /** - * In which bucket of reads does this pointer live? - */ - private MappingType streamPosition = MappingType.MAPPED; - - /** - * A pointer to the current position of this iterator in the read stream. - */ - private PositionTrackingIterator unmappedIterator = null; - - public ReadStreamPointer( Reads sourceInfo ) { + public ReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) { this.sourceInfo = sourceInfo; - this.headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate); - this.header = this.headerMerger.getMergedHeader(); + this.headerMerger = headerMerger; } /** - * Gets the header information for the read stream. - * @return Header information for the read stream. + * Can this pointer access the provided segment efficiently? + * @param segment Segment to test. + * @return True if it would be quick for this segment to access the given data. + * False if accessing this data would require some sort of reinitialization. */ - public SAMFileHeader getHeader() { - return header; - } + public abstract boolean canAccessSegmentEfficiently(DataStreamSegment segment); /** - * Can this pointer be efficiently used to access the given segment? - * @param segment Segment to inspect. - * @return True if the segment can be accessed efficiently, false otherwise. 
+ * Close this resource, destroying all file handles. */ - public boolean canAccessSegmentEfficiently( DataStreamSegment segment ) { - switch( streamPosition ) { - case MAPPED: - return true; - case UNMAPPED: - if( segment instanceof MappedStreamSegment ) - return false; - else if( segment instanceof UnmappedStreamSegment ) { - UnmappedStreamSegment unmappedSegment = (UnmappedStreamSegment)segment; - return unmappedIterator.position <= unmappedSegment.position; - } - else - throw new StingException("Unsupported stream segment type: " + segment.getClass()); - default: - throw new StingException("Pointer has hit illegal stream position; current position is " + streamPosition); - - } - } - public void close() { - if( unmappedIterator != null ) - unmappedIterator.close(); for (SAMFileReader reader : headerMerger.getReaders()) reader.close(); } + /** + * Remove an iterator from service. + * @param iterator The iterator to remove from service. Must not be null. + */ + public abstract void destroy( StingSAMIterator iterator ); + /** * Get a stream of all the reads that overlap a given segment. * @param segment Segment to check for overlaps. * @return An iterator over all reads overlapping the given segment. */ + public abstract StingSAMIterator getReadsOverlapping( MappedStreamSegment segment ); + + /** + * Get a stream of all the reads that are completely contained by a given segment. + * The segment can be mapped or unmapped. + * @param segment Segment to check for containment.. + * @return An iterator over all reads contained by the given segment. + */ + public abstract StingSAMIterator getReadsContainedBy( DataStreamSegment segment ); +} + +class MappedReadStreamPointer extends ReadStreamPointer { + + public MappedReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) { + super( sourceInfo, headerMerger ); + } + + /** + * MappedReadStreamPointers can access any segment efficiently. Always return true. + * @param segment Segment to test. 
+ * @return True. + */ + public boolean canAccessSegmentEfficiently(DataStreamSegment segment) { + return true; + } + + /** + * {@inheritDoc} + */ + public void destroy( StingSAMIterator iterator ) { + iterator.close(); + } + + + /** + * {@inheritDoc} + */ + @Override public StingSAMIterator getReadsOverlapping( MappedStreamSegment segment ) { MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); mergingIterator.queryOverlapping( segment.locus.getContig(), @@ -124,157 +125,98 @@ class ReadStreamPointer { return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); } + /** + * {@inheritDoc} + */ + @Override public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) { - if( segment instanceof MappedStreamSegment ) { - MappedStreamSegment mappedSegment = (MappedStreamSegment)segment; - MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); - mergingIterator.queryContained( mappedSegment.locus.getContig(), - (int)mappedSegment.locus.getStart(), - (int)mappedSegment.locus.getStop()); - return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); - } - else if( segment instanceof UnmappedStreamSegment ) { - UnmappedStreamSegment unmappedSegment = (UnmappedStreamSegment)segment; - - // If the stream position has not flipped over to the unmapped state, do some initialization. 
- if( streamPosition == MappingType.MAPPED ) { - MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); - mergingIterator.queryUnmappedReads(); - unmappedIterator = new PositionTrackingIterator( sourceInfo, mergingIterator, 0L ); - streamPosition = MappingType.UNMAPPED; - } - else { - if( streamPosition != MappingType.UNMAPPED || unmappedIterator == null ) - throw new StingException("Illegal state: iterator has fetched all mapped reads but has not properly transition to unmapped reads"); - - // Force the iterator to the next pending position. - while(unmappedIterator.position < unmappedSegment.position) - unmappedIterator.next(); - } - - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(sourceInfo,unmappedIterator), unmappedSegment.size); - } - else - throw new StingException("Unable to handle stream segment of type" + segment.getClass()); + if( !(segment instanceof MappedStreamSegment) ) + throw new StingException("Trying to access unmapped content from a mapped read stream pointer"); + MappedStreamSegment mappedSegment = (MappedStreamSegment)segment; + MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); + mergingIterator.queryContained( mappedSegment.locus.getContig(), + (int)mappedSegment.locus.getStart(), + (int)mappedSegment.locus.getStop()); + return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); } /** - * A private function that, given the internal file list, generates a merging construct for - * all available files. - * @param reads source information about the reads. - * @param SORT_ORDER sort order for the reads. - * @return a list of SAMFileReaders that represent the stored file names + * Convert a mapped read stream pointer to an unmapped read stream pointer, transferring ownership + * of the underlying file handles to the new container. + * After doing this conversion, the source MappedReadStreamPointer should not be used. 
+ * @return */ - protected SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER ) - throws SimpleDataSourceLoadException { - // right now this is pretty damn heavy, it copies the file list into a reader list every time - List lst = new ArrayList(); - for (File f : reads.getReadsFiles()) { - SAMFileReader reader = new SAMFileReader(f, true); - reader.setValidationStringency(reads.getValidationStringency()); - - final SAMFileHeader header = reader.getFileHeader(); - logger.debug(String.format("Sort order is: " + header.getSortOrder())); - - if (reader.getFileHeader().getReadGroups().size() < 1) { - //logger.warn("Setting header in reader " + f.getName()); - SAMReadGroupRecord rec = new SAMReadGroupRecord(f.getName()); - rec.setLibrary(f.getName()); - rec.setSample(f.getName()); - - reader.getFileHeader().addReadGroup(rec); - } - - lst.add(reader); - } - return new SamFileHeaderMerger(lst,SORT_ORDER,true); - } - - private class PositionTrackingIterator implements StingSAMIterator { - /** - * Source information about the reads. - */ - private Reads sourceInfo; - - /** - * The iterator being tracked. - */ - private CloseableIterator iterator; - - /** - * Current position within the tracked iterator. - */ - private long position; - - /** - * {@inheritDoc} - */ - public Reads getSourceInfo() { - return sourceInfo; - } - - /** - * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as - * the coordinate of the read that will be returned if next() is called. - * @return The current position of the iterator. - */ - public long getPosition() { - return position; - } - - /** - * Create a new iterator wrapping the given position, assuming that the reader is position reads - * into the sequence. - * @param sourceInfo Information about where these reads came from. - * @param iterator Iterator to wraps. - * @param position Non-negative position where the iterator currently sits. 
- */ - public PositionTrackingIterator( Reads sourceInfo, CloseableIterator iterator, long position ) { - this.sourceInfo = sourceInfo; - this.iterator = iterator; - this.position = position; - } - - /** - * {@inheritDoc} - */ - public boolean hasNext() { - return iterator.hasNext(); - } - - /** - * Try to get the next read in the list. If a next read is available, increment the position. - * @return next read in the list, if available. - */ - public SAMRecord next() { - try { - return iterator.next(); - } - finally { - position++; - } - } - - /** - * {@inheritDoc} - */ - public StingSAMIterator iterator() { - return this; - } - - /** - * {@inheritDoc} - */ - public void close() { - // Position tracking iterators are constant through the life of the traversal. Don't close them. - // TODO: This is an artifact of the fact that pooled query iterators need to be closed, but pooled unmapped - // TODO: iterators must not be. Clean this up! - } - - /** - * {@inheritDoc} - */ - public void remove() { throw new UnsupportedOperationException("Cannot remove from a StingSAMIterator"); } - + public UnmappedReadStreamPointer toUnmappedReadStreamPointer() { + return new UnmappedReadStreamPointer( sourceInfo, headerMerger ); } } + +class UnmappedReadStreamPointer extends ReadStreamPointer { + /** + * A pointer to the current position of this iterator in the read stream. + */ + private PositionTrackingIterator unmappedIterator = null; + + public UnmappedReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) { + super( sourceInfo, headerMerger ); + + MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); + mergingIterator.queryUnmappedReads(); + unmappedIterator = new PositionTrackingIterator( sourceInfo, mergingIterator, 0L ); + } + + /** + * UnmappedReadStreamPointers are streams and can therefore access 'future' reads in the file quickly, + * but reads already seen are difficult to seek to. 
+ * @param segment Segment to test. + * @return True if this DataStreamSegment follows the current position. + */ + public boolean canAccessSegmentEfficiently(DataStreamSegment segment) { + if( !(segment instanceof UnmappedStreamSegment) ) + return false; + return unmappedIterator.getPosition() <= ((UnmappedStreamSegment)segment).position; + } + + /** + * {@inheritDoc} + */ + @Override + public StingSAMIterator getReadsOverlapping( MappedStreamSegment segment ) { + throw new UnsupportedOperationException("Unable to determine overlapped reads of an unmapped segment"); + } + + /** + * {@inheritDoc} + */ + @Override + public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) { + if( !(segment instanceof UnmappedStreamSegment) ) + throw new StingException("Trying to access mapped content from an unmapped read stream pointer"); + + UnmappedStreamSegment unmappedSegment = (UnmappedStreamSegment)segment; + + // Force the iterator to the next pending position. + while(unmappedIterator.getPosition() < unmappedSegment.position) + unmappedIterator.next(); + + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(sourceInfo,unmappedIterator), unmappedSegment.size); + } + + /** + * {@inheritDoc} + */ + @Override + public void close() { + if( unmappedIterator != null ) + unmappedIterator.close(); + super.close(); + } + + /** + * {@inheritDoc} + */ + public void destroy( StingSAMIterator iterator ) { + // Don't destroy the iterator; reuse it. 
+ } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java new file mode 100644 index 000000000..e0000562e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.picard.sam.SamFileHeaderMerger; + +import java.util.List; +import java.util.ArrayList; +import java.io.File; + +/** + * Represents a single stream of read data. Used to represent the state of the stream and determine + * whether the state of this resource is such that it can field the desired query. + * @author hanna + * @version 0.1 + */ +class ReadStreamResource { + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(ReadStreamPointer.class); + + /** + * The (possibly merged) header for the input fileset. + */ + private final SAMFileHeader header; + + /** + * A pointer to the current location of the file. + */ + private ReadStreamPointer readStreamPointer = null; + + public ReadStreamResource( Reads sourceInfo ) { + SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate); + + this.header = headerMerger.getMergedHeader(); + readStreamPointer = new MappedReadStreamPointer(sourceInfo, headerMerger); + } + + /** + * Gets the header information for the read stream. + * @return Header information for the read stream. 
+ */ + public SAMFileHeader getHeader() { + return header; + } + + public boolean canAccessSegmentEfficiently(DataStreamSegment segment) { + return readStreamPointer.canAccessSegmentEfficiently(segment); + } + + public void close() { + readStreamPointer.close(); + } + + public void destroy( StingSAMIterator iterator ) { + readStreamPointer.destroy(iterator); + } + + public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) { + if( readStreamPointer instanceof MappedReadStreamPointer && segment instanceof UnmappedStreamSegment ) + readStreamPointer = ((MappedReadStreamPointer)readStreamPointer).toUnmappedReadStreamPointer(); + return readStreamPointer.getReadsContainedBy(segment); + } + + + public StingSAMIterator getReadsOverlapping( MappedStreamSegment segment ) { + return readStreamPointer.getReadsOverlapping(segment); + } + + /** + * A private function that, given the internal file list, generates a merging construct for + * all available files. + * @param reads source information about the reads. + * @param SORT_ORDER sort order for the reads. + * @return a list of SAMFileReaders that represent the stored file names + * @throws SimpleDataSourceLoadException if the file cannot be opened. 
+ */ + private SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER ) + throws SimpleDataSourceLoadException { + // right now this is pretty damn heavy, it copies the file list into a reader list every time + List lst = new ArrayList(); + for (File f : reads.getReadsFiles()) { + SAMFileReader reader = new SAMFileReader(f, true); + reader.setValidationStringency(reads.getValidationStringency()); + + final SAMFileHeader header = reader.getFileHeader(); + logger.debug(String.format("Sort order is: " + header.getSortOrder())); + + if (reader.getFileHeader().getReadGroups().size() < 1) { + //logger.warn("Setting header in reader " + f.getName()); + SAMReadGroupRecord rec = new SAMReadGroupRecord(f.getName()); + rec.setLibrary(f.getName()); + rec.setSample(f.getName()); + + reader.getFileHeader().addReadGroup(rec); + } + + lst.add(reader); + } + return new SamFileHeaderMerger(lst,SORT_ORDER,true); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index da777e97a..8c572d197 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -10,15 +10,12 @@ import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; -import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.SAMReadValidator; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; -import 
java.util.List; /* * Copyright (c) 2009 The Broad Institute @@ -74,15 +71,20 @@ public class SAMDataSource implements SimpleDataSource { private boolean intoUnmappedReads = false; private int readsSeenAtLastPos = 0; + /** + * A histogram of exactly what reads were removed from the input stream and why. + */ + private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); + // A pool of SAM iterators. - private SAMIteratorPool iteratorPool = null; + private SAMResourcePool resourcePool = null; /** * Returns a histogram of reads that were screened out, grouped by the nature of the error. * @return Histogram of reads. Will not be null. */ public SAMReadViolationHistogram getViolationHistogram() { - return iteratorPool.getViolationHistogram(); + return violations; } /** @@ -102,7 +104,7 @@ public class SAMDataSource implements SimpleDataSource { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); } } - iteratorPool = new SAMIteratorPool(reads); + resourcePool = new SAMResourcePool(reads); } /** @@ -111,7 +113,7 @@ public class SAMDataSource implements SimpleDataSource { * @return SAM file header. */ public SAMFileHeader getHeader() { - return iteratorPool.getHeader(); + return resourcePool.getHeader(); } /** @@ -123,7 +125,7 @@ public class SAMDataSource implements SimpleDataSource { public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException { // setup the iterator pool if it's not setup boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? 
false : true; - iteratorPool.setQueryOverlapping(queryOverlapping); + resourcePool.setQueryOverlapping(queryOverlapping); StingSAMIterator iterator = null; if (shard.getShardType() == Shard.ShardType.READ) { @@ -158,7 +160,7 @@ public class SAMDataSource implements SimpleDataSource { * @return an iterator for that region */ private StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException { - return iteratorPool.iterator(new MappedStreamSegment(location)); + return createIterator( new MappedStreamSegment(location) ); } @@ -177,11 +179,11 @@ public class SAMDataSource implements SimpleDataSource { if (!intoUnmappedReads) { if (lastReadPos == null) { lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); - iter = iteratorPool.iterator(new MappedStreamSegment(lastReadPos)); + iter = createIterator(new MappedStreamSegment(lastReadPos)); return InitialReadIterator(shard.getSize(), iter); } else { lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); - iter = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iteratorPool.iterator(new MappedStreamSegment(lastReadPos)))); + iter = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos)))); } if (intoUnmappedReads && !includeUnmappedReads) @@ -214,10 +216,10 @@ public class SAMDataSource implements SimpleDataSource { /** * For unit testing, add a custom iterator pool. * - * @param iteratorPool Custom mock iterator pool. + * @param resourcePool Custom mock iterator pool. 
*/ - void setResourcePool( SAMIteratorPool iteratorPool ) { - this.iteratorPool = iteratorPool; + void setResourcePool( SAMResourcePool resourcePool ) { + this.resourcePool = resourcePool; } /** @@ -228,7 +230,7 @@ public class SAMDataSource implements SimpleDataSource { * @return the bounded iterator that you can use to get the intervaled reads from */ StingSAMIterator toUnmappedReads( long readCount ) { - StingSAMIterator iter = iteratorPool.iterator(new UnmappedStreamSegment(readsTaken, readCount)); + StingSAMIterator iter = createIterator(new UnmappedStreamSegment(readsTaken, readCount)); readsTaken += readCount; return iter; } @@ -277,7 +279,7 @@ public class SAMDataSource implements SimpleDataSource { readsTaken = readCount; readsSeenAtLastPos = 0; lastReadPos = GenomeLocParser.setStop(lastReadPos,-1); - CloseableIterator ret = iteratorPool.iterator(new MappedStreamSegment(lastReadPos)); + CloseableIterator ret = createIterator(new MappedStreamSegment(lastReadPos)); return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount); } } @@ -344,6 +346,16 @@ public class SAMDataSource implements SimpleDataSource { return bound; } + /** + * Creates an iterator over the selected segment, from a resource pulled from the pool. + * @param segment Segment over which to gather reads. + * @return An iterator over just the reads in the given segment. + */ + private StingSAMIterator createIterator( DataStreamSegment segment ) { + StingSAMIterator iterator = resourcePool.iterator(segment); + return new MalformedSAMFilteringIterator( getHeader(), iterator, violations ); + } + /** * Filter reads based on user-specified criteria. 
* @@ -376,127 +388,10 @@ public class SAMDataSource implements SimpleDataSource { private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter { public boolean filterOut(SAMRecord rec) { - if (rec.getMappingQuality() == 0) { - //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format()); - return true; - } else { - return false; - } + return (rec.getMappingQuality() == 0); } } } -class SAMIteratorPool extends ResourcePool { - /** Source information about the reads. */ - protected Reads reads; - /** - * A histogram of exactly what reads were removed from the input stream and why. - */ - private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); - - /** Is this a by-reads traversal or a by-locus? */ - protected boolean queryOverlapping; - - /** File header for the combined file. */ - protected SAMFileHeader header; - - /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(SAMIteratorPool.class); - - public SAMIteratorPool( Reads reads ) { - this.reads = reads; - this.queryOverlapping = true; - - ReadStreamPointer streamPointer = createNewResource(); - this.header = streamPointer.getHeader(); - // Add this resource to the pool. - this.addNewResource(streamPointer); - } - - /** Get the combined header for all files in the iterator pool. */ - public SAMFileHeader getHeader() { - return header; - } - - /** - * Returns a histogram of reads that were screened out, grouped by the nature of the error. - * @return Histogram of reads. Will not be null. 
- */ - public SAMReadViolationHistogram getViolationHistogram() { - return violations; - } - - protected ReadStreamPointer selectBestExistingResource( DataStreamSegment segment, List pointers ) { - for (ReadStreamPointer pointer : pointers) { - if (pointer.canAccessSegmentEfficiently(segment)) { - return pointer; - } - } - return null; - } - - protected ReadStreamPointer createNewResource() { - return new ReadStreamPointer(reads); - } - - protected StingSAMIterator createIteratorFromResource( DataStreamSegment segment, ReadStreamPointer streamPointer ) { - StingSAMIterator iterator = null; - - if (!queryOverlapping) - iterator = streamPointer.getReadsContainedBy(segment); - else { - if (!( segment instanceof MappedStreamSegment )) - throw new StingException("Segment is unmapped; true overlaps cannot be determined."); - iterator = streamPointer.getReadsOverlapping((MappedStreamSegment) segment); - } - - return new ReleasingIterator(new MalformedSAMFilteringIterator(header, iterator, violations)); - } - - protected void closeResource( ReadStreamPointer resource ) { - resource.close(); - } - - private class ReleasingIterator implements StingSAMIterator { - private final StingSAMIterator wrappedIterator; - - public Reads getSourceInfo() { - return wrappedIterator.getSourceInfo(); - } - - public ReleasingIterator( StingSAMIterator wrapped ) { - this.wrappedIterator = wrapped; - } - - public ReleasingIterator iterator() { - return this; - } - - public void remove() { - throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); - } - - public void close() { - wrappedIterator.close(); - release(this); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public SAMRecord next() { - return wrappedIterator.next(); - } - } - - public boolean isQueryOverlapping() { - return queryOverlapping; - } - - public void setQueryOverlapping( boolean queryOverlapping ) { - this.queryOverlapping = queryOverlapping; - } -} diff --git 
a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java new file mode 100644 index 000000000..dc213b23d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.utils.StingException; +import org.apache.log4j.Logger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; + +import java.util.List; + +/** + * Maintain a pool of resources of accessors to SAM read data. 
SAMFileReaders and + * headers are actually quite expensive to open, so this class manages the mechanics + * of keeping them open and reusing them. + * @author hanna + * @version 0.1 + */ +class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator> { + /** Source information about the reads. */ + protected Reads reads; + + /** Is this a by-reads traversal or a by-locus? */ + protected boolean queryOverlapping; + + /** File header for the combined file. */ + protected SAMFileHeader header; + + /** Our logger, which captures any output from this class. */ + protected static Logger logger = Logger.getLogger(SAMResourcePool.class); + + /** Create a pool over the given reads with overlap queries enabled; one resource is created immediately, its header is kept as the pool header, and it is added to the pool. */ + public SAMResourcePool( Reads reads ) { + this.reads = reads; + this.queryOverlapping = true; + + ReadStreamResource streamResource = createNewResource(); + this.header = streamResource.getHeader(); + // Add this resource to the pool. + this.addNewResource(streamResource); + } + + /** Get the combined header for all files in the iterator pool. */ + public SAMFileHeader getHeader() { + return header; + } + + /** Return the first resource in the list that reports it can access the segment efficiently, or null if none does. */ + protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List<ReadStreamResource> resources ) { + for (ReadStreamResource resource : resources) { + if (resource.canAccessSegmentEfficiently(segment)) { + return resource; + } + } + return null; + } + + /** Create a new stream resource over this pool's reads. */ + protected ReadStreamResource createNewResource() { + return new ReadStreamResource(reads); + } + + /** Fetch the segment's reads from the resource: contained reads when queryOverlapping is false, otherwise overlapping reads, which requires a MappedStreamSegment. The result is wrapped in a ReleasingIterator. */ + protected StingSAMIterator createIteratorFromResource( DataStreamSegment segment, ReadStreamResource streamResource ) { + StingSAMIterator iterator = null; + + if (!queryOverlapping) + iterator = streamResource.getReadsContainedBy(segment); + else { + if (!( segment instanceof MappedStreamSegment )) + throw new StingException("Segment is unmapped; true overlaps cannot be determined."); + iterator = streamResource.getReadsOverlapping((MappedStreamSegment) segment); + } + + return new ReleasingIterator( streamResource, iterator ); + } + + /** Close the given resource. */ + protected void closeResource( ReadStreamResource resource ) { 
+ resource.close(); + } + + /** Iterator wrapper whose close() destroys the wrapped iterator through its resource and then calls release() on the enclosing pool. */ + private class ReleasingIterator implements StingSAMIterator { + /** + * The resource acting as the source of the data. + */ + private final ReadStreamResource resource; + + /** + * The iterator to wrap. + */ + private final StingSAMIterator wrappedIterator; + + public Reads getSourceInfo() { + return wrappedIterator.getSourceInfo(); + } + + public ReleasingIterator( ReadStreamResource resource, StingSAMIterator wrapped ) { + this.resource = resource; + this.wrappedIterator = wrapped; + } + + public ReleasingIterator iterator() { + return this; + } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); + } + + public void close() { + resource.destroy(wrappedIterator); + release(this); + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecord next() { + return wrappedIterator.next(); + } + } + + public boolean isQueryOverlapping() { + return queryOverlapping; + } + + public void setQueryOverlapping( boolean queryOverlapping ) { + this.queryOverlapping = queryOverlapping; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java new file mode 100644 index 000000000..3c92d31ee --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright 
notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.iterators; + +import org.broadinstitute.sting.gatk.Reads; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + +/** + * Iterates through a list of elements, tracking the number of elements it has seen. + * @author hanna + * @version 0.1 + */ +public class PositionTrackingIterator implements StingSAMIterator { + /** + * Source information about the reads. + */ + private final Reads sourceInfo; + + /** + * The iterator being tracked. + */ + private final CloseableIterator<SAMRecord> iterator; + + /** + * Current position within the tracked iterator. + */ + private long position; + + /** + * {@inheritDoc} + */ + public Reads getSourceInfo() { + return sourceInfo; + } + + /** + * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as + * the coordinate of the read that will be returned if next() is called. + * @return The current position of the iterator. + */ + public long getPosition() { + return position; + } + + /** + * Create a new iterator wrapping the given iterator, assuming that the reader is already 'position' reads + * into the sequence. + * @param sourceInfo Information about where these reads came from. + * @param iterator Iterator to wrap. + * @param position Non-negative position where the iterator currently sits. 
+ */ + public PositionTrackingIterator( Reads sourceInfo, CloseableIterator<SAMRecord> iterator, long position ) { + this.sourceInfo = sourceInfo; + this.iterator = iterator; + this.position = position; + } + + /** + * {@inheritDoc} + */ + public boolean hasNext() { + return iterator.hasNext(); + } + + /** + * Try to get the next read in the list. If a next read is available, increment the position. + * @return next read in the list, if available. + */ + public SAMRecord next() { + /* + * Fetch before counting: if the wrapped iterator throws, the position must stay where it was. + */ + final SAMRecord read = iterator.next(); + position++; + return read; + } + + /** + * {@inheritDoc} + */ + public StingSAMIterator iterator() { + return this; + } + + /** + * {@inheritDoc} + */ + public void close() { + iterator.close(); + } + + /** + * {@inheritDoc} + */ + public void remove() { throw new UnsupportedOperationException("Cannot remove from a StingSAMIterator"); } +} diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java index c79b7eab9..0aa799cec 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java @@ -41,7 +41,7 @@ import java.io.File; /** * use this to inject into SAMDataSource for testing */ -public class ArtificialResourcePool extends SAMIteratorPool { +public class ArtificialResourcePool extends SAMResourcePool { // How strict should we be with SAM/BAM parsing? protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;