diff --git a/java/src/net/sf/samtools/BAMBlockIterator.java b/java/src/net/sf/samtools/BAMBlockIterator.java index 0898237f8..b87bce8ee 100644 --- a/java/src/net/sf/samtools/BAMBlockIterator.java +++ b/java/src/net/sf/samtools/BAMBlockIterator.java @@ -1,15 +1,9 @@ package net.sf.samtools; -import net.sf.samtools.util.BinaryCodec; -import net.sf.samtools.util.BlockCompressedInputStream; -import net.sf.samtools.util.StringLineReader; import net.sf.samtools.util.CloseableIterator; -import net.sf.samtools.util.SeekableStream; +import net.sf.picard.PicardException; -import java.util.Iterator; import java.util.NoSuchElementException; -import java.util.Arrays; -import java.nio.channels.FileChannel; import java.io.IOException; import java.io.FileInputStream; import java.io.File; @@ -22,6 +16,11 @@ import java.io.File; * @version 0.1 */ public class BAMBlockIterator implements CloseableIterator { + /** + * The input stream for back files. + */ + private FileInputStream inputStream; + /** * File channel from which to read chunks. */ @@ -35,13 +34,23 @@ public class BAMBlockIterator implements CloseableIterator { /** * Iterate through the BAM chunks in a file. * @param file stream File to use when accessing the BAM. + * @throws IOException in case of problems reading from the file. */ public BAMBlockIterator(File file) throws IOException { - FileInputStream inputStream = new FileInputStream(file); - this.blockReader = new BlockReader(inputStream); + inputStream = new FileInputStream(file); + blockReader = new BlockReader(inputStream); } + /** + * Close the existing file. + */ public void close() { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new PicardException("Unable to close file", ex); + } } /** @@ -59,7 +68,7 @@ public class BAMBlockIterator implements CloseableIterator { /** * Get the next chunk from the iterator. * @return The next chunk. - * @throw NoSuchElementException if no next chunk is available. + * @throws NoSuchElementException if no next chunk is available. */ public Block next() { if(!hasNext()) diff --git a/java/src/net/sf/samtools/BAMChunkIterator.java b/java/src/net/sf/samtools/BAMChunkIterator.java new file mode 100644 index 000000000..017ae6e61 --- /dev/null +++ b/java/src/net/sf/samtools/BAMChunkIterator.java @@ -0,0 +1,87 @@ +package net.sf.samtools; + +import java.util.*; + +/** + * Adapts a list of BAM blocks to BAM chunks. + * + * @author mhanna + * @version 0.1 + */ +public class BAMChunkIterator implements Iterator { + /** + * The wrapped chunk iterator. + */ + private final BAMBlockIterator blockIterator; + + /** + * List of prefetched segments. If prefetchedSegments is + * empty, this should mean that there's nothing left in the queue. + */ + private Queue prefetchedSegments; + + /** + * List of chunks to filter out of the final chunk list. + */ + private final Queue filters; + + /** + * Creates a new chunk iterator wrapping the given block, optionally filtering + * out a list of chunks from those blocks. + * TODO: This class is too smart. Wrap filtering functionality in another class. + * @param blockIterator block iterator to wrap. + * @param filters List of chunks to filter out of the given input stream. + */ + public BAMChunkIterator(BAMBlockIterator blockIterator, List filters) { + this.blockIterator = blockIterator; + this.prefetchedSegments = new LinkedList(); + this.filters = new PriorityQueue(filters); + } + + /** + * Is a next chunk available. + * @return true if a next chunk is available; false otherwise. + */ + public boolean hasNext() { + return !prefetchedSegments.isEmpty(); + } + + /** + * Returns the next chunk in the list. + * @return Next block, adapted to a chunk. + */ + public Chunk next() { + if(prefetchedSegments.isEmpty()) + throw new NoSuchElementException("BAMChunkIterator is exhausted."); + // TODO: Maybe tie together adjacent chunks together? + Chunk nextChunk = prefetchedSegments.remove().toChunk(); + seedNextSegments(); + return nextChunk; + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a BAM chunk iterator"); + } + + /** + * Seed the next value or set of values for the iterator, if necessary. + */ + private void seedNextSegments() { + while(prefetchedSegments.isEmpty() && blockIterator.hasNext()) { + // Add the new segments to the prefetch... + prefetchedSegments.add(new BlockSegment(blockIterator.next())); + + // And iteratively subtract the filtering chunks. + for(Chunk filter: filters) { + Queue filteredBlockSegments = new LinkedList(); + for(BlockSegment blockSegment: prefetchedSegments) + filteredBlockSegments.addAll(blockSegment.minus(filter)); + prefetchedSegments = filteredBlockSegments; + } + } + } + +} diff --git a/java/src/net/sf/samtools/BAMFileReader2.java b/java/src/net/sf/samtools/BAMFileReader2.java new file mode 100644 index 000000000..a1e8708e4 --- /dev/null +++ b/java/src/net/sf/samtools/BAMFileReader2.java @@ -0,0 +1,630 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.BinaryCodec; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.StringLineReader; +import net.sf.samtools.SAMFileReader.ValidationStringency; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NoSuchElementException; +import java.net.URL; + +/** + * Internal class for reading and querying BAM files. + */ +class BAMFileReader2 + extends SAMFileReader2.ReaderImplementation2 { + // True if reading from a File rather than an InputStream + private boolean mIsSeekable = false; + // For converting bytes into other primitive types + private BinaryCodec mStream = null; + // Underlying compressed data stream. + private final BlockCompressedInputStream mCompressedInputStream; + private SAMFileHeader mFileHeader = null; + // Populated if the file is seekable and an index exists + private BAMFileIndex mFileIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + // If true, all SAMRecords are fully decoded as they are read. + private final boolean eagerDecode; + // For error-checking. + private ValidationStringency mValidationStringency; + + /** + * Prepare to read BAM from a stream (not seekable) + * @param stream source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader2(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency) + throws IOException { + mIsSeekable = false; + mCompressedInputStream = new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + readHeader(null); + } + + /** + * Prepare to read BAM from a file (seekable) + * @param file source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader2(final File file, final boolean eagerDecode, final ValidationStringency validationStringency) + throws IOException { + this(new BlockCompressedInputStream(file), eagerDecode, file.getAbsolutePath(), validationStringency); + } + + + BAMFileReader2(final URL url, final boolean eagerDecode, final ValidationStringency validationStringency) + throws IOException { + this(new BlockCompressedInputStream(url), eagerDecode, url.toString(), validationStringency); + } + + private BAMFileReader2(final BlockCompressedInputStream compressedInputStream, final boolean eagerDecode, + final String source, final ValidationStringency validationStringency) + throws IOException { + mIsSeekable = true; + mCompressedInputStream = compressedInputStream; + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + readHeader(source); + mFirstRecordPointer = mCompressedInputStream.getFilePointer(); + } + + void close() { + if (mStream != null) { + mStream.close(); + } + mStream = null; + mFileHeader = null; + mFileIndex = null; + } + + /** + * @return the file index, if one exists, else null. + */ + BAMFileIndex getFileIndex() { + return mFileIndex; + } + + void setFileIndex(final BAMFileIndex fileIndex) { + mFileIndex = fileIndex; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Set error-checking level for subsequent SAMRecord reads. + */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + SAMFileReader.ValidationStringency getValidationStringency() { + return this.mValidationStringency; + } + + /** + * Prepare to iterate through the SAMRecords in file order. + * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, + * that iterator must be closed before getIterator() can be called again. + * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to + * getIterator() begins its iteration where the last one left off. That is the best that can be + * done in that situation. + */ + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mCompressedInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + CloseableIterator getIterator(List chunks) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mCompressedInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + // Create an iterator over the given chunk boundaries. + mCurrentIterator = new BAMFileIndexIterator(Chunk.toCoordinateArray(chunks)); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords that match the given interval. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. + * A value of zero implies the start of the reference sequence. + * @param end A value of zero implies the end of the reference sequence. + * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval + * specified by start and end. If false, the SAMRecords need only overlap the interval. + * @return Iterator for the matching SAMRecords + */ + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + if (mFileIndex == null) { + throw new IllegalStateException("No BAM file index is available"); + } + mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords with the given alignment start. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Alignment start sought. + * @return Iterator for the matching SAMRecords. + */ + CloseableIterator queryAlignmentStart(final String sequence, final int start) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + if (mFileIndex == null) { + throw new IllegalStateException("No BAM file index is available"); + } + mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); + return mCurrentIterator; + } + + public CloseableIterator queryUnmapped() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + if (mFileIndex == null) { + throw new IllegalStateException("No BAM file index is available"); + } + try { + final long startOfLastLinearBin = mFileIndex.getStartOfLastLinearBin(); + if (startOfLastLinearBin != -1) { + mCompressedInputStream.seek(startOfLastLinearBin); + } else { + // No mapped reads in file, just start at the first read in file. + mCompressedInputStream.seek(mFirstRecordPointer); + } + mCurrentIterator = new BAMFileIndexUnmappedIterator(); + return mCurrentIterator; + } catch (IOException e) { + throw new RuntimeException("IOException seeking to unmapped reads", e); + } + } + + /** + * Reads the header from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private void readHeader(final String source) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(mValidationStringency); + mFileHeader = headerCodec.decode(new StringLineReader(textHeader), + source); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + mFileHeader.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + source); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + source); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(source)); + } + mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } + } + + /** + * Reads a single binary sequence record from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private SAMSequenceRecord readSequenceRecord(final String source) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + return new SAMSequenceRecord(sequenceName, sequenceLength); + } + + /** + * Iterator for non-indexed sequential iteration through all SAMRecords in file. + * Starting point of iteration is wherever current file position is when the iterator is constructed. + */ + private class BAMFileIterator implements CloseableIterator { + private SAMRecord mNextRecord = null; + private final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(getFileHeader()); + private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file + + BAMFileIterator() { + this(true); + } + + /** + * @param advance Trick to enable subclass to do more setup before advancing + */ + BAMFileIterator(final boolean advance) { + this.bamRecordCodec.setInputStream(BAMFileReader2.this.mStream.getInputStream()); + + if (advance) { + advance(); + } + } + + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public SAMRecord next() { + final SAMRecord result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + if (mNextRecord != null) { + ++this.samRecordIndex; + // Because some decoding is done lazily, the record needs to remember the validation stringency. + mNextRecord.setValidationStringency(mValidationStringency); + + if (mValidationStringency != ValidationStringency.SILENT) { + final List validationErrors = mNextRecord.isValid(); + SAMUtils.processValidationErrors(validationErrors, + this.samRecordIndex, BAMFileReader2.this.getValidationStringency()); + } + } + if (eagerDecode && mNextRecord != null) { + mNextRecord.eagerDecode(); + } + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + /** + * Read the next record from the input stream. + */ + SAMRecord getNextRecord() throws IOException { + return bamRecordCodec.decode(); + } + + /** + * @return The record that will be return by the next call to next() + */ + protected SAMRecord peek() { + return mNextRecord; + } + } + + enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} + + /** + * Creates an iterator over indexed data in the specified range. + * @param sequence Sequence to which to constrain the data. + * @param start Starting position within the above sequence to which the data should be constrained. + * @param end Ending position within the above sequence to which the data should be constrained.s + * @param queryType Type of query. Useful for establishing the boundary rules. + * @return An iterator over the requested data. + */ + private CloseableIterator createIndexIterator(final String sequence, + final int start, + final int end, + final QueryType queryType) { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + final SAMFileHeader fileHeader = getFileHeader(); + int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMFileIndex fileIndex = getFileIndex(); + filePointers = fileIndex.getSearchBins(referenceIndex, start, end); + } + + // Create an iterator over the above chunk boundaries. + BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); + + // Add some preprocessing filters for edge-case reads that don't fit into this + // query type. + return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); + } + + private class BAMFileIndexIterator + extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + + BAMFileIndexIterator(final long[] filePointers) { + super(false); // delay advance() until after construction + mFilePointers = filePointers; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + while (true) { + // Advance to next file block if necessary + while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mCompressedInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + final SAMRecord record = super.getNextRecord(); + if (record == null) { + return null; + } + } + } + } + + /** + * A decorating iterator that filters out records that are outside the bounds of the + * given query parameters. + */ + private class BAMQueryFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private final CloseableIterator wrappedIterator; + + /** + * The next record to be returned. Will be null if no such record exists. + */ + private SAMRecord nextRead; + + private final int mReferenceIndex; + private final int mRegionStart; + private final int mRegionEnd; + private final QueryType mQueryType; + + public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { + this.wrappedIterator = iterator; + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + mRegionStart = start; + if (queryType == QueryType.STARTING_AT) { + mRegionEnd = mRegionStart; + } else { + mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; + } + mQueryType = queryType; + advance(); + } + + /** + * Returns true if a next element exists; false otherwise. + */ + public boolean hasNext() { + return nextRead != null; + } + + /** + * Gets the next record from the given iterator. + * @return The next SAM record in the iterator. + */ + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); + final SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + /** + * Closes down the existing iterator. + */ + public void close() { + wrappedIterator.close(); + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + SAMRecord advance() { + while (true) { + // Pull next record from stream + if(!wrappedIterator.hasNext()) + return null; + + final SAMRecord record = wrappedIterator.next(); + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + // If read is unmapped but has a coordinate, return it if the coordinate is within + // the query region, regardless of whether the mapped mate will be returned. + final int alignmentEnd; + if (mQueryType == QueryType.STARTING_AT) { + alignmentEnd = -1; + } else { + alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? + record.getAlignmentEnd(): alignmentStart); + } + + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + return null; + } + // Filter for overlap with region + if (mQueryType == QueryType.CONTAINED) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else if (mQueryType == QueryType.OVERLAPPING) { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } else { + if (alignmentStart == mRegionStart) { + return record; + } + } + } + } + } + + private class BAMFileIndexUnmappedIterator extends BAMFileIterator { + private BAMFileIndexUnmappedIterator() { + while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + advance(); + } + } + } + +} diff --git a/java/src/net/sf/samtools/BlockSegment.java b/java/src/net/sf/samtools/BlockSegment.java new file mode 100644 index 000000000..be8fd146b --- /dev/null +++ b/java/src/net/sf/samtools/BlockSegment.java @@ -0,0 +1,112 @@ +package net.sf.samtools; + +import java.util.List; +import java.util.Collections; +import java.util.ArrayList; + +/** + * Represents a segment of a given block. A block segment can easily be + * converted or compared to either a block or a chunk. + */ +class BlockSegment { + /** + * Offset to the start of this block within the file. + */ + private final long position; + + /** + * Starting coordinate within this segment. + */ + private final long blockStart; + + /** + * Last coordinate within this segment. + */ + private final long blockStop; + + /** + * Create a new block segment from the given block. + * @param block the block to use as the starting point for this segment. + */ + public BlockSegment(Block block) { + this(block.position,0,block.uncompressedBlockSize-1); + } + + BlockSegment(final long position,final long blockStart,final long blockStop) { + this.position = position; + this.blockStart = blockStart; + this.blockStop = blockStop; + } + + /** + * The size of the block segment. + * @return Number of bytes represented by this block segment. + */ + public long size() { + return blockStop - blockStart + 1; + } + + /** + * Convert the given segment to a chunk. + * @return the chunk equivalent of this block. + */ + public Chunk toChunk() { + return new Chunk(position << 16 & blockStart,position << 16 & blockStop); + } + + /** + * Does this block segment overlap the given chunk? + * @param chunk The chunk to test. + * @return true if the chunk is w/i the block; false otherwise. + */ + public boolean overlaps(Chunk chunk) { + Chunk segmentAsChunk = toChunk(); + + // Chunk is completely encompasses the given block. + if(chunk.getChunkStart() < segmentAsChunk.getChunkStart() && + chunk.getChunkEnd() > segmentAsChunk.getChunkEnd()) + return true; + + // Chunk starts w/i block. + if(chunk.getChunkStart() >= segmentAsChunk.getChunkStart() && + chunk.getChunkStart() <= segmentAsChunk.getChunkEnd()) + return true; + + // Chunk ends w/i block. + if(chunk.getChunkEnd() >= segmentAsChunk.getChunkStart() && + chunk.getChunkEnd() <= segmentAsChunk.getChunkEnd()) + return true; + + return false; + } + + /** + * Subtracts the given chunk from the block segment. + * @param chunk The chunk to subtract. + * @return The block segment(s) derived from subtracting the chunk. + */ + public List minus(Chunk chunk) { + // If no overlap exists, just return a new instance of the current block segment. + if(!overlaps(chunk)) + return Collections.singletonList(this); + + List differenceSegments = new ArrayList(); + Chunk segmentAsChunk = toChunk(); + + // If the segment begins before the chunk, build out a segment from the start of the block to just before + // the start of the chunk or the end of the block segment, whichever comes first. + if(segmentAsChunk.getChunkStart() < chunk.getChunkStart()) { + long segmentStop = (position == chunk.getChunkStart()>>16) ? (chunk.getChunkStart()&0xFFFF)-1 : blockStop; + differenceSegments.add(new BlockSegment(position,blockStart,segmentStop)); + } + + // If the segment ends after the chunk, build out a segment from either after the end of the chunk or the + // start of the block segment, whichever comes last. + if(segmentAsChunk.getChunkEnd() > chunk.getChunkEnd()) { + long segmentStart = (position == chunk.getChunkEnd()>>16) ? (chunk.getChunkEnd()&0xFFFF)+1 : blockStart; + differenceSegments.add(new BlockSegment(position,segmentStart,blockStop)); + } + + return differenceSegments; + } +} diff --git a/java/src/net/sf/samtools/BlockTestHarness.java b/java/src/net/sf/samtools/BlockTestHarness.java index cc0d60757..52386b282 100644 --- a/java/src/net/sf/samtools/BlockTestHarness.java +++ b/java/src/net/sf/samtools/BlockTestHarness.java @@ -2,6 +2,7 @@ package net.sf.samtools; import java.io.File; import java.io.IOException; +import java.util.Collections; /** * Test harness for playing with sharding by BGZF block. @@ -30,6 +31,14 @@ public class BlockTestHarness { Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation(); System.out.printf("Header location = %s%n", headerLocation); + BAMChunkIterator chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile), + Collections.singletonList(headerLocation)); + while(chunkIterator.hasNext()) { + Chunk chunk = chunkIterator.next(); + } + + // The following test code blocks a bunch of files together. + /* BAMBlockIterator blockIterator = new BAMBlockIterator(bamFile); long blockCount = 0; @@ -37,10 +46,11 @@ public class BlockTestHarness { while(blockIterator.hasNext()) { Block block = blockIterator.next(); blockCount++; - //System.out.println(block); + System.out.println(block); } long endTime = System.currentTimeMillis(); System.out.printf("Number of chunks: %d; elapsed time: %dms%n", blockCount, endTime-startTime); + */ } } diff --git a/java/src/net/sf/samtools/Chunk.java b/java/src/net/sf/samtools/Chunk.java index aacbd89e1..1d6abdc2a 100644 --- a/java/src/net/sf/samtools/Chunk.java +++ b/java/src/net/sf/samtools/Chunk.java @@ -1,5 +1,10 @@ package net.sf.samtools; +import net.sf.picard.PicardException; + +import java.util.List; +import java.util.ArrayList; + /** * Represents a chunk stolen from the BAM file. Originally a private static inner class within * BAMFileIndex; now breaking it out so that the sharding system can use it. @@ -33,6 +38,43 @@ class Chunk implements Comparable { mChunkEnd = value; } + /** + * The list of chunks is often represented as an array of + * longs where every even-numbered index is a start coordinate + * and every odd-numbered index is a stop coordinate. Convert + * from that format back to a list of chunks. + * @param coordinateArray List of chunks to convert. + * @return A list of chunks. + */ + public static List toChunkList(long[] coordinateArray) { + if(coordinateArray.length % 2 != 0) + throw new PicardException("Data supplied does not appear to be in coordinate array format."); + + // TODO: possibly also check for monotonically increasing; this seems to be an implicit requirement of this format. + List chunkList = new ArrayList(); + for(int i = 0; i < coordinateArray.length; i += 2) + chunkList.add(new Chunk(coordinateArray[i],coordinateArray[i+1])); + + return chunkList; + } + + /** + * The list of chunks is often represented as an array of + * longs where every even-numbered index is a start coordinate + * and every odd-numbered index is a stop coordinate. + * @param chunks List of chunks to convert. + * @return A long array of the format described above. + */ + public static long[] toCoordinateArray(List chunks) { + long[] coordinateArray = new long[chunks.size()*2]; + int position = 0; + for(Chunk chunk: chunks) { + coordinateArray[position++] = chunk.getChunkStart(); + coordinateArray[position++] = chunk.getChunkEnd(); + } + return coordinateArray; + } + public int compareTo(final Chunk chunk) { int result = Long.signum(mChunkStart - chunk.mChunkStart); if (result == 0) { diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java new file mode 100644 index 000000000..4d8033fd9 --- /dev/null +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -0,0 +1,522 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.IOUtil; +import net.sf.samtools.util.RuntimeIOException; +import net.sf.samtools.SAMFileReader.ReaderImplementation; +import net.sf.samtools.SAMFileReader.ValidationStringency; +import net.sf.picard.PicardException; + +import java.io.*; +import java.util.zip.GZIPInputStream; +import java.util.List; +import java.net.URL; + + +/** + * Class for reading and querying SAM/BAM files. Delegates to appropriate concrete implementation. + */ +public class SAMFileReader2 implements Iterable { + + private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY; + + public static ValidationStringency getDefaultValidationStringency() { + return defaultValidationStringency; + } + + /** + * Set validation stringency for all subsequently-created SAMFileReaders. This is the only way to + * change the validation stringency for SAM header. + */ + public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) { + SAMFileReader2.defaultValidationStringency = defaultValidationStringency; + } + + private boolean mIsBinary = false; + private BAMFileIndex mFileIndex = null; + private ReaderImplementation mReader = null; + private File samFile = null; + + /** + * Internal interface for SAM/BAM file reader implementations. + * Implemented as an abstract class to enforce better access control. + */ + static abstract class ReaderImplementation2 extends SAMFileReader.ReaderImplementation { + abstract CloseableIterator getIterator(List chunks); + } + + /** + * Prepare to read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream. + */ + public SAMFileReader2(final InputStream stream) { + this(stream, false); + } + + /** + * Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file + * that is named according to the convention, it will be found and opened, and indexed query will be allowed. + */ + public SAMFileReader2(final File file) { + this(file, null, false); + } + + /** + * Prepare to read a SAM or BAM file. If the given file is a BAM, and an index is present, indexed query + * will be allowed. + * + * @param file SAM or BAM to read. + * @param indexFile Index file that is companion to BAM, or null if no index file, or if index file + * should be found automatically. + */ + public SAMFileReader2(final File file, final File indexFile) { + this(file, indexFile, false); + } + + /** + * Read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream. + * + * @param stream input SAM or BAM. + * @param eagerDecode if true, decode SAM record entirely when reading it. + */ + public SAMFileReader2(final InputStream stream, final boolean eagerDecode) { + init(stream, eagerDecode, defaultValidationStringency); + } + + /** + * Read a SAM or BAM file, possibly with an index file if present. + * If the given file is a BAM, and an index is present, indexed query will be allowed. + * + * @param file SAM or BAM. + * @param eagerDecode if true, decode SAM record entirely when reading it. + */ + public SAMFileReader2(final File file, final boolean eagerDecode) { + init(file, null, eagerDecode, defaultValidationStringency); + } + + /** + * Read a SAM or BAM file, possibly with an index file. If the given file is a BAM, and an index is present, + * indexed query will be allowed. + * + * @param file SAM or BAM. + * @param indexFile Location of index file, or null in order to use the default index file (if present). + * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it. + */ + public SAMFileReader2(final File file, final File indexFile, final boolean eagerDecode){ + init(file, indexFile, eagerDecode, defaultValidationStringency); + } + + /** + * Read a BAM file by http + * indexed query will be allowed. + * + * @param url BAM. + * @param indexFile Location of index file, or null in order to use the default index file (if present). + * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it. + */ + public SAMFileReader2(final URL url, final File indexFile, final boolean eagerDecode) { + init(url, indexFile, eagerDecode, defaultValidationStringency); + } + + public void close() { + if (mReader != null) { + mReader.close(); + } + if (mFileIndex != null) { + mFileIndex.close(); + } + mReader = null; + mFileIndex = null; + } + + /** + * @return True if this is a BAM reader. + */ + public boolean isBinary() { + return mIsBinary; + } + + /** + * @return true if ths is a BAM file, and has an index + */ + public boolean hasIndex() { + return (mFileIndex != null); + } + + public SAMFileHeader getFileHeader() { + return mReader.getFileHeader(); + } + + /** + * Control validation of SAMRecords as they are read from file. + * In order to control validation stringency for SAM Header, call SAMFileReader.setDefaultValidationStringency + * before constructing a SAMFileReader. + */ + public void setValidationStringency(final ValidationStringency validationStringency) { + mReader.setValidationStringency(validationStringency); + } + + /** + * Iterate through file in order. For a SAMFileReader constructed from an InputStream, and for any SAM file, + * a 2nd iteration starts where the 1st one left off. For a BAM constructed from a File, each new iteration + * starts at the first record. + *

+ * Only a single open iterator on a SAM or BAM file may be extant at any one time. If you want to start + * a second iteration, the first one must be closed first. + */ + public CloseableIterator iterator() { + return mReader.getIterator(); + } + + /** + * Iterate through the given chunks in the file. + * @param chunks List of chunks for which to retrieve data. + * @return An iterator over the given chunks. + */ + public CloseableIterator iterator(List chunks) { + // TODO: Add sanity checks so that we're not doing this against a BAM file. + if(!(mReader instanceof ReaderImplementation2)) + throw new PicardException("This call requires a ReaderImplementation2-compliant interface"); + return ((ReaderImplementation2)mReader).getIterator(chunks); + } + + /** + * Iterate over records that match the given interval. Only valid to call this if hasIndex() == true. + *

+ * Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start + * a second iteration, the first one must be closed first. You can use a second SAMFileReader to iterate + * in parallel over the same underlying file. + *

+ * Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read + * and then discarded because they do not match the interval of interest. + *

+ * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that + * is in the query region. + * + * @param sequence Reference sequence of interest. + * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. + * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. + * @param contained If true, each SAMRecord returned is will have its alignment completely contained in the + * interval of interest. If false, the alignment of the returned SAMRecords need only overlap the interval of interest. + * @return Iterator over the SAMRecords matching the interval. + */ + public CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + return mReader.query(sequence, start, end, contained); + } + + /** + * Iterate over records that overlap the given interval. Only valid to call this if hasIndex() == true. + *

+ * Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start + * a second iteration, the first one must be closed first. + *

+ * Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read + * and then discarded because they do not match the interval of interest. + *

+ * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that + * is in the query region. + * + * @param sequence Reference sequence of interest. + * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. + * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. + * @return Iterator over the SAMRecords overlapping the interval. + */ + public CloseableIterator queryOverlapping(final String sequence, final int start, final int end) { + return query(sequence, start, end, false); + } + + /** + * Iterate over records that are contained in the given interval. Only valid to call this if hasIndex() == true. + *

+ * Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start + * a second iteration, the first one must be closed first. + *

+ * Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read + * and then discarded because they do not match the interval of interest. + *

+ * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that + * is in the query region. + * + * @param sequence Reference sequence of interest. + * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. + * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. + * @return Iterator over the SAMRecords contained in the interval. + */ + public CloseableIterator queryContained(final String sequence, final int start, final int end) { + return query(sequence, start, end, true); + } + + public CloseableIterator queryUnmapped() { + return mReader.queryUnmapped(); + } + + /** + * Iterate over records that map to the given sequence and start at the given position. Only valid to call this if hasIndex() == true. + *

+ * Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start + * a second iteration, the first one must be closed first. + *

+ * Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read + * and then discarded because they do not match the interval of interest. + *

+ * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that + * matches the arguments. + * + * @param sequence Reference sequence of interest. + * @param start Alignment start of interest. + * @return Iterator over the SAMRecords with the given alignment start. + */ + public CloseableIterator queryAlignmentStart(final String sequence, final int start) { + return mReader.queryAlignmentStart(sequence, start); + } + + /** + * Fetch the mate for the given read. Only valid to call this if hasIndex() == true. + * This will work whether the mate has a coordinate or not, so long as the given read has correct + * mate information. This method iterates over the SAM file, so there may not be an unclosed + * iterator on the SAM file when this method is called. + * + * @param rec Record for which mate is sought. Must be a paired read. + * @return rec's mate, or null if it cannot be found. + */ + public SAMRecord queryMate(final SAMRecord rec) { + if (!rec.getReadPairedFlag()) { + throw new IllegalArgumentException("queryMate called for unpaired read."); + } + if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) { + throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both."); + } + final boolean firstOfPair = rec.getFirstOfPairFlag(); + final CloseableIterator it; + if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + it = queryUnmapped(); + } else { + it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart()); + } + try { + SAMRecord mateRec = null; + while (it.hasNext()) { + final SAMRecord next = it.next(); + if (!next.getReadPairedFlag()) { + if (rec.getReadName().equals(next.getReadName())) { + throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName()); + } + continue; + } + if (firstOfPair) { + if (next.getFirstOfPairFlag()) continue; + } else { + if (next.getSecondOfPairFlag()) continue; + } + if (rec.getReadName().equals(next.getReadName())) { + if (mateRec != null) { + throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() + + " for " + (firstOfPair ? "second" : "first") + " end."); + } + mateRec = next; + } + } + return mateRec; + } finally { + it.close(); + } + } + + private void init(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency) { + + try { + final BufferedInputStream bufferedStream = IOUtil.toBufferedStream(stream); + if (isBAMFile(bufferedStream)) { + mIsBinary = true; + mReader = new BAMFileReader2(bufferedStream, eagerDecode, validationStringency); + } else if (isGzippedSAMFile(bufferedStream)) { + mIsBinary = false; + mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency); + } else if (isSAMFile(bufferedStream)) { + mIsBinary = false; + mReader = new SAMTextReader(bufferedStream, validationStringency); + } else { + throw new SAMFormatException("Unrecognized file format"); + } + setValidationStringency(validationStringency); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * @param url + * @param indexFile + * @param eagerDecode + */ + private void init(final URL url, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) { + + try { + // Its too expensive to examine the remote file to determine type. + // Rely on file extension. + if (url.toString().toLowerCase().endsWith(".bam")) { + mIsBinary = true; + final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency); + mReader = reader; + if (indexFile != null) { + mFileIndex = new BAMFileIndex(indexFile); + reader.setFileIndex(mFileIndex); + } + } else { + throw new SAMFormatException("Unrecognized file format: " + url); + } + setValidationStringency(validationStringency); + } + catch (IOException e) { + throw new RuntimeIOException(e); + } + + } + + + private void init(final File file, File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) { + this.samFile = file; + + try { + final BufferedInputStream bufferedStream = new BufferedInputStream(new FileInputStream(file)); + if (isBAMFile(bufferedStream)) { + bufferedStream.close(); + mIsBinary = true; + final BAMFileReader2 reader = new BAMFileReader2(file, eagerDecode, validationStringency); + mReader = reader; + if (indexFile == null) { + indexFile = findIndexFile(file); + } + if (indexFile != null) { + mFileIndex = new BAMFileIndex(indexFile); + reader.setFileIndex(mFileIndex); + if (indexFile.lastModified() < file.lastModified()) { + System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() + + " is older than BAM " + file.getAbsolutePath()); + } + } + } else if (isGzippedSAMFile(bufferedStream)) { + mIsBinary = false; + mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency); + } else if (isSAMFile(bufferedStream)) { + if (indexFile != null) { + bufferedStream.close(); + throw new RuntimeException("Cannot use index file with textual SAM file"); + } + mIsBinary = false; + mReader = new SAMTextReader(bufferedStream, file, validationStringency); + } else { + bufferedStream.close(); + throw new SAMFormatException("Unrecognized file format"); + } + setValidationStringency(validationStringency); + } + catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + + /** + * Look for BAM index file according to standard naming convention. + * + * @param dataFile BAM file name. + * @return Index file name, or null if not found. + */ + private File findIndexFile(final File dataFile) { + // If input is foo.bam, look for foo.bai + final String bamExtension = ".bam"; + File indexFile; + final String fileName = dataFile.getName(); + if (fileName.endsWith(bamExtension)) { + final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + ".bai"; + indexFile = new File(dataFile.getParent(), bai); + if (indexFile.exists()) { + return indexFile; + } + } + + // If foo.bai doesn't exist look for foo.bam.bai + indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + /** + * @param stream stream.markSupported() must be true + * @return true if this looks like a BAM file. + */ + private boolean isBAMFile(final InputStream stream) + throws IOException { + return BlockCompressedInputStream.isValidFile(stream); + } + + /** + * Attempts to check whether the file is a gzipped sam file. Returns true if it + * is and false otherwise. + */ + private boolean isGzippedSAMFile(final BufferedInputStream stream) { + if (!stream.markSupported()) { + throw new IllegalArgumentException("Cannot test a stream that doesn't support marking."); + } + stream.mark(8000); + + try { + final GZIPInputStream gunzip = new GZIPInputStream(stream); + final int ch = gunzip.read(); + return true; + } + catch (IOException ioe) { + return false; + } + finally { + try { + stream.reset(); + } + catch (IOException ioe) { + throw new IllegalStateException("Could not reset stream."); + } + } + } + + private boolean isSAMFile(final InputStream stream) { + // For now, assume every non-binary file is a SAM text file. + return true; + } + + @Override + public String toString() { + if (this.samFile == null) { + return getClass().getSimpleName() + "{initialized with stream}"; + } else { + return getClass().getSimpleName() + "{" + this.samFile.getAbsolutePath() + "}"; + } + } +}