diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java new file mode 100644 index 000000000..5005b6265 --- /dev/null +++ b/public/java/src/net/sf/samtools/BAMFileReader.java @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Internal class for reading and querying BAM files. + */ +class BAMFileReader extends SAMFileReader.ReaderImplementation { + // True if reading from a File rather than an InputStream + private boolean mIsSeekable = false; + + // For converting bytes into other primitive types + private BinaryCodec mStream = null; + + // Underlying compressed data stream. + private final BAMInputStream mInputStream; + private SAMFileHeader mFileHeader = null; + + // Populated if the file is seekable and an index exists + private File mIndexFile; + private BAMIndex mIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + + // If true, all SAMRecords are fully decoded as they are read. + private final boolean eagerDecode; + + // For error-checking. + private ValidationStringency mValidationStringency; + + // For creating BAMRecords + private SAMRecordFactory samRecordFactory; + + /** + * Use the caching index reader implementation rather than the disk-hit-per-file model. + */ + private boolean mEnableIndexCaching = false; + + /** + * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. + */ + private boolean mEnableIndexMemoryMapping = true; + + /** + * Add information about the origin (reader and position) to SAM records. + */ + private SAMFileReader mFileReader = null; + + /** + * Prepare to read BAM from a stream (not seekable) + * @param stream source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final InputStream stream, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = false; + mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(null); + } + + /** + * Prepare to read BAM from a file (seekable) + * @param file source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final File file, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); + if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { + System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + + " is older than BAM " + file.getAbsolutePath()); + } + } + + BAMFileReader(final SeekableStream strm, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm), + indexFile, + eagerDecode, + strm.getSource(), + validationStringency, + factory); + } + + private BAMFileReader(final BAMInputStream inputStream, + final File indexFile, + final boolean eagerDecode, + final String source, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = true; + mInputStream = inputStream; + mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(source); + mFirstRecordPointer = inputStream.getFilePointer(); + } + + /** + * If true, writes the source of every read into the source SAMRecords. + * @param enabled true to write source information into each SAMRecord. + */ + void enableFileSource(final SAMFileReader reader, final boolean enabled) { + this.mFileReader = enabled ? reader : null; + } + + /** + * If true, uses the caching version of the index reader. + * @param enabled true to write source information into each SAMRecord. + */ + public void enableIndexCaching(final boolean enabled) { + if(mIndex != null) + throw new SAMException("Unable to turn on index caching; index file has already been loaded."); + this.mEnableIndexCaching = enabled; + } + + /** + * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). + * This is slower but more scalable when accessing large numbers of BAM files sequentially. + * @param enabled True to use memory mapping, false to use regular I/O. + */ + public void enableIndexMemoryMapping(final boolean enabled) { + if (mIndex != null) { + throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); + } + this.mEnableIndexMemoryMapping = enabled; + } + + @Override void enableCrcChecking(final boolean enabled) { + this.mInputStream.setCheckCrcs(enabled); + } + + @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } + + /** + * @return true if ths is a BAM file, and has an index + */ + public boolean hasIndex() { + return (mIndexFile != null); + } + + /** + * Retrieves the index for the given file type. Ensure that the index is of the specified type. + * @return An index of the given type. + */ + public BAMIndex getIndex() { + if(mIndexFile == null) + throw new SAMException("No index is available for this BAM file."); + if(mIndex == null) + mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) + : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); + return mIndex; + } + + void close() { + if (mStream != null) { + mStream.close(); + } + if (mIndex != null) { + mIndex.close(); + } + mStream = null; + mFileHeader = null; + mIndex = null; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Set error-checking level for subsequent SAMRecord reads. + */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + SAMFileReader.ValidationStringency getValidationStringency() { + return this.mValidationStringency; + } + + /** + * Prepare to iterate through the SAMRecords in file order. + * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, + * that iterator must be closed before getIterator() can be called again. + * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to + * getIterator() begins its iteration where the last one left off. That is the best that can be + * done in that situation. + */ + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + @Override + CloseableIterator getIterator(final SAMFileSpan chunks) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!(chunks instanceof BAMFileSpan)) { + throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); + } + + // Create an iterator over the given chunk boundaries. + mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); + return mCurrentIterator; + } + + /** + * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know + * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, + * the rightmost bound is guaranteed to be after the last read in the file. + * @return An unbounded pointer to the first record in the BAM file. + */ + @Override + SAMFileSpan getFilePointerSpanningReads() { + return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); + } + + /** + * Prepare to iterate through the SAMRecords that match the given interval. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. + * A value of zero implies the start of the reference sequence. + * @param end A value of zero implies the end of the reference sequence. + * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval + * specified by start and end. If false, the SAMRecords need only overlap the interval. + * @return Iterator for the matching SAMRecords + */ + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords with the given alignment start. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Alignment start sought. + * @return Iterator for the matching SAMRecords. + */ + CloseableIterator queryAlignmentStart(final String sequence, final int start) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); + return mCurrentIterator; + } + + public CloseableIterator queryUnmapped() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + try { + final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); + if (startOfLastLinearBin != -1) { + mInputStream.seek(startOfLastLinearBin); + } else { + // No mapped reads in file, just start at the first read in file. + mInputStream.seek(mFirstRecordPointer); + } + mCurrentIterator = new BAMFileIndexUnmappedIterator(); + return mCurrentIterator; + } catch (IOException e) { + throw new RuntimeException("IOException seeking to unmapped reads", e); + } + } + + /** + * Reads the header from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private void readHeader(final String source) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(mValidationStringency); + mFileHeader = headerCodec.decode(new StringLineReader(textHeader), + source); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + mFileHeader.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + source); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + source); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(source)); + } + mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } + } + + /** + * Reads a single binary sequence record from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private SAMSequenceRecord readSequenceRecord(final String source) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); + } + + /** + * Iterator for non-indexed sequential iteration through all SAMRecords in file. + * Starting point of iteration is wherever current file position is when the iterator is constructed. + */ + private class BAMFileIterator implements CloseableIterator { + private SAMRecord mNextRecord = null; + private final BAMRecordCodec bamRecordCodec; + private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file + + BAMFileIterator() { + this(true); + } + + /** + * @param advance Trick to enable subclass to do more setup before advancing + */ + BAMFileIterator(final boolean advance) { + this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); + this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); + + if (advance) { + advance(); + } + } + + public void close() { + if (mCurrentIterator != null && this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public SAMRecord next() { + final SAMRecord result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + + if (mNextRecord != null) { + ++this.samRecordIndex; + // Because some decoding is done lazily, the record needs to remember the validation stringency. + mNextRecord.setValidationStringency(mValidationStringency); + + if (mValidationStringency != ValidationStringency.SILENT) { + final List validationErrors = mNextRecord.isValid(); + SAMUtils.processValidationErrors(validationErrors, + this.samRecordIndex, BAMFileReader.this.getValidationStringency()); + } + } + if (eagerDecode && mNextRecord != null) { + mNextRecord.eagerDecode(); + } + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + /** + * Read the next record from the input stream. + */ + SAMRecord getNextRecord() throws IOException { + final long startCoordinate = mInputStream.getFilePointer(); + final SAMRecord next = bamRecordCodec.decode(); + final long stopCoordinate = mInputStream.getFilePointer(); + + if(mFileReader != null && next != null) + next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); + + return next; + } + + /** + * @return The record that will be return by the next call to next() + */ + protected SAMRecord peek() { + return mNextRecord; + } + } + + /** + * Prepare to iterate through SAMRecords matching the target interval. + * @param sequence Desired reference sequence. + * @param start 1-based start of target interval, inclusive. + * @param end 1-based end of target interval, inclusive. + * @param queryType contained, overlapping, or starting-at query. + */ + private CloseableIterator createIndexIterator(final String sequence, + final int start, + final int end, + final QueryType queryType) { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + final SAMFileHeader fileHeader = getFileHeader(); + final int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMIndex fileIndex = getIndex(); + final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end); + filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; + } + + // Create an iterator over the above chunk boundaries. + final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); + + // Add some preprocessing filters for edge-case reads that don't fit into this + // query type. + return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); + } + + enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} + + /** + * Look for BAM index file according to standard naming convention. + * + * @param dataFile BAM file name. + * @return Index file name, or null if not found. + */ + private static File findIndexFile(final File dataFile) { + // If input is foo.bam, look for foo.bai + final String bamExtension = ".bam"; + File indexFile; + final String fileName = dataFile.getName(); + if (fileName.endsWith(bamExtension)) { + final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix; + indexFile = new File(dataFile.getParent(), bai); + if (indexFile.exists()) { + return indexFile; + } + } + + // If foo.bai doesn't exist look for foo.bam.bai + indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + private class BAMFileIndexIterator extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + + /** + * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. + * @param filePointers the block / offset combination, stored in chunk format. + */ + BAMFileIndexIterator(final long[] filePointers) { + super(false); // delay advance() until after construction + mFilePointers = filePointers; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + // Advance to next file block if necessary + while (mInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + return super.getNextRecord(); + } + } + + /** + * A decorating iterator that filters out records that are outside the bounds of the + * given query parameters. + */ + private class BAMQueryFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private final CloseableIterator wrappedIterator; + + /** + * The next record to be returned. Will be null if no such record exists. + */ + private SAMRecord mNextRecord; + + private final int mReferenceIndex; + private final int mRegionStart; + private final int mRegionEnd; + private final QueryType mQueryType; + + public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { + this.wrappedIterator = iterator; + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + mRegionStart = start; + if (queryType == QueryType.STARTING_AT) { + mRegionEnd = mRegionStart; + } else { + mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; + } + mQueryType = queryType; + mNextRecord = advance(); + } + + /** + * Returns true if a next element exists; false otherwise. + */ + public boolean hasNext() { + return mNextRecord != null; + } + + /** + * Gets the next record from the given iterator. + * @return The next SAM record in the iterator. + */ + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); + final SAMRecord currentRead = mNextRecord; + mNextRecord = advance(); + return currentRead; + } + + /** + * Closes down the existing iterator. + */ + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + SAMRecord advance() { + while (true) { + // Pull next record from stream + if(!wrappedIterator.hasNext()) + return null; + + final SAMRecord record = wrappedIterator.next(); + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + // If read is unmapped but has a coordinate, return it if the coordinate is within + // the query region, regardless of whether the mapped mate will be returned. + final int alignmentEnd; + if (mQueryType == QueryType.STARTING_AT) { + alignmentEnd = -1; + } else { + alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? + record.getAlignmentEnd(): alignmentStart); + } + + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + return null; + } + // Filter for overlap with region + if (mQueryType == QueryType.CONTAINED) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else if (mQueryType == QueryType.OVERLAPPING) { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } else { + if (alignmentStart == mRegionStart) { + return record; + } + } + } + } + } + + private class BAMFileIndexUnmappedIterator extends BAMFileIterator { + private BAMFileIndexUnmappedIterator() { + while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + advance(); + } + } + } + +} diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index 623f46291..4692c6671 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -25,6 +25,7 @@ package net.sf.samtools; import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan { super(); } + /** + * Create a new GATKBAMFileSpan from an existing BAMFileSpan. + * @param sourceFileSpan + */ + public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) { + if(!(sourceFileSpan instanceof BAMFileSpan)) + throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead"); + BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan; + for(Chunk chunk: sourceBAMFileSpan.getChunks()) + add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk)); + } + /** * Convenience constructor to construct a BAM file span from * a single chunk. diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index f590809e2..5d349e72e 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -69,6 +69,22 @@ public class GATKChunk extends Chunk { super.setChunkEnd(value); } + public long getBlockStart() { + return getChunkStart() >>> 16; + } + + public int getBlockOffsetStart() { + return (int)(getChunkStart() & 0xFFFF); + } + + public long getBlockEnd() { + return getChunkEnd() >>> 16; + } + + public int getBlockOffsetEnd() { + return ((int)getChunkEnd() & 0xFFFF); + } + /** * Computes an approximation of the uncompressed size of the * chunk, in bytes. Can be used to determine relative weights diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java new file mode 100644 index 000000000..d825c23d5 --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package net.sf.samtools.util; + +import java.io.IOException; + +/** + * An input stream formulated for use reading BAM files. Supports + */ +public interface BAMInputStream { + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) throws IOException; + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer(); + + /** + * Determines whether or not the inflater will re-calculated the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. + */ + public void setCheckCrcs(final boolean check); + + public int read() throws java.io.IOException; + + public int read(byte[] bytes) throws java.io.IOException; + + public int read(byte[] bytes, int i, int i1) throws java.io.IOException; + + public long skip(long l) throws java.io.IOException; + + public int available() throws java.io.IOException; + + public void close() throws java.io.IOException; + + public void mark(int i); + + public void reset() throws java.io.IOException; + + public boolean markSupported(); +} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java new file mode 100755 index 000000000..fae2fc89b --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java @@ -0,0 +1,483 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools.util; + + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import net.sf.samtools.FileTruncatedException; + +/* + * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. + * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. + * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the + * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. + * + * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format + */ +public class BlockCompressedInputStream extends InputStream implements BAMInputStream { + private InputStream mStream = null; + private SeekableStream mFile = null; + private byte[] mFileBuffer = null; + private byte[] mCurrentBlock = null; + private int mCurrentOffset = 0; + private long mBlockAddress = 0; + private int mLastBlockLength = 0; + private final BlockGunzipper blockGunzipper = new BlockGunzipper(); + + + /** + * Note that seek() is not supported if this ctor is used. + */ + public BlockCompressedInputStream(final InputStream stream) { + mStream = IOUtil.toBufferedStream(stream); + mFile = null; + } + + /** + * Use this ctor if you wish to call seek() + */ + public BlockCompressedInputStream(final File file) + throws IOException { + mFile = new SeekableFileStream(file); + mStream = null; + + } + + public BlockCompressedInputStream(final URL url) { + mFile = new SeekableBufferedStream(new SeekableHTTPStream(url)); + mStream = null; + } + + /** + * For providing some arbitrary data source. No additional buffering is + * provided, so if the underlying source is not buffered, wrap it in a + * SeekableBufferedStream before passing to this ctor. + */ + public BlockCompressedInputStream(final SeekableStream strm) { + mFile = strm; + mStream = null; + } + + /** + * Determines whether or not the inflater will re-calculated the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. + */ + public void setCheckCrcs(final boolean check) { + this.blockGunzipper.setCheckCrcs(check); + } + + /** + * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the + * next caller of a method for this input stream. The next caller might be the same thread or another thread. + * Note that although the next caller can read this many bytes without blocking, the available() method call itself + * may block in order to fill an internal buffer if it has been exhausted. + */ + public int available() + throws IOException { + if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { + readBlock(); + } + if (mCurrentBlock == null) { + return 0; + } + return mCurrentBlock.length - mCurrentOffset; + } + + /** + * Closes the underlying InputStream or RandomAccessFile + */ + public void close() + throws IOException { + if (mFile != null) { + mFile.close(); + mFile = null; + } else if (mStream != null) { + mStream.close(); + mStream = null; + } + // Encourage garbage collection + mFileBuffer = null; + mCurrentBlock = null; + } + + /** + * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255. + * If no byte is available because the end of the stream has been reached, the value -1 is returned. + * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown. + + * @return the next byte of data, or -1 if the end of the stream is reached. + */ + public int read() + throws IOException { + return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1; + } + + /** + * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes + * actually read is returned as an integer. This method blocks until input data is available, end of file is detected, + * or an exception is thrown. + * + * read(buf) has the same effect as read(buf, 0, buf.length). + * + * @param buffer the buffer into which the data is read. + * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of + * the stream has been reached. + */ + public int read(final byte[] buffer) + throws IOException { + return read(buffer, 0, buffer.length); + } + + private volatile ByteArrayOutputStream buf = null; + private static final byte eol = '\n'; + private static final byte eolCr = '\r'; + + /** + * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'), + * carriage return ('\r') or carriage return followed by a line feed ("\r\n"). + * + * @return A String containing the contents of the line, excluding the line terminating + * character, or null if the end of the stream has been reached + * + * @exception IOException If an I/O error occurs + * @ + */ + public String readLine() throws IOException { + int available = available(); + if (available == 0) { + return null; + } + if(null == buf){ // lazy initialisation + buf = new ByteArrayOutputStream(8192); + } + buf.reset(); + boolean done = false; + boolean foundCr = false; // \r found flag + while (!done) { + int linetmpPos = mCurrentOffset; + int bCnt = 0; + while((available-- > 0)){ + final byte c = mCurrentBlock[linetmpPos++]; + if(c == eol){ // found \n + done = true; + break; + } else if(foundCr){ // previous char was \r + --linetmpPos; // current char is not \n so put it back + done = true; + break; + } else if(c == eolCr){ // found \r + foundCr = true; + continue; // no ++bCnt + } + ++bCnt; + } + if(mCurrentOffset < linetmpPos){ + buf.write(mCurrentBlock, mCurrentOffset, bCnt); + mCurrentOffset = linetmpPos; + } + available = available(); + if(available == 0){ + // EOF + done = true; + } + } + return buf.toString(); + } + + /** + * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read + * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer. + * + * This method blocks until input data is available, end of file is detected, or an exception is thrown. + * + * @param buffer buffer into which data is read. + * @param offset the start offset in array b at which the data is written. + * @param length the maximum number of bytes to read. + * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of + * the stream has been reached. + */ + public int read(final byte[] buffer, int offset, int length) + throws IOException { + final int originalLength = length; + while (length > 0) { + final int available = available(); + if (available == 0) { + // Signal EOF to caller + if (originalLength == length) { + return -1; + } + break; + } + final int copyLength = Math.min(length, available); + System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); + mCurrentOffset += copyLength; + offset += copyLength; + length -= copyLength; + } + return originalLength - length; + } + + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) + throws IOException { + if (mFile == null) { + throw new IOException("Cannot seek on stream based file"); + } + // Decode virtual file pointer + // Upper 48 bits is the byte offset into the compressed stream of a block. + // Lower 16 bits is the byte offset into the uncompressed stream inside the block. + final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); + final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); + final int available; + if (mBlockAddress == compressedOffset && mCurrentBlock != null) { + available = mCurrentBlock.length; + } else { + mFile.seek(compressedOffset); + mBlockAddress = compressedOffset; + mLastBlockLength = 0; + readBlock(); + available = available(); + } + if (uncompressedOffset > available || + (uncompressedOffset == available && !eof())) { + throw new IOException("Invalid file pointer: " + pos); + } + mCurrentOffset = uncompressedOffset; + } + + private boolean eof() throws IOException { + if (mFile.eof()) { + return true; + } + // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. + return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + } + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer() { + if (mCurrentOffset == mCurrentBlock.length) { + // If current offset is at the end of the current block, file pointer should point + // to the beginning of the next block. + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); + } + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); + } + + public static long getFileBlock(final long bgzfOffset) { + return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); + } + + /** + * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). + * @return true if the given file looks like a valid BGZF file. + */ + public static boolean isValidFile(final InputStream stream) + throws IOException { + if (!stream.markSupported()) { + throw new RuntimeException("Cannot test non-buffered stream"); + } + stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + stream.reset(); + return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); + } + + private static boolean isValidBlockHeader(final byte[] buffer) { + return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && + (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && + (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && + buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && + buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && + buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); + } + + private void readBlock() + throws IOException { + + if (mFileBuffer == null) { + mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; + } + int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + if (count == 0) { + // Handle case where there is no empty gzip block at end. + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mCurrentBlock = new byte[0]; + return; + } + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + throw new IOException("Premature end of file"); + } + final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; + if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { + throw new IOException("Unexpected compressed block length: " + blockLength); + } + final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; + count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); + if (count != remaining) { + throw new FileTruncatedException("Premature end of file"); + } + inflateBlock(mFileBuffer, blockLength); + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mLastBlockLength = blockLength; + } + + private void inflateBlock(final byte[] compressedBlock, final int compressedLength) + throws IOException { + final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); + byte[] buffer = mCurrentBlock; + mCurrentBlock = null; + if (buffer == null || buffer.length != uncompressedLength) { + try { + buffer = new byte[uncompressedLength]; + } catch (NegativeArraySizeException e) { + throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); + } + } + blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); + mCurrentBlock = buffer; + } + + private int readBytes(final byte[] buffer, final int offset, final int length) + throws IOException { + if (mFile != null) { + return readBytes(mFile, buffer, offset, length); + } else if (mStream != null) { + return readBytes(mStream, buffer, offset, length); + } else { + return 0; + } + } + + private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = file.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + private int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } + + public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} + + public static FileTermination checkTermination(final File file) + throws IOException { + final long fileSize = file.length(); + if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { + return FileTermination.DEFECTIVE; + } + final RandomAccessFile raFile = new RandomAccessFile(file, "r"); + try { + raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; + raFile.readFully(buf); + if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { + return FileTermination.HAS_TERMINATOR_BLOCK; + } + final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); + buf = new byte[bufsize]; + raFile.seek(fileSize - bufsize); + raFile.read(buf); + for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; + i >= 0; --i) { + if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE, + buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) { + continue; + } + final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF; + if (buf.length - i == totalBlockSizeMinusOne + 1) { + return FileTermination.HAS_HEALTHY_LAST_BLOCK; + } else { + return FileTermination.DEFECTIVE; + } + } + return FileTermination.DEFECTIVE; + } finally { + raFile.close(); + } + } + + private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) { + for (int i = 0; i < length; ++i) { + if (preamble[i] != buf[i + startOffset]) { + return false; + } + } + return true; + } +} + + diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index bed1e710e..b0b57f7fc 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -331,12 +331,12 @@ public abstract class CommandLineProgram { * used to indicate an error occured * * @param msg the message - * @param e the error + * @param t the error */ - public static void exitSystemWithError(String msg, final Exception e) { + public static void exitSystemWithError(String msg, final Throwable t) { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("stack trace %n"); - e.printStackTrace(); + t.printStackTrace(); errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); @@ -394,8 +394,8 @@ public abstract class CommandLineProgram { * * @param e the exception occured */ - public static void exitSystemWithError(Exception e) { - exitSystemWithError(e.getMessage(), e); + public static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index b8488dc9a..d3db35c07 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -99,8 +99,8 @@ public class CommandLineGATK extends CommandLineExecutable { } catch (net.sf.samtools.SAMException e) { // Let's try this out and see how it is received by our users exitSystemWithSamError(e); - } catch (Exception e) { - exitSystemWithError(e); + } catch (Throwable t) { + exitSystemWithError(t); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 2ceb4ab46..f2e0b5d0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; @@ -126,6 +127,11 @@ public class GenomeAnalysisEngine { */ private Collection filters; + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + /** * A currently hacky unique name for this GATK instance */ @@ -199,6 +205,9 @@ public class GenomeAnalysisEngine { if (this.getArguments().nonDeterministicRandomSeed) resetRandomGenerator(System.currentTimeMillis()); + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + // Prepare the data for traversal. initializeDataSources(); @@ -218,7 +227,7 @@ public class GenomeAnalysisEngine { // create the output streams " initializeOutputStreams(microScheduler.getOutputTracker()); - ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); // execute the microscheduler, storing the results return microScheduler.execute(this.walker, shardStrategy); @@ -266,6 +275,16 @@ public class GenomeAnalysisEngine { return Collections.unmodifiableList(filters); } + /** + * Parse out the thread allocation from the given command-line argument. + */ + private void determineThreadAllocation() { + Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null; + Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null; + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads); + } + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. @@ -286,7 +305,7 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); } - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); } protected DownsamplingMethod getDownsamplingMethod() { @@ -397,103 +416,49 @@ public class GenomeAnalysisEngine { * @param intervals intervals * @return the sharding strategy */ - protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original - // sharding system; it's required with the new sharding system only for locus walkers. - if(readsDataSource != null && !readsDataSource.hasIndex() ) { - if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - Shard.ShardType shardType; if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - shardType = Shard.ShardType.LOCUS; + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); } - else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) - shardType = Shard.ShardType.READ; else - throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files"); - - List region; - if(intervals != null) - region = intervals.toList(); - else { - region = new ArrayList(); - for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences()) - region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength())); - } - - return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region); + throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); } - - ShardStrategy shardStrategy; - ShardStrategyFactory.SHATTER_STRATEGY shardType; - - long SHARD_SIZE = 100000L; - - if (walker instanceof LocusWalker) { - if (walker instanceof RodWalker) SHARD_SIZE *= 1000; - - if (intervals != null && !intervals.isEmpty()) { - if (readsDataSource == null) - throw new IllegalArgumentException("readsDataSource is null"); - if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE,getGenomeLocParser()); - } else if (walker instanceof ReadWalker || - walker instanceof DuplicateWalker) { - shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; - - if (intervals != null && !intervals.isEmpty()) { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } - } else if (walker instanceof ReadPairWalker) { - if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } else - throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName()); - - return shardStrategy; } protected boolean flashbackData() { @@ -751,6 +716,8 @@ public class GenomeAnalysisEngine { return new SAMDataSource( samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, genomeLocParser, argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, @@ -763,8 +730,7 @@ public class GenomeAnalysisEngine { getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, - argCollection.defaultBaseQualities, - !argCollection.disableLowMemorySharding); + argCollection.defaultBaseQualities); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8078a1ea4..64b63dcd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -194,10 +194,14 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false) - public int numberOfThreads = 1; + /** How many threads should be allocated to this analysis. */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) + public Integer numberOfThreads = 1; - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false) + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) + public Integer numberOfBAMFileHandles = null; + + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; // -------------------------------------------------------------------------------------------------------------- @@ -292,9 +296,6 @@ public class GATKArgumentCollection { @Hidden public boolean allowIntervalsWithUnindexedBAM = false; - @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false) - public boolean disableLowMemorySharding = false; - // -------------------------------------------------------------------------------------------------------------- // // methods @@ -365,7 +366,11 @@ public class GATKArgumentCollection { (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { return false; } - if (other.numberOfThreads != this.numberOfThreads) { + if (!other.numberOfThreads.equals(this.numberOfThreads)) { + return false; + } + if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) || + (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) { return false; } if (other.intervalMerging != this.intervalMerging) { @@ -389,9 +394,6 @@ public class GATKArgumentCollection { if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; - if (disableLowMemorySharding != other.disableLowMemorySharding) - return false; - return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java deleted file mode 100644 index de938e845..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 7, 2011 - * Time: 2:46:34 PM - * To change this template use File | Settings | File Templates. - */ -public class BAMBlockStartIterator implements Iterator { - /** - * How large is a BGZF header? - */ - private static int BGZF_HEADER_SIZE = 18; - - /** - * Where within the header does the BLOCKSIZE actually live? - */ - private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2; - - private FileChannel bamInputChannel; - private ByteBuffer headerByteBuffer; - - private long nextLocation = 0; - - public BAMBlockStartIterator(File bamFile) { - try { - FileInputStream bamInputStream = new FileInputStream(bamFile); - bamInputChannel = bamInputStream.getChannel(); - - headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE); - headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - } - catch(IOException ex) { - throw new StingException("Could not open file",ex); - } - } - - public boolean hasNext() { - return nextLocation != -1; - } - - public Long next() { - long currentLocation = nextLocation; - advance(); - return currentLocation; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator"); - } - - private void advance() { - int readStatus; - - headerByteBuffer.clear(); - try { - readStatus = bamInputChannel.read(headerByteBuffer); - } - catch(IOException ex) { - throw new StingException("Could not read header data",ex); - } - - if(readStatus == -1) { - nextLocation = -1; - try { - bamInputChannel.close(); - } - catch(IOException ex) { - throw new StingException("Could not close input file",ex); - } - return; - } - - headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION); - int blockSize = headerByteBuffer.getShort(); - - try { - bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1); - nextLocation = bamInputChannel.position(); - } - catch(IOException ex) { - throw new StingException("Could not reposition input stream",ex); - } - } - - public static void main(String argv[]) throws IOException { - BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam")); - int i = 0; - while(blockStartIterator.hasNext()) - System.out.printf("%d -> %d%n",i++,blockStartIterator.next()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java deleted file mode 100644 index 4d91fb45f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKChunk; -import net.sf.samtools.LinearIndex; - -import java.util.*; - -/** - * Represents the contents of a bam index file for one reference. - * A BAM index (.bai) file contains information for all references in the bam file. - * This class describes the data present in the index file for one of these references; - * including the bins, chunks, and linear index. - */ -class BAMIndexContent { - /** - * The reference sequence for the data currently loaded. - */ - private final int mReferenceSequence; - - /** - * A list of all bins in the above reference sequence. - */ - private final BinList mBinList; - - /** - * The linear index for the reference sequence above. - */ - private final LinearIndex mLinearIndex; - - - /** - * @param referenceSequence Content corresponds to this reference. - * @param bins Array of bins represented by this content, possibly sparse - * @param numberOfBins Number of non-null bins - * @param linearIndex Additional index used to optimize queries - */ - BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) { - this.mReferenceSequence = referenceSequence; - this.mBinList = new BinList(bins, numberOfBins); - this.mLinearIndex = linearIndex; - } - - /** - * Reference for this Content - */ - public int getReferenceSequence() { - return mReferenceSequence; - } - - /** - * Does this content have anything in this bin? - */ - public boolean containsBin(final GATKBin bin) { - return mBinList.getBin(bin.getBinNumber()) != null; - } - - /** - * @return iterable list of bins represented by this content - */ - public BinList getBins() { - return mBinList; - } - - /** - * @return the number of non-null bins represented by this content - */ - int getNumberOfNonNullBins() { - return mBinList.getNumberOfNonNullBins(); - } - - /** - * @return all chunks associated with all bins in this content - */ - public List getAllChunks() { - List allChunks = new ArrayList(); - for (GATKBin b : mBinList) - if (b.getChunkList() != null) { - allChunks.addAll(Arrays.asList(b.getChunkList())); - } - return Collections.unmodifiableList(allChunks); - } - - /** - * @return the linear index represented by this content - */ - public LinearIndex getLinearIndex() { - return mLinearIndex; - } - - /** - * This class is used to encapsulate the list of Bins store in the BAMIndexContent - * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure - */ - class BinList implements Iterable { - - private final GATKBin[] mBinArray; - public final int numberOfNonNullBins; - public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based - - /** - * @param binArray a sparse array representation of the bins. The index into the array is the bin number. - * @param numberOfNonNullBins - */ - BinList(GATKBin[] binArray, int numberOfNonNullBins) { - this.mBinArray = binArray; - this.numberOfNonNullBins = numberOfNonNullBins; - this.maxBinNumber = mBinArray.length - 1; - } - - GATKBin getBin(int binNumber) { - if (binNumber > maxBinNumber) return null; - return mBinArray[binNumber]; - } - - int getNumberOfNonNullBins() { - return numberOfNonNullBins; - } - - /** - * Gets an iterator over all non-null bins. - * - * @return An iterator over all bins. - */ - public Iterator iterator() { - return new BinIterator(); - } - - private class BinIterator implements Iterator { - /** - * Stores the bin # of the Bin currently in use. - */ - private int nextBin; - - public BinIterator() { - nextBin = 0; - } - - /** - * Are there more bins in this set, waiting to be returned? - * - * @return True if more bins are remaining. - */ - public boolean hasNext() { - while (nextBin <= maxBinNumber) { - if (getBin(nextBin) != null) return true; - nextBin++; - } - return false; - } - - /** - * Gets the next bin in the provided BinList. - * - * @return the next available bin in the BinList. - */ - public GATKBin next() { - if (!hasNext()) - throw new NoSuchElementException("This BinIterator is currently empty"); - GATKBin result = getBin(nextBin); - nextBin++; - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a bin iterator"); - } - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java deleted file mode 100644 index 15a372ca6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; - -import java.util.HashMap; -import java.util.Map; - -/** - * Models a bin at which all BAM files in the merged input stream overlap. - */ -class BAMOverlap { - public final int start; - public final int stop; - - private final Map bins = new HashMap(); - - public BAMOverlap(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - public void addBin(final SAMReaderID id, final Bin bin) { - bins.put(id,bin); - } - - public Bin getBin(final SAMReaderID id) { - return bins.get(id); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index 521bcd5a3..762722fcd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator { /** * Create a new BAM schedule based on the given index. - * @param indexFiles Index files. + * @param dataSource The SAM data source to use. * @param intervals List of */ - public BAMSchedule(final Map indexFiles, final List intervals) { + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); - referenceSequence = intervals.get(0).getContigIndex(); + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); createScheduleFile(); - readerIDs.addAll(indexFiles.keySet()); + readerIDs.addAll(dataSource.getReaderIDs()); for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = indexFiles.get(reader); + final GATKBAMIndex index = dataSource.getIndex(reader); final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); @@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator { if(selectedIterators.isEmpty()) return; + // Create the target schedule entry BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { PeekableIterator scheduleIterator = scheduleIterators.get(reader); BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); @@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator { scheduleIterator.next(); } + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + nextScheduleEntry = mergedScheduleEntry; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 47eb55b28..dca4cc771 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.*; @@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator { private FilePointer nextFilePointer = null; - private final GenomeLocSortedSet loci; + private GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; - private final PeekableIterator locusIterator; + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); + for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { + // Match only on sequence name; trust startup validation to make sure all the sequences match. + if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) + intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); + } + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } - private GenomeLoc currentLocus; + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } - public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } + + + private BAMScheduler(final SAMDataSource dataSource) { this.dataSource = dataSource; - for(SAMReaderID reader: dataSource.getReaderIDs()) - indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. + * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. + nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the fle. + * @return A file pointer over the specified region. + */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(); + Map currentPosition = dataSource.getCurrentPosition(); + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; } public boolean hasNext() { @@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator { if(!hasNext()) throw new NoSuchElementException("No next element available in interval sharder"); FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; advance(); + return currentFilePointer; } @@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator { if(loci.isEmpty()) return; - nextFilePointer = null; while(nextFilePointer == null && currentLocus != null) { // special case handling of the unmapped shard. if(currentLocus == GenomeLoc.UNMAPPED) { nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); currentLocus = null; continue; } @@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator { int coveredRegionStop = Integer.MAX_VALUE; GenomeLoc coveredRegion = null; - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); // No overlapping data at all. if(scheduleEntry != null) { @@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator { } else { // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); for(SAMReaderID reader: indexFiles.keySet()) nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); } @@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator { // Early exit if no bins were found. if(coveredRegion == null) { // for debugging only: maximum split is 16384. - if(currentLocus.size() > 16384) { - GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - } - else { - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - } + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; continue; } // Early exit if only part of the first interval was found. if(currentLocus.startsBefore(coveredRegion)) { - // for debugging only: maximum split is 16384. int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); GenomeLoc[] splitContigs = currentLocus.split(splitPoint); nextFilePointer.addLocation(splitContigs[0]); @@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator { /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM indices. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. + // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. + final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex(); + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { if(bamScheduleIterator != null) bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentLocus.getContigIndex(); + lastReferenceSequenceLoaded = currentContigIndex; // Naive algorithm: find all elements in current contig for proper schedule creation. List lociInContig = new LinkedList(); for(GenomeLoc locus: loci) { - if(locus.getContigIndex() == lastReferenceSequenceLoaded) + if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); } if(!bamScheduleIterator.hasNext()) @@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator { return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; } + /** + * Create a span from the given start point to the end of the file. + * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). + * @return A file span from the given point to the end of the file. + */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { + return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java new file mode 100644 index 000000000..f468d2020 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.LinkedList; +import java.util.Queue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Preloads BGZF blocks in preparation for unzipping and data processing. + * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should + * TODO: be replaced with a central thread management strategy. + */ +public class BGZFBlockLoadingDispatcher { + /** + * The file handle cache, used when allocating blocks from the dispatcher. + */ + private final FileHandleCache fileHandleCache; + + private final ExecutorService threadPool; + + private final Queue inputQueue; + + public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { + threadPool = Executors.newFixedThreadPool(numThreads); + fileHandleCache = new FileHandleCache(numFileHandles); + inputQueue = new LinkedList(); + + threadPool.execute(new BlockLoader(this,fileHandleCache,true)); + } + + /** + * Initiates a request for a new block load. + * @param readerPosition Position at which to load. + */ + void queueBlockLoad(final SAMReaderPosition readerPosition) { + synchronized(inputQueue) { + inputQueue.add(readerPosition); + inputQueue.notify(); + } + } + + /** + * Claims the next work request from the queue. + * @return The next work request, or null if none is available. + */ + SAMReaderPosition claimNextWorkRequest() { + synchronized(inputQueue) { + while(inputQueue.isEmpty()) { + try { + inputQueue.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item"); + } + } + return inputQueue.poll(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java new file mode 100644 index 000000000..e377f865d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BAMInputStream; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.RuntimeEOFException; +import net.sf.samtools.util.SeekableStream; +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends SeekableStream implements BAMInputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current position. + */ + private SAMReaderPosition position; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. + */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. + */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Has the buffer been filled since last request? + */ + private boolean bufferFilled = false; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.samFile.length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.samFile); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + if(buffer.remaining() > 0) { + // If there's data in the buffer, figure out from whence it came. + final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0; + final int blockOffset = buffer.position(); + filePointer = blockAddress << 16 | blockOffset; + } + else { + // Otherwise, find the next position to load. + filePointer = position.getBlockAddress() << 16; + } + } + + if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) + throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", + BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer), + BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()))); + + return filePointer; + } + + public void seek(long target) { + // TODO: Validate the seek point. + //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); + synchronized(lock) { + clearBuffers(); + position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target)); + waitForBufferFill(); + buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target)); + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(target); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + } + + private void clearBuffers() { + this.position.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. + buffer.clear(); + buffer.limit(0); + + blockOffsets.clear(); + blockPositions.clear(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. + return position != null && position.getBlockAddress() >= length; + } + } + + public void setCheckCrcs(final boolean check) { + // TODO: Implement + } + + /** + * Submits a new access plan for the given dataset. + * @param position The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final SAMReaderPosition position) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + synchronized(lock) { + // Assume that the access plan is going to tell us to start where we are and move forward. + // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. + if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) + position.advancePosition(this.position.getBlockAddress()); + } + this.position = position; + } + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first block in the series. + while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) { + bytesToRemove += blockOffsets.remove(); + blockPositions.remove(); + } + + // If we end up with an empty block at the end of the series, compact this as well. + if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + buffer.position(bytesToRemove); + buffer.compact(); + + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. + * @param position target position for the data. + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + synchronized(lock) { + try { + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Advance the position to take the most recent read into account. + long lastReadPosition = position.getBlockAddress(); + + byte[] validBytes = null; + if(validatingInputStream != null) { + validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(lastReadPosition << 16); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + this.position = position; + position.advancePosition(filePosition); + + if(buffer.remaining() < incomingBuffer.remaining()) { + //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + lock.wait(); + //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); + } + + // Queue list of block offsets / block positions. + blockOffsets.add(buffer.position()); + blockPositions.add(lastReadPosition); + + buffer.put(incomingBuffer); + + // Set up the buffer for reading. + buffer.flip(); + bufferFilled = true; + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + read(singleByte); + return singleByte[0]; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available. + if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. + if(length-remaining > 0) + lock.notify(); + } + + if(validatingInputStream != null) { + byte[] validBytes = new byte[length]; + try { + validatingInputStream.read(validBytes,offset,length); + for(int i = offset; i < offset+length; i++) { + if(bytes[i] != validBytes[i]) { + System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId()); + throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); + } + } + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + return length - remaining; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + bufferFilled = false; + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(position); + try { + lock.wait(); + } + catch(InterruptedException ex) { + // TODO: handle me. + throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); + } + + if(bufferFilled && buffer.remaining() == 0) + throw new RuntimeEOFException("No more data left in InputStream"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java new file mode 100644 index 000000000..ab4299802 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +/** + * An engine for loading blocks. + */ +class BlockLoader implements Runnable { + /** + * Coordinates the input queue. + */ + private BGZFBlockLoadingDispatcher dispatcher; + + /** + * A cache from which to retrieve open file handles. + */ + private final FileHandleCache fileHandleCache; + + /** + * Whether asynchronous decompression should happen. + */ + private final boolean decompress; + + /** + * An direct input buffer for incoming data from disk. + */ + private final ByteBuffer inputBuffer; + + public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) { + this.dispatcher = dispatcher; + this.fileHandleCache = fileHandleCache; + this.decompress = decompress; + + this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + inputBuffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public void run() { + for(;;) { + SAMReaderPosition readerPosition = null; + try { + readerPosition = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + + long blockAddress = readerPosition.getBlockAddress(); + //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); + + ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + long nextBlockAddress = position(inputStream); + fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + + ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock; + int bytesCopied = block.remaining(); + + BlockInputStream bamInputStream = readerPosition.getInputStream(); + bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + + //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); + } + catch(Throwable error) { + if(readerPosition != null && readerPosition.getInputStream() != null) + readerPosition.getInputStream().reportException(error); + } + } + + } + + private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException { + FileChannel channel = inputStream.getChannel(); + + // Read the block header + channel.position(blockAddress); + + int uncompressedDataSize = 0; + int bufferSize = 0; + + do { + inputBuffer.clear(); + inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. + inputBuffer.flip(); + if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) + throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass."); + + // Verify that the file was read at a valid point. + if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 || + unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 || + unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG || + unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN || + unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 || + unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) { + throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position"); + } + + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET); + bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1; + + // Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block. + inputBuffer.limit(bufferSize); + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block. + uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4); + //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize); + } + while(uncompressedDataSize == 0 && channel.position() < channel.size()); + + // Prepare the buffer for reading. + inputBuffer.flip(); + + return inputBuffer; + } + + private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException { + final int compressedBufferSize = bgzfBlock.remaining(); + + // Determine the uncompressed buffer size ( + bgzfBlock.position(bgzfBlock.limit()-4); + int uncompressedBufferSize = bgzfBlock.getInt(); + byte[] uncompressedContent = new byte[uncompressedBufferSize]; + + // Bound the CDATA section of the buffer. + bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH); + bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + byte[] compressedContent = new byte[bgzfBlock.remaining()]; + ByteBuffer.wrap(compressedContent).put(bgzfBlock); + + // Decompress the buffer. + final Inflater inflater = new Inflater(true); + inflater.setInput(compressedContent); + int bytesUncompressed = inflater.inflate(uncompressedContent); + if(bytesUncompressed != uncompressedBufferSize) + throw new ReviewedStingException("Error decompressing block"); + + return ByteBuffer.wrap(uncompressedContent); + } + + private long position(final FileInputStream inputStream) throws IOException { + return inputStream.getChannel().position(); + } + + private int unpackUByte8(final ByteBuffer buffer,final int position) { + return buffer.get(position) & 0xFF; + } + + private int unpackUInt16(final ByteBuffer buffer,final int position) { + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. + return buffer.getShort(position) & 0xFFFF; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java new file mode 100644 index 000000000..29de6eb37 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once. + */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The ke + * @return A file input stream from the cache, if available, or otherwise newly opened. + */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. + if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return a cached stream, if available. Otherwise, + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. + * @return + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. + */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index e4141f61c..df7827250 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.SAMFileSpan; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -40,28 +41,25 @@ import java.util.*; */ public class FilePointer { protected final SortedMap fileSpans = new TreeMap(); - protected final BAMOverlap overlap; - protected final List locations; + protected final List locations = new ArrayList(); /** * Does this file pointer point into an unmapped region? */ protected final boolean isRegionUnmapped; - public FilePointer() { - this((BAMOverlap)null); - } - - public FilePointer(final GenomeLoc location) { - this.overlap = null; - this.locations = Collections.singletonList(location); - this.isRegionUnmapped = GenomeLoc.isUnmapped(location); - } - - public FilePointer(final BAMOverlap overlap) { - this.overlap = overlap; - this.locations = new ArrayList(); - this.isRegionUnmapped = false; + public FilePointer(final GenomeLoc... locations) { + this.locations.addAll(Arrays.asList(locations)); + boolean foundMapped = false, foundUnmapped = false; + for(GenomeLoc location: locations) { + if(GenomeLoc.isUnmapped(location)) + foundUnmapped = true; + else + foundMapped = true; + } + if(foundMapped && foundUnmapped) + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + this.isRegionUnmapped = foundUnmapped; } /** @@ -217,4 +215,20 @@ public class FilePointer { fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); combined.addFileSpans(initialElement.getKey(),fileSpan); } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FilePointer:%n"); + builder.append("\tlocations = {"); + builder.append(Utils.join(";",locations)); + builder.append("}%n\tregions = %n"); + for(Map.Entry entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 4ddf28dce..f78693c27 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -25,419 +25,58 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.AbstractBAMFileIndex; -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.*; +import java.util.Iterator; /** - * Shard intervals based on position within the BAM file. - * - * @author mhanna - * @version 0.1 + * Handles the process of aggregating BAM intervals into individual shards. + * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder. */ -public class IntervalSharder { - private static Logger logger = Logger.getLogger(IntervalSharder.class); +public class IntervalSharder implements Iterator { + /** + * The iterator actually laying out the data for BAM scheduling. + */ + private final PeekableIterator wrappedIterator; - public static Iterator shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - return new IntervalSharder.FilePointerIterator(dataSource,loci); + /** + * The parser, for interval manipulation. + */ + private final GenomeLocParser parser; + + public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); + } + + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + } + + public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser()); + } + + private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) { + wrappedIterator = new PeekableIterator(scheduler); + this.parser = parser; + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); } /** - * A lazy-loading iterator over file pointers. + * Accumulate shards where there's no additional cost to processing the next shard in the sequence. + * @return The next file pointer to process. */ - private static class FilePointerIterator implements Iterator { - final SAMDataSource dataSource; - final GenomeLocSortedSet loci; - final PeekableIterator locusIterator; - final Queue cachedFilePointers = new LinkedList(); - - public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - this.dataSource = dataSource; - this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - advance(); - } - - public boolean hasNext() { - return !cachedFilePointers.isEmpty(); - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("FilePointerIterator iteration is complete"); - FilePointer filePointer = cachedFilePointers.remove(); - if(cachedFilePointers.isEmpty()) - advance(); - return filePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator"); - } - - private void advance() { - GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); - String contig = null; - - // If the next section of the BAM to be processed is unmapped, handle this region separately. - while(locusIterator.hasNext() && nextBatch.isEmpty()) { - contig = null; - while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) { - GenomeLoc nextLocus = locusIterator.next(); - contig = nextLocus.getContig(); - nextBatch.add(nextLocus); - } - } - - if(nextBatch.size() > 0) { - cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); - } - } + public FilePointer next() { + FilePointer current = wrappedIterator.next(); + while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); + return current; } - /** - * Merge / split intervals based on an awareness of the structure of the BAM file. - * @param dataSource - * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads. - * @param loci - * @return - */ - private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { - // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs. - if(contig == null) { - FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - filePointer.addFileSpans(id,null); - return Collections.singletonList(filePointer); - } - - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. - List filePointers = new ArrayList(); - FilePointer lastFilePointer = null; - BAMOverlap lastBAMOverlap = null; - - Map readerToIndexMap = new HashMap(); - IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator(); - for(SAMReaderID id: dataSource.getReaderIDs()) { - final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); - // If this contig can't be found in the reference, skip over it. - if(referenceSequence == null && contig != null) - continue; - final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id); - binMerger.addReader(id, - index, - referenceSequence.getSequenceIndex(), - index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator()); - // Cache the reader for later data lookup. - readerToIndexMap.put(id,index); - } - - PeekableIterator binIterator = new PeekableIterator(binMerger); - - for(GenomeLoc location: loci) { - if(!location.getContig().equals(contig)) - throw new ReviewedStingException("Location outside bounds of contig"); - - if(!binIterator.hasNext()) - break; - - int locationStart = location.getStart(); - final int locationStop = location.getStop(); - - // Advance to first bin. - while(binIterator.peek().stop < locationStart) - binIterator.next(); - - // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make - // sure the extending bin is not pruned from the list. - List bamOverlaps = new ArrayList(); - while(binIterator.hasNext() && binIterator.peek().stop <= locationStop) - bamOverlaps.add(binIterator.next()); - if(binIterator.hasNext() && binIterator.peek().start <= locationStop) - bamOverlaps.add(binIterator.peek()); - - // Bins found; try to match bins with locations. - Iterator bamOverlapIterator = bamOverlaps.iterator(); - - while(locationStop >= locationStart) { - int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0; - int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0; - - while(binStop < locationStart && bamOverlapIterator.hasNext()) { - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - lastBAMOverlap = bamOverlapIterator.next(); - lastFilePointer = new FilePointer(lastBAMOverlap); - binStart = lastFilePointer.overlap.start; - binStop = lastFilePointer.overlap.stop; - } - - if(locationStart < binStart) { - // The region starts before the first bin in the sequence. Add the region occurring before the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - final int regionStop = Math.min(locationStop,binStart-1); - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop); - lastFilePointer = new FilePointer(subset); - - locationStart = regionStop + 1; - } - else if(locationStart > binStop) { - // The region starts after the last bin in the sequence. Add the region occurring after the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.add(new FilePointer(subset)); - - locationStart = locationStop + 1; - } - else { - if(lastFilePointer == null) - throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer."); - - // The start of the region overlaps the bin. Add the overlapping subset. - final int regionStop = Math.min(locationStop,binStop); - lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop)); - locationStart = regionStop + 1; - } - } - } - - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - // Lookup the locations for every file pointer in the index. - for(SAMReaderID id: readerToIndexMap.keySet()) { - BrowseableBAMIndex index = readerToIndexMap.get(id); - for(FilePointer filePointer: filePointers) - filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id))); - } - - return filePointers; - } - - private static class BinMergingIterator implements Iterator { - private PriorityQueue binQueue = new PriorityQueue(); - private Queue pendingOverlaps = new LinkedList(); - - public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator bins) { - binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins))); - } - - public boolean hasNext() { - return pendingOverlaps.size() > 0 || !binQueue.isEmpty(); - } - - public BAMOverlap next() { - if(!hasNext()) - throw new NoSuchElementException("No elements left in merging iterator"); - if(pendingOverlaps.isEmpty()) - advance(); - return pendingOverlaps.remove(); - } - - public void advance() { - List bins = new ArrayList(); - int boundsStart, boundsStop; - - // Prime the pump - if(binQueue.isEmpty()) - return; - bins.add(getNextBin()); - boundsStart = bins.get(0).getStart(); - boundsStop = bins.get(0).getStop(); - - // Accumulate all the bins that overlap the current bin, in sorted order. - while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) { - ReaderBin bin = getNextBin(); - bins.add(bin); - boundsStart = Math.min(boundsStart,bin.getStart()); - boundsStop = Math.max(boundsStop,bin.getStop()); - } - - List> range = new ArrayList>(); - int start = bins.get(0).getStart(); - int stop = bins.get(0).getStop(); - while(start <= boundsStop) { - // Find the next stopping point. - for(ReaderBin bin: bins) { - stop = Math.min(stop,bin.getStop()); - if(start < bin.getStart()) - stop = Math.min(stop,bin.getStart()-1); - } - - range.add(new Pair(start,stop)); - // If the last entry added included the last element, stop. - if(stop >= boundsStop) - break; - - // Find the next start. - start = stop + 1; - for(ReaderBin bin: bins) { - if(start >= bin.getStart() && start <= bin.getStop()) - break; - else if(start < bin.getStart()) { - start = bin.getStart(); - break; - } - } - } - - // Add the next series of BAM overlaps to the window. - for(Pair window: range) { - BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second); - for(ReaderBin bin: bins) - bamOverlap.addBin(bin.id,bin.bin); - pendingOverlaps.add(bamOverlap); - } - } - - public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } - - private ReaderBin peekNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.peek(); - return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin()); - } - - private ReaderBin getNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.remove(); - ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin()); - if(current.hasNextBin()) - binQueue.add(current); - return readerBin; - } - - } - - /** - * Filters out bins not at the lowest level in the tree. - */ - private static class LowestLevelBinFilteringIterator implements Iterator { - private BrowseableBAMIndex index; - private Iterator wrappedIterator; - - private Bin nextBin; - - public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator iterator) { - this.index = index; - this.wrappedIterator = iterator; - advance(); - } - - public boolean hasNext() { - return nextBin != null; - } - - public Bin next() { - Bin bin = nextBin; - advance(); - return bin; - } - - public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); } - - private void advance() { - nextBin = null; - while(wrappedIterator.hasNext() && nextBin == null) { - Bin bin = wrappedIterator.next(); - if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1) - nextBin = bin; - } - } - } + public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } } - -class BinQueueState implements Comparable { - private final SAMReaderID id; - private final BrowseableBAMIndex index; - private final int referenceSequence; - private final PeekableIterator bins; - - private int firstLocusInCurrentBin; - private int lastLocusInCurrentBin; - - public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator bins) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bins = new PeekableIterator(bins); - refreshLocusInBinCache(); - } - - public SAMReaderID getReaderID() { - return id; - } - - public BrowseableBAMIndex getIndex() { - return index; - } - - public int getReferenceSequence() { - return referenceSequence; - } - - public boolean hasNextBin() { - return bins.hasNext(); - } - - public Bin peekNextBin() { - return bins.peek(); - } - - public Bin nextBin() { - Bin nextBin = bins.next(); - refreshLocusInBinCache(); - return nextBin; - } - - public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) { - if(!this.bins.hasNext() && !other.bins.hasNext()) return 0; - if(!this.bins.hasNext()) return -1; - if(!this.bins.hasNext()) return 1; - - // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid. - if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 || - other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) { - throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid."); - } - - // Straight integer subtraction works here because lhsStart, rhsStart always positive. - if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin) - return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin; - - // Straight integer subtraction works here because lhsStop, rhsStop always positive. - return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin; - } - - private void refreshLocusInBinCache() { - firstLocusInCurrentBin = -1; - lastLocusInCurrentBin = -1; - if(bins.hasNext()) { - Bin bin = bins.peek(); - firstLocusInCurrentBin = index.getFirstLocusInBin(bin); - lastLocusInCurrentBin = index.getLastLocusInBin(bin); - } - } -} \ No newline at end of file diff --git a/public/java/src/net/sf/samtools/GATKBinList.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java similarity index 52% rename from public/java/src/net/sf/samtools/GATKBinList.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java index b53062aaf..585b63457 100644 --- a/public/java/src/net/sf/samtools/GATKBinList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java @@ -22,30 +22,34 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package net.sf.samtools; +package org.broadinstitute.sting.gatk.datasources.reads; -import java.util.BitSet; +import java.util.Iterator; /** - * A temporary solution to work around Java access rights issues: - * override chunk and make it public. - * TODO: Eliminate once we determine the final fate of the BAM index reading code. + * Batch granular file pointers into potentially larger shards. */ -public class GATKBinList extends BinList { +public class LocusShardBalancer extends ShardBalancer { /** - * Create a new BinList over sequenceCount sequences, consisting of the given bins. - * @param referenceSequence Reference sequence to which these bins are relevant. - * @param bins The given bins to include. + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. */ - public GATKBinList(final int referenceSequence, final BitSet bins) { - super(referenceSequence,bins); - } + public Iterator iterator() { + return new Iterator() { + public boolean hasNext() { + return filePointers.hasNext(); + } - /** - * Retrieves the bins stored in this list. - * @return A bitset where a bin is present in the list if the bit is true. - */ - public BitSet getBins() { - return super.getBins(); + public Shard next() { + FilePointer current = filePointers.next(); + while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0) + current = current.combine(parser,filePointers.next()); + return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + }; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java deleted file mode 100755 index a5ca07853..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileSpan; -import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * A sharding strategy for loci based on reading of the index. - */ -public class LocusShardStrategy implements ShardStrategy { - /** - * The data source to use when performing this sharding. - */ - private final SAMDataSource reads; - - /** - * the parser for creating shards - */ - private GenomeLocParser genomeLocParser; - - /** - * An iterator through the available file pointers. - */ - private final Iterator filePointerIterator; - - /** - * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs - * @param reads Data source from which to load index data. - * @param locations List of locations for which to load data. - */ - public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) { - this.reads = reads; - this.genomeLocParser = genomeLocParser; - - if(!reads.isEmpty()) { - GenomeLocSortedSet intervals; - if(locations == null) { - // If no locations were passed in, shard the entire BAM file. - SAMFileHeader header = reads.getHeader(); - intervals = new GenomeLocSortedSet(genomeLocParser); - - for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) { - // Check this sequence against the reference sequence dictionary. - // TODO: Do a better job of merging reads + reference. - SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName()); - if(refSequenceRecord != null) { - final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength()); - intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length)); - } - } - } - else - intervals = locations; - - if(reads.isLowMemoryShardingEnabled()) { - /* - Iterator filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - List filePointers = new ArrayList(); - while(filePointerIterator.hasNext()) - filePointers.add(filePointerIterator.next()); - this.filePointerIterator = filePointers.iterator(); - */ - this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - } - else - this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); - } - else { - final int maxShardSize = 100000; - List filePointers = new ArrayList(); - if(locations == null) { - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop))); - } - } - } - else { - for(GenomeLoc interval: locations) { - while(interval.size() > maxShardSize) { - filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1))); - interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - filePointers.add(new FilePointer(interval)); - } - } - filePointerIterator = filePointers.iterator(); - } - - } - - /** - * returns true if there are additional shards - * - * @return false if we're done processing shards - */ - public boolean hasNext() { - return filePointerIterator.hasNext(); - } - - public long shardNumber = 0; - - /** - * gets the next Shard - * - * @return the next shard - */ - public LocusShard next() { - FilePointer nextFilePointer = filePointerIterator.next(); - Map fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null; - - /* - System.out.printf("Shard %d: interval = {",++shardNumber); - for(GenomeLoc locus: nextFilePointer.locations) - System.out.printf("%s;",locus); - System.out.printf("}; "); - - if(fileSpansBounding == null) - System.out.printf("no shard data%n"); - else { - SortedMap sortedSpans = new TreeMap(fileSpansBounding); - for(Map.Entry entry: sortedSpans.entrySet()) { - System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue()); - } - } - */ - - return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding); - } - - /** we don't support the remove command */ - public void remove() { - throw new UnsupportedOperationException("ShardStrategies don't support remove()"); - } - - /** - * makes the IntervalShard iterable, i.e. usable in a for loop. - * - * @return - */ - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java deleted file mode 100644 index bf5f33dc3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.Iterator; - -/** - * Handles the process of aggregating BAM intervals into individual shards. - */ -public class LowMemoryIntervalSharder implements Iterator { - /** - * The iterator actually laying out the data for BAM scheduling. - */ - private final PeekableIterator wrappedIterator; - - /** - * The parser, for interval manipulation. - */ - private final GenomeLocParser parser; - - public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); - parser = loci.getGenomeLocParser(); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Accumulate shards where there's no additional cost to processing the next shard in the sequence. - * @return The next file pointer to process. - */ - public FilePointer next() { - FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); - return current; - } - - public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java deleted file mode 100644 index 278eeb898..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.List; - -/** - * A single, monolithic shard bridging all available data. - * @author mhanna - * @version 0.1 - */ -public class MonolithicShard extends Shard { - /** - * Creates a new monolithic shard of the given type. - * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic. - * @param locs Intervals that this monolithic shard should process. - */ - public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List locs) { - super(parser, shardType, locs, readsDataSource, null, false); - if(shardType != ShardType.LOCUS && shardType != ShardType.READ) - throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType); - } - - /** - * String representation of this shard. - * @return "entire genome". - */ - @Override - public String toString() { - return "entire genome"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java deleted file mode 100644 index 28b737f28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Create a giant shard representing all the data in the input BAM(s). - * - * @author mhanna - * @version 0.1 - */ -public class MonolithicShardStrategy implements ShardStrategy { - /** - * The single shard associated with this sharding strategy. - */ - private MonolithicShard shard; - - /** - * Create a new shard strategy for shards of the given type. - * @param shardType The shard type. - */ - public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List region) { - shard = new MonolithicShard(parser,readsDataSource,shardType,region); - } - - /** - * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator; - * will only return another copy of the active iterator. - * @return A copy of this. - */ - public Iterator iterator() { - return this; - } - - /** - * Returns true if the monolithic shard has not yet been consumed, or false otherwise. - * @return True if shard has been consumed, false otherwise. - */ - public boolean hasNext() { - return shard != null; - } - - /** - * Returns the monolithic shard if it has not already been retrieved. - * @return The monolithic shard. - * @throws NoSuchElementException if no such data exists. - */ - public Shard next() { - if(shard == null) - throw new NoSuchElementException("Monolithic shard has already been retrived."); - - Shard working = shard; - shard = null; - return working; - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a shard strategy"); - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - * @param size adjust the next size to this - */ - public void adjustNextShardSize( long size ) { - throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard."); - } - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 4d9c9092d..5f40c0ea5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -35,10 +35,15 @@ import java.util.Map; * @version 0.1 */ public class ReadShard extends Shard { + /** + * What is the maximum number of reads which should go into a read shard. + */ + public static final int MAX_READS = 10000; + /** * The reads making up this shard. */ - private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS); + private final Collection reads = new ArrayList(MAX_READS); public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); @@ -66,7 +71,7 @@ public class ReadShard extends Shard { * @return True if this shard's buffer is full (and the shard can buffer reads). */ public boolean isBufferFull() { - return reads.size() > ReadShardStrategy.MAX_READS; + return reads.size() > ReadShard.MAX_READS; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java new file mode 100644 index 000000000..fa8a7d454 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + */ +public class ReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + if(!fileSpan.isEmpty()) + selectedReaders.put(id,fileSpan); + } + + if(selectedReaders.size() > 0) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java deleted file mode 100755 index 5ea75dbb0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.SAMFileSpan; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.*; - -/** - * The sharding strategy for reads using a simple counting mechanism. Each read shard - * has a specific number of reads (default to 10K) which is configured in the constructor. - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - */ -public class ReadShardStrategy implements ShardStrategy { - /** - * What is the maximum number of reads which should go into a read shard. - */ - protected static final int MAX_READS = 10000; - - /** - * The data source used to shard. - */ - private final SAMDataSource dataSource; - - /** - * The intervals to be processed. - */ - private final GenomeLocSortedSet locations; - - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** our storage of the genomic locations they'd like to shard over */ - private final List filePointers = new ArrayList(); - - /** - * Iterator over the list of file pointers. - */ - private final Iterator filePointerIterator; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentFilePointer; - - /** - * Ending position of the last shard in the file. - */ - private Map position; - - /** - * An indicator whether the strategy has sharded into the unmapped region. - */ - private boolean isIntoUnmappedRegion = false; - - private final GenomeLocParser parser; - - /** - * Create a new read shard strategy, loading read shards from the given BAM file. - * @param dataSource Data source from which to load shards. - * @param locations intervals to use for sharding. - */ - public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) { - this.dataSource = dataSource; - this.parser = parser; - this.position = this.dataSource.getCurrentPosition(); - this.locations = locations; - - if(locations != null) - filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations); - else - filePointerIterator = filePointers.iterator(); - - if(filePointerIterator.hasNext()) - currentFilePointer = filePointerIterator.next(); - - advance(); - } - - /** - * do we have another read shard? - * @return True if any more data is available. False otherwise. - */ - public boolean hasNext() { - return nextShard != null; - } - - /** - * Retrieves the next shard, if available. - * @return The next shard, if available. - * @throws java.util.NoSuchElementException if no such shard is available. - */ - public Shard next() { - if(!hasNext()) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - public void advance() { - Map shardPosition = new HashMap(); - nextShard = null; - - if(locations != null) { - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; - - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); - } - - if(selectedReaders.size() > 0) { - Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - dataSource.fillShard(shard); - - if(!shard.isBufferEmpty()) { - nextShard = shard; - break; - } - } - - selectedReaders.clear(); - currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null; - } - } - else { - // todo -- this nulling of intervals is a bit annoying since readwalkers without - // todo -- any -L values need to be special cased throughout the code. - Shard shard = new ReadShard(parser,dataSource,position,null,false); - dataSource.fillShard(shard); - nextShard = !shard.isBufferEmpty() ? shard : null; - } - - this.position = dataSource.getCurrentPosition(); - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - - /** - * Convenience method for using ShardStrategy in an foreach loop. - * @return A iterator over shards. - */ - public Iterator iterator() { - return this; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java deleted file mode 100644 index c76c1d8ae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 2, 2011 - * Time: 4:36:40 PM - * To change this template use File | Settings | File Templates. - */ -class ReaderBin { - public final SAMReaderID id; - public final BrowseableBAMIndex index; - public final int referenceSequence; - public final Bin bin; - - public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bin = bin; - } - - public int getStart() { - return index.getFirstLocusInBin(bin); - } - - public int getStop() { - return index.getLastLocusInBin(bin); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 8452aadfd..0a1eb0563 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -71,7 +73,7 @@ public class SAMDataSource { /** * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. */ - private final GenomeLocParser genomeLocParser; + protected final GenomeLocParser genomeLocParser; /** * Identifiers for the readers driving this data source. @@ -91,13 +93,18 @@ public class SAMDataSource { /** * How far along is each reader? */ - private final Map readerPositions = new HashMap(); + private final Map readerPositions = new HashMap(); /** * The merged header. */ private final SAMFileHeader mergedHeader; + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + /** * The sort order of the BAM files. Files without a sort order tag are assumed to be * in coordinate order. @@ -131,17 +138,24 @@ public class SAMDataSource { private final SAMResourcePool resourcePool; /** - * Whether to enable the new low-memory sharding mechanism. + * Asynchronously loads BGZF blocks. */ - private boolean enableLowMemorySharding = false; + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. */ - public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocParser) { + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, false, SAMFileReader.ValidationStringency.STRICT, @@ -150,8 +164,7 @@ public class SAMDataSource { new ValidationExclusion(), new ArrayList(), false, - false, - true); + false); } /** @@ -159,6 +172,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -167,9 +182,10 @@ public class SAMDataSource { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, - boolean enableLowMemorySharding) { + boolean generateExtendedEvents) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, useOriginalBaseQualities, strictness, @@ -182,8 +198,7 @@ public class SAMDataSource { BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ - (byte) -1, - enableLowMemorySharding); + (byte) -1); } /** @@ -205,6 +220,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -217,13 +234,19 @@ public class SAMDataSource { BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, - byte defaultBaseQualities, - boolean enableLowMemorySharding) { - this.enableLowMemorySharding(enableLowMemorySharding); + byte defaultBaseQualities) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + else + dispatcher = null; + validationStringency = strictness; for (SAMReaderID readerID : samFiles) { if (!readerID.samFile.canRead()) @@ -235,10 +258,13 @@ public class SAMDataSource { SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. - for(SAMFileReader reader: readers.values()) { + for(SAMReaderID readerID: readerIDs) { // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); + headers.put(readerID,header); + if ( header.getReadGroups().isEmpty() ) { throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); @@ -275,7 +301,7 @@ public class SAMDataSource { qmode, refReader, defaultBaseQualities); - + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -296,12 +322,10 @@ public class SAMDataSource { originalToMergedReadGroupMappings.put(id,mappingToMerged); } - if(enableLowMemorySharding) { - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); } resourcePool.releaseReaders(readers); @@ -314,22 +338,6 @@ public class SAMDataSource { */ public ReadProperties getReadsInfo() { return readProperties; } - /** - * Enable experimental low-memory sharding. - * @param enable True to enable sharding. False otherwise. - */ - public void enableLowMemorySharding(final boolean enable) { - enableLowMemorySharding = enable; - } - - /** - * Returns whether low-memory sharding is enabled. - * @return True if enabled, false otherwise. - */ - public boolean isLowMemoryShardingEnabled() { - return enableLowMemorySharding; - } - /** * Checks to see whether any reads files are supplying data. * @return True if no reads files are supplying data to the traversal; false otherwise. @@ -368,7 +376,7 @@ public class SAMDataSource { * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. */ - public Map getCurrentPosition() { + public Map getCurrentPosition() { return readerPositions; } @@ -381,7 +389,7 @@ public class SAMDataSource { } public SAMFileHeader getHeader(SAMReaderID id) { - return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); + return headers.get(id); } /** @@ -404,45 +412,21 @@ public class SAMDataSource { return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); } - /** - * No read group collisions at this time because only one SAM file is currently supported. - * @return False always. - */ - public boolean hasReadGroupCollisions() { - return hasReadGroupCollisions; - } - /** * True if all readers have an index. * @return True if all readers have an index. */ public boolean hasIndex() { - if(enableLowMemorySharding) - return readerIDs.size() == bamIndices.size(); - else { - for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { - if(!reader.hasIndex()) - return false; - } - return true; - } + return readerIDs.size() == bamIndices.size(); } /** * Gets the index for a particular reader. Always preloaded. - * TODO: Should return object of type GATKBAMIndex, but cannot because there - * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new - * TODO: sharding system goes live. * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ - public Object getIndex(final SAMReaderID id) { - if(enableLowMemorySharding) - return bamIndices.get(id); - else { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - return readers.getReader(id).getBrowseableIndex(); - } + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); } /** @@ -454,7 +438,7 @@ public class SAMDataSource { } /** - * Gets the cumulative read metrics for shards already processed. + * Gets the cumulative read metrics for shards already processed. * @return Cumulative read metrics. */ public ReadMetrics getCumulativeReadMetrics() { @@ -507,10 +491,6 @@ public class SAMDataSource { } public StingSAMIterator seek(Shard shard) { - // todo: refresh monolithic sharding implementation - if(shard instanceof MonolithicShard) - return seekMonolithic(shard); - if(shard.buffersReads()) { return shard.iterator(); } @@ -540,7 +520,7 @@ public class SAMDataSource { */ private void initializeReaderPositions(SAMReaders readers) { for(SAMReaderID id: getReaderIDs()) - readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads()); + readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); } /** @@ -548,7 +528,6 @@ public class SAMDataSource { * @param readers Readers from which to load data. * @param shard The shard specifying the data limits. * @param enableVerification True to verify. For compatibility with old sharding strategy. - * TODO: Collapse this flag when the two sharding systems are merged. * @return An iterator over the selected data. */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { @@ -559,14 +538,20 @@ public class SAMDataSource { for(SAMReaderID id: getReaderIDs()) { CloseableIterator iterator = null; - if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null) - continue; - iterator = shard.getFileSpans().get(id) != null ? - readers.getReader(id).iterator(shard.getFileSpans().get(id)) : - readers.getReader(id).queryUnmapped(); + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. + if(shard.getFileSpans().get(id) == null) + throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + } + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(readProperties.getReadBufferSize() != null) iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); - if(shard.getGenomeLocs() != null) + if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); mergingIterator.addIterator(readers.getReader(id),iterator); } @@ -584,33 +569,6 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } - /** - * A stopgap measure to handle monolithic sharding - * @param shard the (monolithic) shard. - * @return An iterator over the monolithic shard. - */ - private StingSAMIterator seekMonolithic(Shard shard) { - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); - for(SAMReaderID id: getReaderIDs()) - mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator()); - - return applyDecoratingIterators(shard.getReadMetrics(), - shard instanceof ReadShard, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getDownsamplingMethod().toFraction, - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.defaultBaseQualities()); - } - /** * Adds this read to the given shard. * @param shard The shard to which to add the read. @@ -618,7 +576,7 @@ public class SAMDataSource { * @param read The read to add to the shard. */ private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing(); + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); shard.addRead(read); readerPositions.put(id,endChunk); } @@ -689,19 +647,6 @@ public class SAMDataSource { this.maxEntries = maxEntries; } - /** - * Dangerous internal method; retrieves any set of readers, whether in iteration or not. - * Used to handle non-exclusive, stateless operations, such as index queries. - * @return Any collection of SAMReaders, whether in iteration or not. - */ - protected SAMReaders getReadersWithoutLocking() { - synchronized(this) { - if(allResources.size() == 0) - createNewResource(); - } - return allResources.get(0); - } - /** * Choose a set of readers from the pool to use for this query. When complete, * @return @@ -753,6 +698,11 @@ public class SAMDataSource { */ private final Map readers = new LinkedHashMap(); + /** + * The inptu streams backing + */ + private final Map inputStreams = new LinkedHashMap(); + /** * Derive a new set of readers from the Reads metadata. * @param readerIDs reads to load. @@ -760,12 +710,20 @@ public class SAMDataSource { */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = new SAMFileReader(readerID.samFile); + File indexFile = findIndexFile(readerID.samFile); + + SAMFileReader reader = null; + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + inputStreams.put(readerID,blockInputStream); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); - reader.enableIndexMemoryMapping(false); - if(!enableLowMemorySharding) - reader.enableIndexCaching(true); reader.setValidationStringency(validationStringency); final SAMFileHeader header = reader.getFileHeader(); @@ -786,6 +744,15 @@ public class SAMDataSource { return readers.get(id); } + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + /** * Searches for the reader id of this reader. * @param reader Reader for which to search. @@ -883,7 +850,7 @@ public class SAMDataSource { * Filters out reads that do not overlap the current GenomeLoc. * Note the custom implementation: BAM index querying returns all reads that could * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. + * down performance, this implementation is highly customized to its task. */ private class IntervalOverlapFilteringIterator implements CloseableIterator { /** @@ -903,7 +870,7 @@ public class SAMDataSource { /** * Custom representation of interval bounds. - * Makes it simpler to track current position. + * Makes it simpler to track current position. */ private int[] intervalContigIndices; private int[] intervalStarts; @@ -941,7 +908,7 @@ public class SAMDataSource { i++; } } - + advance(); } @@ -1070,6 +1037,40 @@ public class SAMDataSource { return indexFile; } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. + */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any + * read that has been assigned + * @return + */ + public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. + * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. + */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); + return shardBalancer; + } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java new file mode 100644 index 000000000..f9f6539a7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; + +/** +* Created by IntelliJ IDEA. +* User: mhanna +* Date: 10/14/11 +* Time: 10:47 PM +* To change this template use File | Settings | File Templates. +*/ +class SAMReaderPosition { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state. + */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = filePosition; + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + try { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + positionIterator.next(); + // Check to see if the iterator has more data available. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) { + nextBlockAddress = positionIterator.peek().getBlockStart(); + break; + } + } + } + catch(Exception ex) { + throw new ReviewedStingException(""); + } + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java new file mode 100644 index 000000000..962208086 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Iterator; + +/** + * Balances maximally granular file pointers into shards of reasonable size. + */ +public abstract class ShardBalancer implements Iterable { + protected SAMDataSource readsDataSource; + protected PeekableIterator filePointers; + protected GenomeLocParser parser; + + public void initialize(final SAMDataSource readsDataSource, final Iterator filePointers, final GenomeLocParser parser) { + this.readsDataSource = readsDataSource; + this.filePointers = new PeekableIterator(filePointers); + this.parser = parser; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java deleted file mode 100644 index 989cf9fce..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import java.util.Iterator; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 4:55:37 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

- * Interface ShardStrategy - *

- * The base interface for the sharding strategy; before we had a base abstract - * class, but not this will be an interface to accomidate read based sharding - */ -public interface ShardStrategy extends Iterator, Iterable { -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java deleted file mode 100644 index 780b41ef7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * - * User: aaron - * Date: Apr 6, 2009 - * Time: 7:09:22 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 6, 2009 - *

- * Class ShardStrategyFactory - *

- * The Shard Strategy Factory, use this class to create and transfer shard strategies - * between different approaches. - */ -public class ShardStrategyFactory { - public enum SHATTER_STRATEGY { - MONOLITHIC, // Put all of the available data into one shard. - LOCUS_EXPERIMENTAL, - READS_EXPERIMENTAL - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L); - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser,readsDataSource,null); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); - } - - } - - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l); - - } - - /** - * get a new shatter strategy - * - * @param readsDataSource The reads used to shatter this file. - * @param referenceDataSource The reference used to shatter this file. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return A strategy for shattering this data. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser, readsDataSource,lst); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented"); - } - - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 673df6dfa..577db0965 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -30,10 +30,12 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; -import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; +import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram { // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals GenomeLocSortedSet intervalSortedSet = null; @@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram { logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); @@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram { logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index c8c79bb14..2c33a19b8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.CreateSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; +import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * Loads reference data from fasta file * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile index; + private IndexedFastaSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -173,7 +185,7 @@ public class ReferenceDataSource { logger.info("Treating existing index file as complete."); } - index = new CachingIndexedFastaSequenceFile(fastaFile); + reference = new CachingIndexedFastaSequenceFile(fastaFile); } catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); @@ -192,6 +204,52 @@ public class ReferenceDataSource { * @return IndexedFastaSequenceFile that was created from file */ public IndexedFastaSequenceFile getReference() { - return this.index; + return this.reference; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + return shards; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 162baed00..b0043e68c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; @@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } - public Object execute( Walker walker, ShardStrategy shardStrategy ) { + public Object execute( Walker walker, Iterable shardStrategy ) { // Fast fail for walkers not supporting TreeReducible interface. if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index deafcd0cc..ff5e1064b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; @@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler { * @param walker Computation to perform over dataset. * @param shardStrategy A strategy for sharding the data. */ - public Object execute(Walker walker, ShardStrategy shardStrategy) { + public Object execute(Walker walker, Iterable shardStrategy) { walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index e731b9864..d013db7e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -30,11 +30,11 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads the informations associated with the reads * @param reference the reference file * @param rods the rods to include in the traversal - * @param nThreadsToUse Number of threads to utilize. + * @param threadAllocation Number of threads to utilize. * * @return The best-fit microscheduler. */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (walker instanceof TreeReducible && nThreadsToUse > 1) { + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse)); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads()); } else { - if(nThreadsToUse > 1) + if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods); } @@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * * @return the return type of the walker */ - public abstract Object execute(Walker walker, ShardStrategy shardStrategy); + public abstract Object execute(Walker walker, Iterable shardStrategy); /** * Retrieves the object responsible for tracking and managing output. diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java new file mode 100644 index 000000000..0c81af07b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.resourcemanagement; + +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Models how threads are distributed between various components of the GATK. + */ +public class ThreadAllocation { + /** + * The number of CPU threads to be used by the GATK. + */ + private final int numCPUThreads; + + /** + * Number of threads to devote exclusively to IO. Default is 0. + */ + private final int numIOThreads; + + public int getNumCPUThreads() { + return numCPUThreads; + } + + public int getNumIOThreads() { + return numIOThreads; + } + + /** + * Construct the default thread allocation. + */ + public ThreadAllocation() { + this(1,null,null); + } + + /** + * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. + * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). + * @param totalThreads Complete number of threads to allocate. + * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numIOThreads Total number of threads allocated exclusively to IO. + */ + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { + // If no allocation information is present, allocate all threads to CPU + if(numCPUThreads == null && numIOThreads == null) { + this.numCPUThreads = totalThreads; + this.numIOThreads = 0; + } + // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). + else if(numIOThreads == null) { + if(numCPUThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = totalThreads - numCPUThreads; + } + // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). + else if(numCPUThreads == null) { + if(numIOThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); + this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); + this.numIOThreads = numIOThreads; + } + else { + if(numCPUThreads + numIOThreads != totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = numIOThreads; + } + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 8d7dd82ac..17a7d1974 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java index dc3a6cafe..62c93bddd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -42,7 +43,7 @@ import java.util.Collections; public class MockLocusShard extends LocusShard { public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { super( genomeLocParser, - new SAMDataSource(Collections.emptyList(),genomeLocParser), + new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), intervals, null); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java deleted file mode 100755 index e41a6b3b7..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java +++ /dev/null @@ -1,223 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import static org.testng.Assert.fail; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * - * User: aaron - * Date: Apr 8, 2009 - * Time: 8:14:23 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 - *

- * Class SAMBAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMBAMDataSourceUnitTest extends BaseTest { - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - StingSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testMergingTwoBAMFiles() { - logger.warn("Executing testMergingTwoBAMFiles"); - - // setup the test files - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - - ArrayList readcountPerShard = new ArrayList(); - ArrayList readcountPerShard2 = new ArrayList(); - - // count up the first hundred shards - int shardsToCount = 100; - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - - } - readcountPerShard.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - - // setup the data and the counter before our second run - readers.clear(); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - count = 0; - // the sharding strat. - data = new SAMDataSource(readers,genomeLocParser); - strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser); - - logger.debug("Pile two:"); - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - // can we leave? - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - } - - readcountPerShard2.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - /*int pos = 0; - for (; pos < 100; pos++) { - if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { - fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos)); - } - } */ - - } - - - - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java new file mode 100755 index 000000000..ba2d68ec9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import static org.testng.Assert.fail; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * @author aaron + * @version 1.0 + * @date Apr 8, 2009 + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + false); + + Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + StingSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 7f4d96add..9226f97e2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest { private List bamList; private Walker countReadWalker; private File output; - private long readSize = 100000; private TraverseReads traversalEngine = null; private IndexedFastaSequenceFile ref = null; @@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testUnmappedReadCount() { - SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - ref.getSequenceDictionary(), - readSize, - genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); - while (shardStrategy.hasNext()) { + for(Shard shard: shardStrategy) { traversalEngine.startTimersIfNecessary(); - Shard shard = shardStrategy.next(); if (shard == null) { fail("Shard == null");