First pass of the asynchronous block loader.

Block loads are triggered only when the queue is empty at this point.  Disabled by
default (enable with the nt:io=? argument).
This commit is contained in:
Matt Hanna 2011-09-13 10:49:16 -04:00
parent 6459784351
commit 8bb4d4dca3
45 changed files with 3292 additions and 2037 deletions

View File

@ -0,0 +1,762 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools;
import net.sf.samtools.util.*;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Internal class for reading and querying BAM files.
*/
class BAMFileReader extends SAMFileReader.ReaderImplementation {
// True if reading from a File rather than an InputStream
private boolean mIsSeekable = false;
// For converting bytes into other primitive types
private BinaryCodec mStream = null;
// Underlying compressed data stream.
private final BAMInputStream mInputStream;
// Decoded BAM header; populated by readHeader() during construction, nulled on close().
private SAMFileHeader mFileHeader = null;
// Populated if the file is seekable and an index exists
private File mIndexFile;
// Lazily constructed by getIndex(); type depends on mEnableIndexCaching.
private BAMIndex mIndex = null;
// Virtual file pointer just past the header; seek target for fresh iterations.
private long mFirstRecordPointer = 0;
// At most one live iterator is allowed at a time; non-null while iteration is in progress.
private CloseableIterator<SAMRecord> mCurrentIterator = null;
// If true, all SAMRecords are fully decoded as they are read.
private final boolean eagerDecode;
// For error-checking.
private ValidationStringency mValidationStringency;
// For creating BAMRecords
private SAMRecordFactory samRecordFactory;
/**
* Use the caching index reader implementation rather than the disk-hit-per-file model.
*/
private boolean mEnableIndexCaching = false;
/**
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
*/
private boolean mEnableIndexMemoryMapping = true;
/**
* Add information about the origin (reader and position) to SAM records.
* Null when source annotation is disabled (the default).
*/
private SAMFileReader mFileReader = null;
/**
* Prepare to read BAM from a stream (not seekable)
* @param stream source of bytes.
* @param indexFile BAM index file, or null if none is available.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalid reads or header lines.
* @param factory factory used to create each SAMRecord.
* @throws IOException if the header cannot be read from the stream.
*/
BAMFileReader(final InputStream stream,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
    throws IOException {
mIndexFile = indexFile;
mIsSeekable = false;
// Reuse the stream directly if it already speaks BGZF; otherwise wrap it.
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
// No source name is available for an anonymous stream.
readHeader(null);
}
/**
* Prepare to read BAM from a file (seekable)
* @param file source of bytes.
* @param indexFile BAM index file; if null, the standard naming conventions are searched.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalid reads or header lines.
* @param factory factory used to create each SAMRecord.
* @throws IOException if the file or its header cannot be read.
*/
BAMFileReader(final File file,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
    throws IOException {
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
// A stale index (older than the BAM) may describe different data; warn but proceed.
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
" is older than BAM " + file.getAbsolutePath());
}
}
/**
* Prepare to read BAM from a seekable stream.
* @param strm source of bytes; wrapped in a BlockCompressedInputStream unless it already is a BAMInputStream.
* @param indexFile BAM index file, or null if none is available.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalid reads or header lines.
* @param factory factory used to create each SAMRecord.
* @throws IOException if the header cannot be read.
*/
BAMFileReader(final SeekableStream strm,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
    throws IOException {
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
indexFile,
eagerDecode,
strm.getSource(),
validationStringency,
factory);
}
/**
* Shared implementation for all seekable sources.  Reads the header and remembers
* the virtual file pointer of the first record so iteration can be restarted.
* @param source describes the input origin; used only for error reporting.
*/
private BAMFileReader(final BAMInputStream inputStream,
final File indexFile,
final boolean eagerDecode,
final String source,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
    throws IOException {
mIndexFile = indexFile;
mIsSeekable = true;
mInputStream = inputStream;
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
readHeader(source);
// Capture the position immediately after the header: this is where records begin.
mFirstRecordPointer = inputStream.getFilePointer();
}
/**
* If enabled, writes the source (reader and position) of every read into each SAMRecord.
* @param reader the reader to record as the origin of each SAMRecord.
* @param enabled true to write source information into each SAMRecord.
*/
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
this.mFileReader = enabled ? reader : null;
}
/**
* If true, uses the caching version of the index reader.
* @param enabled true to use the in-memory caching index implementation.
* @throws SAMException if the index has already been loaded; the implementation cannot be swapped afterwards.
*/
public void enableIndexCaching(final boolean enabled) {
if(mIndex != null)
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
this.mEnableIndexCaching = enabled;
}
/**
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
* @param enabled True to use memory mapping, false to use regular I/O.
* @throws SAMException if the index has already been loaded.
*/
public void enableIndexMemoryMapping(final boolean enabled) {
if (mIndex != null) {
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
}
this.mEnableIndexMemoryMapping = enabled;
}
/** Enables or disables CRC validation on the underlying compressed stream. */
@Override void enableCrcChecking(final boolean enabled) {
this.mInputStream.setCheckCrcs(enabled);
}
/** Replaces the factory used to create SAMRecords for subsequent reads. */
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
/**
* @return true if this is a BAM file, and has an index
*/
public boolean hasIndex() {
return (mIndexFile != null);
}
/**
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
* The index is constructed lazily on first call; the caching flag set via
* enableIndexCaching() selects which implementation is built.
* @return An index of the given type.
* @throws SAMException if no index file is associated with this reader.
*/
public BAMIndex getIndex() {
if(mIndexFile == null)
throw new SAMException("No index is available for this BAM file.");
if(mIndex == null)
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
return mIndex;
}
/**
* Releases the codec and index and clears cached state.  Safe to call more than once.
* NOTE(review): mInputStream is not closed directly here; this assumes
* BinaryCodec.close() closes the wrapped stream chain — confirm.
*/
void close() {
if (mStream != null) {
mStream.close();
}
if (mIndex != null) {
mIndex.close();
}
mStream = null;
mFileHeader = null;
mIndex = null;
}
/** @return the decoded BAM header, or null if the reader has been closed. */
SAMFileHeader getFileHeader() {
return mFileHeader;
}
/**
* Set error-checking level for subsequent SAMRecord reads.
*/
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
this.mValidationStringency = validationStringency;
}
/** @return the current error-checking level applied to SAMRecord reads. */
SAMFileReader.ValidationStringency getValidationStringency() {
return this.mValidationStringency;
}
/**
* Prepare to iterate through the SAMRecords in file order.
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
* that iterator must be closed before getIterator() can be called again.
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
* getIterator() begins its iteration where the last one left off. That is the best that can be
* done in that situation.
* @throws IllegalStateException if the reader is closed or another iterator is open.
*/
CloseableIterator<SAMRecord> getIterator() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (mIsSeekable) {
// Rewind to the first record so seekable input always iterates from the beginning.
try {
mInputStream.seek(mFirstRecordPointer);
} catch (IOException exc) {
throw new RuntimeException(exc.getMessage(), exc);
}
}
mCurrentIterator = new BAMFileIterator();
return mCurrentIterator;
}
/**
* Prepare to iterate over the records within the given span of chunk boundaries.
* @param chunks must be a BAMFileSpan describing (start, end) virtual file pointer pairs.
* @throws IllegalStateException if the reader is closed, another iterator is open,
*         or the span is not a BAMFileSpan.
*         NOTE(review): IllegalArgumentException would be more conventional for the
*         wrong-span-type case, but callers may already catch IllegalStateException.
*/
@Override
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!(chunks instanceof BAMFileSpan)) {
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
}
// Create an iterator over the given chunk boundaries.
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
return mCurrentIterator;
}
/**
 * Builds a file span that starts at the first record and extends past the end
 * of the file.  Because the reader does not necessarily know where the file
 * ends, the right edge is Long.MAX_VALUE, which is guaranteed to lie beyond
 * the last read in the file.
 * @return An unbounded pointer to the first record in the BAM file.
 */
@Override
SAMFileSpan getFilePointerSpanningReads() {
    final Chunk everythingAfterHeader = new Chunk(mFirstRecordPointer, Long.MAX_VALUE);
    return new BAMFileSpan(everythingAfterHeader);
}
/**
* Prepare to iterate through the SAMRecords that match the given interval.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
* A value of zero implies the start of the reference sequence.
* @param end A value of zero implies the end of the reference sequence.
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
* specified by start and end. If false, the SAMRecords need only overlap the interval.
* @return Iterator for the matching SAMRecords
* @throws IllegalStateException if the reader is closed or another iterator is open.
* @throws UnsupportedOperationException if the underlying input is not seekable.
*/
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
return mCurrentIterator;
}
/**
* Prepare to iterate through the SAMRecords with the given alignment start.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Alignment start sought.
* @return Iterator for the matching SAMRecords.
* @throws IllegalStateException if the reader is closed or another iterator is open.
* @throws UnsupportedOperationException if the underlying input is not seekable.
*/
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
// end is ignored for STARTING_AT queries; -1 is a placeholder.
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
return mCurrentIterator;
}
/**
* Prepare to iterate over the unmapped reads, which sort after all mapped reads.
* Seeks to the last linear bin recorded in the index (or the first record when the
* file has no mapped reads) and then skips forward past any remaining mapped records.
* @throws IllegalStateException if the reader is closed or another iterator is open.
* @throws UnsupportedOperationException if the underlying input is not seekable.
*/
public CloseableIterator<SAMRecord> queryUnmapped() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
try {
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
if (startOfLastLinearBin != -1) {
mInputStream.seek(startOfLastLinearBin);
} else {
// No mapped reads in file, just start at the first read in file.
mInputStream.seek(mFirstRecordPointer);
}
mCurrentIterator = new BAMFileIndexUnmappedIterator();
return mCurrentIterator;
} catch (IOException e) {
throw new RuntimeException("IOException seeking to unmapped reads", e);
}
}
/**
* Reads the header from the file or stream.  The reads below follow the BAM
* on-disk layout exactly (magic, text header, then binary sequence records),
* so their order must not be changed.
* @param source Note that this is used only for reporting errors.
* @throws IOException if the magic number is wrong.
* @throws SAMFormatException if the text and binary sequence dictionaries disagree.
*/
private void readHeader(final String source)
    throws IOException {
final byte[] buffer = new byte[4];
mStream.readBytes(buffer);
// The file must begin with the 4-byte BAM magic number.
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
throw new IOException("Invalid BAM file header");
}
final int headerTextLength = mStream.readInt();
final String textHeader = mStream.readString(headerTextLength);
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
headerCodec.setValidationStringency(mValidationStringency);
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
source);
final int sequenceCount = mStream.readInt();
if (mFileHeader.getSequenceDictionary().size() > 0) {
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
throw new SAMFormatException("Number of sequences in text header (" +
mFileHeader.getSequenceDictionary().size() +
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
}
// Cross-check each binary record against the text dictionary by name and length.
for (int i = 0; i < sequenceCount; i++) {
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
source);
}
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
source);
}
}
} else {
// If only binary sequences are present, copy them into mFileHeader
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
for (int i = 0; i < sequenceCount; i++) {
sequences.add(readSequenceRecord(source));
}
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
}
}
/**
 * Reads a single binary reference-sequence entry from the stream, following
 * the BAM on-disk order: name length, null-terminated name, sequence length.
 * @param source Note that this is used only for reporting errors.
 * @return the decoded sequence record.
 */
private SAMSequenceRecord readSequenceRecord(final String source) {
    final int encodedNameLength = mStream.readInt();
    // The stored length includes the trailing NUL, so <= 1 means an empty name.
    if (encodedNameLength <= 1) {
        throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
    }
    final String name = mStream.readString(encodedNameLength - 1);
    mStream.readByte(); // consume the NUL terminator
    final int length = mStream.readInt();
    return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(name), length);
}
/**
 * Iterator for non-indexed sequential iteration through all SAMRecords in file.
 * Starting point of iteration is wherever current file position is when the iterator is constructed.
 */
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
    // One-record look-ahead; null once the underlying stream is exhausted.
    private SAMRecord mNextRecord = null;
    private final BAMRecordCodec bamRecordCodec;
    private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file

    BAMFileIterator() {
        this(true);
    }

    /**
     * @param advance Trick to enable subclass to do more setup before advancing
     */
    BAMFileIterator(final boolean advance) {
        this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
        this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
        if (advance) {
            advance();
        }
    }

    /** Releases this iterator's claim on the reader so another iterator may be opened. */
    public void close() {
        if (mCurrentIterator != null && this != mCurrentIterator) {
            throw new IllegalStateException("Attempt to close non-current iterator");
        }
        mCurrentIterator = null;
    }

    public boolean hasNext() {
        return (mNextRecord != null);
    }

    /**
     * @return the next record in the file.
     * @throws NoSuchElementException if the iterator is exhausted.
     */
    public SAMRecord next() {
        // Honor the Iterator contract: fail fast rather than silently returning
        // null once the stream has been exhausted.
        if (mNextRecord == null) {
            throw new NoSuchElementException("BAMFileIterator: no next element available");
        }
        final SAMRecord result = mNextRecord;
        advance();
        return result;
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported: remove");
    }

    /** Reads the look-ahead record, applying validation and optional eager decoding. */
    void advance() {
        try {
            mNextRecord = getNextRecord();
            if (mNextRecord != null) {
                ++this.samRecordIndex;
                // Because some decoding is done lazily, the record needs to remember the validation stringency.
                mNextRecord.setValidationStringency(mValidationStringency);
                if (mValidationStringency != ValidationStringency.SILENT) {
                    final List<SAMValidationError> validationErrors = mNextRecord.isValid();
                    SAMUtils.processValidationErrors(validationErrors,
                            this.samRecordIndex, BAMFileReader.this.getValidationStringency());
                }
            }
            if (eagerDecode && mNextRecord != null) {
                mNextRecord.eagerDecode();
            }
        } catch (IOException exc) {
            throw new RuntimeException(exc.getMessage(), exc);
        }
    }

    /**
     * Read the next record from the input stream, optionally stamping it with
     * its origin (reader plus virtual-file-pointer span).
     */
    SAMRecord getNextRecord() throws IOException {
        final long startCoordinate = mInputStream.getFilePointer();
        final SAMRecord next = bamRecordCodec.decode();
        final long stopCoordinate = mInputStream.getFilePointer();
        if (mFileReader != null && next != null)
            next.setFileSource(new SAMFileSource(mFileReader, new BAMFileSpan(new Chunk(startCoordinate, stopCoordinate))));
        return next;
    }

    /**
     * @return The record that will be returned by the next call to next()
     */
    protected SAMRecord peek() {
        return mNextRecord;
    }
}
/**
* Prepare to iterate through SAMRecords matching the target interval.
* @param sequence Desired reference sequence.
* @param start 1-based start of target interval, inclusive.
* @param end 1-based end of target interval, inclusive.
* @param queryType contained, overlapping, or starting-at query.
*/
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
final int start,
final int end,
final QueryType queryType) {
long[] filePointers = null;
// Hit the index to determine the chunk boundaries for the required data.
// An unknown sequence name leaves filePointers null, yielding an empty iteration.
final SAMFileHeader fileHeader = getFileHeader();
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMIndex fileIndex = getIndex();
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
}
// Create an iterator over the above chunk boundaries.
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
// Add some preprocessing filters for edge-case reads that don't fit into this
// query type.
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
}
// CONTAINED: alignment fully within the interval; OVERLAPPING: any overlap;
// STARTING_AT: alignment start equals the interval start.
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
/**
 * Look for BAM index file according to standard naming convention.
 *
 * @param dataFile BAM file name.
 * @return Index file name, or null if not found.
 */
private static File findIndexFile(final File dataFile) {
    final String bamExtension = ".bam";
    final String fileName = dataFile.getName();
    // First choice: foo.bam -> foo.bai (extension replaced).
    if (fileName.endsWith(bamExtension)) {
        final String stem = fileName.substring(0, fileName.length() - bamExtension.length());
        final File extensionReplaced = new File(dataFile.getParent(), stem + BAMIndex.BAMIndexSuffix);
        if (extensionReplaced.exists()) {
            return extensionReplaced;
        }
    }
    // Second choice: foo.bam -> foo.bam.bai (suffix appended).
    final File suffixAppended = new File(dataFile.getParent(), fileName + ".bai");
    return suffixAppended.exists() ? suffixAppended : null;
}
private class BAMFileIndexIterator extends BAMFileIterator {
private long[] mFilePointers = null;
private int mFilePointerIndex = 0;
private long mFilePointerLimit = -1;
/**
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
* @param filePointers the block / offset combination, stored in chunk format.
*/
BAMFileIndexIterator(final long[] filePointers) {
super(false); // delay advance() until after construction
mFilePointers = filePointers;
advance();
}
SAMRecord getNextRecord()
throws IOException {
// Advance to next file block if necessary
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
if (mFilePointers == null ||
mFilePointerIndex >= mFilePointers.length) {
return null;
}
final long startOffset = mFilePointers[mFilePointerIndex++];
final long endOffset = mFilePointers[mFilePointerIndex++];
mInputStream.seek(startOffset);
mFilePointerLimit = endOffset;
}
// Pull next record from stream
return super.getNextRecord();
}
}
/**
* A decorating iterator that filters out records that are outside the bounds of the
* given query parameters.
*/
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private final CloseableIterator<SAMRecord> wrappedIterator;
/**
* The next record to be returned. Will be null if no such record exists.
*/
private SAMRecord mNextRecord;
// Reference index resolved from the query sequence name; -1 if unknown.
private final int mReferenceIndex;
private final int mRegionStart;
// Exclusive upper coordinate bound; Integer.MAX_VALUE when end was <= 0 (to end of reference).
private final int mRegionEnd;
private final QueryType mQueryType;
/**
* @param iterator underlying record source (typically a BAMFileIndexIterator).
* @param sequence reference sequence name of the query.
* @param start 1-based inclusive start of the query interval.
* @param end 1-based inclusive end; ignored for STARTING_AT queries, and <= 0 means end of reference.
* @param queryType contained, overlapping, or starting-at semantics.
*/
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
this.wrappedIterator = iterator;
final SAMFileHeader fileHeader = getFileHeader();
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
mRegionStart = start;
if (queryType == QueryType.STARTING_AT) {
mRegionEnd = mRegionStart;
} else {
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
}
mQueryType = queryType;
mNextRecord = advance();
}
/**
* Returns true if a next element exists; false otherwise.
*/
public boolean hasNext() {
return mNextRecord != null;
}
/**
* Gets the next record from the given iterator.
* @return The next SAM record in the iterator.
* @throws NoSuchElementException if no matching record remains.
*/
public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
final SAMRecord currentRead = mNextRecord;
mNextRecord = advance();
return currentRead;
}
/**
* Closes down the existing iterator.
*/
public void close() {
if (this != mCurrentIterator) {
throw new IllegalStateException("Attempt to close non-current iterator");
}
mCurrentIterator = null;
}
/**
* @throws UnsupportedOperationException always.
*/
public void remove() {
throw new UnsupportedOperationException("Not supported: remove");
}
/**
* Scans forward to the next record matching the query, or null when iteration
* should end.  Relies on coordinate-sorted input: once a record lies past the
* target reference or region, no later record can match.
*/
SAMRecord advance() {
while (true) {
// Pull next record from stream
if(!wrappedIterator.hasNext())
return null;
final SAMRecord record = wrappedIterator.next();
// If beyond the end of this reference sequence, end iteration
final int referenceIndex = record.getReferenceIndex();
if (referenceIndex != mReferenceIndex) {
if (referenceIndex < 0 ||
referenceIndex > mReferenceIndex) {
return null;
}
// If before this reference sequence, continue
continue;
}
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
// Quick exit to avoid expensive alignment end calculation
return record;
}
final int alignmentStart = record.getAlignmentStart();
// If read is unmapped but has a coordinate, return it if the coordinate is within
// the query region, regardless of whether the mapped mate will be returned.
final int alignmentEnd;
if (mQueryType == QueryType.STARTING_AT) {
// End coordinate is irrelevant for STARTING_AT; skip the computation.
alignmentEnd = -1;
} else {
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
record.getAlignmentEnd(): alignmentStart);
}
if (alignmentStart > mRegionEnd) {
// If scanned beyond target region, end iteration
return null;
}
// Filter for overlap with region
if (mQueryType == QueryType.CONTAINED) {
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
return record;
}
} else if (mQueryType == QueryType.OVERLAPPING) {
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
return record;
}
} else {
if (alignmentStart == mRegionStart) {
return record;
}
}
}
}
}
/**
* Iterator positioned at the unmapped reads: on construction it skips forward past
* any records that still carry a reference index, leaving the iterator at the first
* record whose reference index is NO_ALIGNMENT_REFERENCE_INDEX.
*/
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
private BAMFileIndexUnmappedIterator() {
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
advance();
}
}
}
}

View File

@ -25,6 +25,7 @@
package net.sf.samtools;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
super();
}
/**
* Create a new GATKBAMFileSpan from an existing BAMFileSpan, wrapping each
* contained Chunk in a GATKChunk if it is not one already.
* @param sourceFileSpan the span whose chunks should be copied into this span.
* @throws SAMException if the given span is not a BAMFileSpan.
*/
public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
if(!(sourceFileSpan instanceof BAMFileSpan))
throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
for(Chunk chunk: sourceBAMFileSpan.getChunks())
add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
}
/**
* Convenience constructor to construct a BAM file span from
* a single chunk.

View File

@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
super.setChunkEnd(value);
}
/**
 * Returns the compressed-block portion of the chunk start.  BGZF virtual file
 * offsets pack the compressed block address into the upper 48 bits and the
 * offset within the uncompressed block into the lower 16 bits.
 */
public long getBlockStart() {
    return getChunkStart() >>> 16;
}

/** Returns the within-block (uncompressed) offset portion of the chunk start. */
public int getBlockOffsetStart() {
    return (int)(getChunkStart() & 0xFFFF);
}

/** Returns the compressed-block portion of the chunk end. */
public long getBlockEnd() {
    return getChunkEnd() >>> 16;
}

/** Returns the within-block (uncompressed) offset portion of the chunk end. */
public int getBlockOffsetEnd() {
    // Mask before narrowing, for consistency with getBlockOffsetStart();
    // the value is identical either way since only the low 16 bits survive.
    return (int)(getChunkEnd() & 0xFFFF);
}
/**
* Computes an approximation of the uncompressed size of the
* chunk, in bytes. Can be used to determine relative weights

View File

@ -0,0 +1,72 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.IOException;
/**
 * An input stream formulated for use reading BAM files.  Supports BGZF
 * virtual-file-pointer positioning (seek/getFilePointer), optional CRC
 * verification, and the standard InputStream operations.
 */
public interface BAMInputStream {
    /**
     * Seek to the given position in the file.  Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     */
    public void seek(final long pos) throws IOException;

    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position.  This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    public long getFilePointer();

    /**
     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
     * and check it against the value stored in the GZIP header.  CRC checking is an expensive
     * operation and should be used accordingly.
     */
    public void setCheckCrcs(final boolean check);

    // Standard InputStream operations, restated here so implementations can be
    // used wherever an InputStream-like contract is required.
    public int read() throws IOException;
    public int read(byte[] buffer) throws IOException;
    public int read(byte[] buffer, int offset, int length) throws IOException;
    public long skip(long count) throws IOException;
    public int available() throws IOException;
    public void close() throws IOException;
    public void mark(int readLimit);
    public void reset() throws IOException;
    public boolean markSupported();
}

View File

@ -0,0 +1,483 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import net.sf.samtools.FileTruncatedException;
/*
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
*
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
*/
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {

    // Exactly one of mStream / mFile is non-null; a non-null mFile means seek() is supported.
    private InputStream mStream = null;
    private SeekableStream mFile = null;
    // Scratch buffer holding the raw (compressed) bytes of one BGZF block.
    private byte[] mFileBuffer = null;
    // Uncompressed contents of the current block; null until the first block has been read.
    private byte[] mCurrentBlock = null;
    // Read position within mCurrentBlock.
    private int mCurrentOffset = 0;
    // Offset in the compressed stream of the block currently held in mCurrentBlock.
    private long mBlockAddress = 0;
    // Compressed length of the most recently read block (0 before the first read).
    private int mLastBlockLength = 0;
    private final BlockGunzipper blockGunzipper = new BlockGunzipper();

    /**
     * Note that seek() is not supported if this ctor is used.
     */
    public BlockCompressedInputStream(final InputStream stream) {
        mStream = IOUtil.toBufferedStream(stream);
        mFile = null;
    }

    /**
     * Use this ctor if you wish to call seek()
     */
    public BlockCompressedInputStream(final File file)
        throws IOException {
        mFile = new SeekableFileStream(file);
        mStream = null;
    }

    public BlockCompressedInputStream(final URL url) {
        mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
        mStream = null;
    }

    /**
     * For providing some arbitrary data source.  No additional buffering is
     * provided, so if the underlying source is not buffered, wrap it in a
     * SeekableBufferedStream before passing to this ctor.
     */
    public BlockCompressedInputStream(final SeekableStream strm) {
        mFile = strm;
        mStream = null;
    }

    /**
     * Determines whether or not the inflater will re-calculated the CRC on the decompressed data
     * and check it against the value stored in the GZIP header.  CRC checking is an expensive
     * operation and should be used accordingly.
     */
    public void setCheckCrcs(final boolean check) {
        this.blockGunzipper.setCheckCrcs(check);
    }

    /**
     * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
     * next caller of a method for this input stream. The next caller might be the same thread or another thread.
     * Note that although the next caller can read this many bytes without blocking, the available() method call itself
     * may block in order to fill an internal buffer if it has been exhausted.
     */
    public int available()
        throws IOException {
        // Pull in the next block lazily when the current one is exhausted (or none read yet).
        if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
            readBlock();
        }
        if (mCurrentBlock == null) {
            return 0;
        }
        return mCurrentBlock.length - mCurrentOffset;
    }

    /**
     * Closes the underlying InputStream or RandomAccessFile
     */
    public void close()
        throws IOException {
        if (mFile != null) {
            mFile.close();
            mFile = null;
        } else if (mStream != null) {
            mStream.close();
            mStream = null;
        }
        // Encourage garbage collection
        mFileBuffer = null;
        mCurrentBlock = null;
    }

    /**
     * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
     * If no byte is available because the end of the stream has been reached, the value -1 is returned.
     * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
     * @return the next byte of data, or -1 if the end of the stream is reached.
     */
    public int read()
        throws IOException {
        // Mask with 0xFF: without it a byte >= 0x80 sign-extends to a negative int,
        // violating the InputStream contract above and possibly being mistaken for EOF.
        return (available() > 0) ? (mCurrentBlock[mCurrentOffset++] & 0xFF) : -1;
    }

    /**
     * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
     * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
     * or an exception is thrown.
     *
     * read(buf) has the same effect as read(buf, 0, buf.length).
     *
     * @param buffer the buffer into which the data is read.
     * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer)
        throws IOException {
        return read(buffer, 0, buffer.length);
    }

    // Lazily-created scratch buffer for readLine(); reset and reused on every call.
    private volatile ByteArrayOutputStream buf = null;
    private static final byte eol = '\n';
    private static final byte eolCr = '\r';

    /**
     * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
     * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
     *
     * @return A String containing the contents of the line, excluding the line terminating
     * character, or null if the end of the stream has been reached
     *
     * @exception IOException If an I/O error occurs
     */
    public String readLine() throws IOException {
        int available = available();
        if (available == 0) {
            return null;
        }
        if(null == buf){ // lazy initialisation
            buf = new ByteArrayOutputStream(8192);
        }
        buf.reset();
        boolean done = false;
        boolean foundCr = false; // \r found flag; may carry across block boundaries
        while (!done) {
            int linetmpPos = mCurrentOffset;
            int bCnt = 0;   // count of content bytes (terminators excluded)
            while((available-- > 0)){
                final byte c = mCurrentBlock[linetmpPos++];
                if(c == eol){ // found \n (or the \n of a \r\n pair)
                    done = true;
                    break;
                } else if(foundCr){ // previous char was \r and this one is not \n:
                    --linetmpPos;   // lone \r terminates the line; put current char back
                    done = true;
                    break;
                } else if(c == eolCr){ // found \r
                    foundCr = true;
                    continue; // no ++bCnt -- the \r is not part of the line content
                }
                ++bCnt;
            }
            // Copy the content bytes consumed from this block and advance the stream position.
            if(mCurrentOffset < linetmpPos){
                buf.write(mCurrentBlock, mCurrentOffset, bCnt);
                mCurrentOffset = linetmpPos;
            }
            // Refill from the next block; the line (or a split \r\n) may continue there.
            available = available();
            if(available == 0){
                // EOF
                done = true;
            }
        }
        // NOTE(review): uses the platform default charset -- presumably fine for BAM/SAM
        // ASCII headers, but confirm if non-ASCII data is ever expected here.
        return buf.toString();
    }

    /**
     * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
     * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
     *
     * This method blocks until input data is available, end of file is detected, or an exception is thrown.
     *
     * @param buffer buffer into which data is read.
     * @param offset the start offset in array b at which the data is written.
     * @param length the maximum number of bytes to read.
     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer, int offset, int length)
        throws IOException {
        final int originalLength = length;
        while (length > 0) {
            final int available = available();
            if (available == 0) {
                // Signal EOF to caller only if nothing at all was read.
                if (originalLength == length) {
                    return -1;
                }
                break;
            }
            final int copyLength = Math.min(length, available);
            System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
            mCurrentOffset += copyLength;
            offset += copyLength;
            length -= copyLength;
        }
        return originalLength - length;
    }

    /**
     * Seek to the given position in the file.  Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     */
    public void seek(final long pos)
        throws IOException {
        if (mFile == null) {
            throw new IOException("Cannot seek on stream based file");
        }
        // Decode virtual file pointer
        // Upper 48 bits is the byte offset into the compressed stream of a block.
        // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
        final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
        final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
        final int available;
        if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
            // Target block is already loaded; just reposition within it.
            available = mCurrentBlock.length;
        } else {
            mFile.seek(compressedOffset);
            mBlockAddress = compressedOffset;
            mLastBlockLength = 0;
            readBlock();
            available = available();
        }
        // Offset may equal block length only at physical EOF (pointer to end of data).
        if (uncompressedOffset > available ||
                (uncompressedOffset == available && !eof())) {
            throw new IOException("Invalid file pointer: " + pos);
        }
        mCurrentOffset = uncompressedOffset;
    }

    private boolean eof() throws IOException {
        if (mFile.eof()) {
            return true;
        }
        // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
        return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
    }

    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position.  This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    public long getFilePointer() {
        // Guard against NPE when no block has been read yet; (0, 0) is the start of the file.
        if (mCurrentBlock != null && mCurrentOffset == mCurrentBlock.length) {
            // If current offset is at the end of the current block, file pointer should point
            // to the beginning of the next block.
            return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
        }
        return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
    }

    public static long getFileBlock(final long bgzfOffset) {
        return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
    }

    /**
     * @param stream Must be at start of file.  Throws RuntimeException if !stream.markSupported().
     * @return true if the given file looks like a valid BGZF file.
     */
    public static boolean isValidFile(final InputStream stream)
        throws IOException {
        if (!stream.markSupported()) {
            throw new RuntimeException("Cannot test non-buffered stream");
        }
        stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
        final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        stream.reset();
        return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
    }

    // Checks the fixed GZIP/BGZF magic bytes and extra-field markers of a block header.
    private static boolean isValidBlockHeader(final byte[] buffer) {
        return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
                (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
                (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
                buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
                buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
                buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
    }

    // Reads and inflates the next BGZF block, updating mCurrentBlock / mBlockAddress /
    // mLastBlockLength.  A zero-byte read produces an empty block (EOF with no terminator).
    private void readBlock()
        throws IOException {
        if (mFileBuffer == null) {
            mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
        }
        int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        if (count == 0) {
            // Handle case where there is no empty gzip block at end.
            mCurrentOffset = 0;
            mBlockAddress += mLastBlockLength;
            mCurrentBlock = new byte[0];
            return;
        }
        if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
            throw new IOException("Premature end of file");
        }
        // BSIZE field stores (total block size - 1).
        final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
        if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
            throw new IOException("Unexpected compressed block length: " + blockLength);
        }
        final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
        count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
        if (count != remaining) {
            throw new FileTruncatedException("Premature end of file");
        }
        inflateBlock(mFileBuffer, blockLength);
        mCurrentOffset = 0;
        mBlockAddress += mLastBlockLength;
        mLastBlockLength = blockLength;
    }

    // Inflates one compressed block into mCurrentBlock, reusing the previous buffer
    // when the uncompressed size matches.
    private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
        throws IOException {
        // ISIZE (uncompressed length) is stored in the last 4 bytes of the block.
        final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
        byte[] buffer = mCurrentBlock;
        mCurrentBlock = null;
        if (buffer == null || buffer.length != uncompressedLength) {
            try {
                buffer = new byte[uncompressedLength];
            } catch (NegativeArraySizeException e) {
                throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
            }
        }
        blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
        mCurrentBlock = buffer;
    }

    // Dispatches to whichever source (seekable file or plain stream) this instance wraps.
    private int readBytes(final byte[] buffer, final int offset, final int length)
        throws IOException {
        if (mFile != null) {
            return readBytes(mFile, buffer, offset, length);
        } else if (mStream != null) {
            return readBytes(mStream, buffer, offset, length);
        } else {
            return 0;
        }
    }

    // Reads up to length bytes, looping over short reads; returns the count actually read.
    private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    // Reads up to length bytes, looping over short reads; returns the count actually read.
    private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    // Little-endian unsigned 16-bit value at offset.
    private int unpackInt16(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8));
    }

    // Little-endian 32-bit value at offset.
    private int unpackInt32(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8) |
                ((buffer[offset+2] & 0xFF) << 16) |
                ((buffer[offset+3] & 0xFF) << 24));
    }

    public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}

    /**
     * Inspects the end of a BGZF file to classify how it is terminated: by the
     * canonical empty terminator block, by a well-formed final block, or neither
     * (truncated/defective).
     */
    public static FileTermination checkTermination(final File file)
        throws IOException {
        final long fileSize = file.length();
        if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
            return FileTermination.DEFECTIVE;
        }
        final RandomAccessFile raFile = new RandomAccessFile(file, "r");
        try {
            raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
            byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
            raFile.readFully(buf);
            if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
                return FileTermination.HAS_TERMINATOR_BLOCK;
            }
            // No terminator block: scan backwards from EOF for the start of the last block.
            final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
            buf = new byte[bufsize];
            raFile.seek(fileSize - bufsize);
            // readFully (not read): a short read here would silently corrupt the scan.
            raFile.readFully(buf);
            for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
                    i >= 0; --i) {
                if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
                        buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
                    continue;
                }
                final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
                final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
                // Healthy only if the candidate block extends exactly to the end of the file.
                if (buf.length - i == totalBlockSizeMinusOne + 1) {
                    return FileTermination.HAS_HEALTHY_LAST_BLOCK;
                } else {
                    return FileTermination.DEFECTIVE;
                }
            }
            return FileTermination.DEFECTIVE;
        } finally {
            raFile.close();
        }
    }

    // True if buf[startOffset .. startOffset+length) matches preamble[0 .. length).
    private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
        for (int i = 0; i < length; ++i) {
            if (preamble[i] != buf[i + startOffset]) {
                return false;
            }
        }
        return true;
    }
}

View File

@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
* used to indicate an error occured * used to indicate an error occured
* *
* @param msg the message * @param msg the message
* @param e the error * @param t the error
*/ */
public static void exitSystemWithError(String msg, final Exception e) { public static void exitSystemWithError(String msg, final Throwable t) {
errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("stack trace %n"); errorPrintf("stack trace %n");
e.printStackTrace(); t.printStackTrace();
errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
@ -394,8 +394,8 @@ public abstract class CommandLineProgram {
* *
* @param e the exception occured * @param e the exception occured
*/ */
public static void exitSystemWithError(Exception e) { public static void exitSystemWithError(Throwable t) {
exitSystemWithError(e.getMessage(), e); exitSystemWithError(t.getMessage(), t);
} }
/** /**

View File

@ -99,8 +99,8 @@ public class CommandLineGATK extends CommandLineExecutable {
} catch (net.sf.samtools.SAMException e) { } catch (net.sf.samtools.SAMException e) {
// Let's try this out and see how it is received by our users // Let's try this out and see how it is received by our users
exitSystemWithSamError(e); exitSystemWithSamError(e);
} catch (Exception e) { } catch (Throwable t) {
exitSystemWithError(e); exitSystemWithError(t);
} }
} }

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.FilterManager;
@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
*/ */
private Collection<ReadFilter> filters; private Collection<ReadFilter> filters;
/**
* Controls the allocation of threads between CPU vs IO.
*/
private ThreadAllocation threadAllocation;
/** /**
* A currently hacky unique name for this GATK instance * A currently hacky unique name for this GATK instance
*/ */
@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed) if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis()); resetRandomGenerator(System.currentTimeMillis());
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();
// Prepare the data for traversal. // Prepare the data for traversal.
initializeDataSources(); initializeDataSources();
@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
// create the output streams " // create the output streams "
initializeOutputStreams(microScheduler.getOutputTracker()); initializeOutputStreams(microScheduler.getOutputTracker());
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results // execute the microscheduler, storing the results
return microScheduler.execute(this.walker, shardStrategy); return microScheduler.execute(this.walker, shardStrategy);
@ -266,6 +275,16 @@ public class GenomeAnalysisEngine {
return Collections.unmodifiableList(filters); return Collections.unmodifiableList(filters);
} }
/**
* Parse out the thread allocation from the given command-line argument.
*/
private void determineThreadAllocation() {
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null;
Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null;
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
}
/** /**
* Allow subclasses and others within this package direct access to the walker manager. * Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package. * @return The walker manager used by this package.
@ -286,7 +305,7 @@ public class GenomeAnalysisEngine {
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
} }
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
} }
protected DownsamplingMethod getDownsamplingMethod() { protected DownsamplingMethod getDownsamplingMethod() {
@ -397,103 +416,49 @@ public class GenomeAnalysisEngine {
* @param intervals intervals * @param intervals intervals
* @return the sharding strategy * @return the sharding strategy
*/ */
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
// sharding system; it's required with the new sharding system only for locus walkers. // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
if(readsDataSource != null && !readsDataSource.hasIndex() ) { if(!readsDataSource.isEmpty()) {
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
Shard.ShardType shardType;
if(walker instanceof LocusWalker) { if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
shardType = Shard.ShardType.LOCUS; if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.
if(walker instanceof ReadPairWalker) {
if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
if(intervals != null && !intervals.isEmpty())
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
}
if(intervals == null)
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
} }
else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
shardType = Shard.ShardType.READ;
else else
throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files"); throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
}
List<GenomeLoc> region; else {
if(intervals != null) final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000;
region = intervals.toList(); if(intervals == null)
else { return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
region = new ArrayList<GenomeLoc>(); else
for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences()) return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
}
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
} }
ShardStrategy shardStrategy;
ShardStrategyFactory.SHATTER_STRATEGY shardType;
long SHARD_SIZE = 100000L;
if (walker instanceof LocusWalker) {
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
if (intervals != null && !intervals.isEmpty()) {
if (readsDataSource == null)
throw new IllegalArgumentException("readsDataSource is null");
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser(),
intervals);
} else
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,getGenomeLocParser());
} else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
if (intervals != null && !intervals.isEmpty()) {
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser(),
intervals);
} else {
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser());
}
} else if (walker instanceof ReadPairWalker) {
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
if(intervals != null && !intervals.isEmpty())
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser());
} else
throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
return shardStrategy;
} }
protected boolean flashbackData() { protected boolean flashbackData() {
@ -751,6 +716,8 @@ public class GenomeAnalysisEngine {
return new SAMDataSource( return new SAMDataSource(
samReaderIDs, samReaderIDs,
threadAllocation,
argCollection.numberOfBAMFileHandles,
genomeLocParser, genomeLocParser,
argCollection.useOriginalBaseQualities, argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel, argCollection.strictnessLevel,
@ -763,8 +730,7 @@ public class GenomeAnalysisEngine {
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(), getWalkerBAQQualityMode(),
refReader, refReader,
argCollection.defaultBaseQualities, argCollection.defaultBaseQualities);
!argCollection.disableLowMemorySharding);
} }
/** /**

View File

@ -194,10 +194,14 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe; public ValidationExclusion.TYPE unsafe;
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false) /** How many threads should be allocated to this analysis. */
public int numberOfThreads = 1; @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
public Integer numberOfThreads = 1;
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false) @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
public Integer numberOfBAMFileHandles = null;
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
public List<String> readGroupBlackList = null; public List<String> readGroupBlackList = null;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@ -292,9 +296,6 @@ public class GATKArgumentCollection {
@Hidden @Hidden
public boolean allowIntervalsWithUnindexedBAM = false; public boolean allowIntervalsWithUnindexedBAM = false;
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
public boolean disableLowMemorySharding = false;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// methods // methods
@ -365,7 +366,11 @@ public class GATKArgumentCollection {
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
return false; return false;
} }
if (other.numberOfThreads != this.numberOfThreads) { if (!other.numberOfThreads.equals(this.numberOfThreads)) {
return false;
}
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
return false; return false;
} }
if (other.intervalMerging != this.intervalMerging) { if (other.intervalMerging != this.intervalMerging) {
@ -389,9 +394,6 @@ public class GATKArgumentCollection {
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false; return false;
if (disableLowMemorySharding != other.disableLowMemorySharding)
return false;
return true; return true;
} }

View File

@ -1,128 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.broadinstitute.sting.utils.exceptions.StingException;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 7, 2011
* Time: 2:46:34 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Iterates over the starting file offsets of each BGZF block in a BAM file.
 * The first offset returned is always 0; iteration ends once the end of the
 * underlying file has been reached.
 */
public class BAMBlockStartIterator implements Iterator<Long> {
    /**
     * How large is a BGZF header?
     */
    private static final int BGZF_HEADER_SIZE = 18;

    /**
     * Where within the header does the BSIZE field (total block size - 1) actually live?
     */
    private static final int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;

    /** Channel over the BAM file being scanned. */
    private FileChannel bamInputChannel;

    /** Scratch buffer used to read each BGZF block header. */
    private ByteBuffer headerByteBuffer;

    /** Offset of the next block to report, or -1 once the file is exhausted. */
    private long nextLocation = 0;

    public BAMBlockStartIterator(File bamFile) {
        try {
            FileInputStream bamInputStream = new FileInputStream(bamFile);
            bamInputChannel = bamInputStream.getChannel();
            headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
            headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
        }
        catch(IOException ex) {
            throw new StingException("Could not open file",ex);
        }
    }

    public boolean hasNext() {
        return nextLocation != -1;
    }

    public Long next() {
        if(!hasNext())
            throw new NoSuchElementException("No more BGZF blocks are available in this file");
        long currentLocation = nextLocation;
        advance();
        return currentLocation;
    }

    public void remove() {
        throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
    }

    /**
     * Reads the BGZF header at the current channel position and repositions the
     * channel (and nextLocation) at the start of the following block, or marks
     * iteration complete once the end of the file has been reached.
     */
    private void advance() {
        headerByteBuffer.clear();
        try {
            // A single read() is not guaranteed to fill the buffer; loop until
            // the full header has been read or the end of the file is reached.
            int readStatus = 0;
            while(headerByteBuffer.hasRemaining() && readStatus != -1)
                readStatus = bamInputChannel.read(headerByteBuffer);
        }
        catch(IOException ex) {
            throw new StingException("Could not read header data",ex);
        }
        if(headerByteBuffer.hasRemaining()) {
            // End of file (or a truncated trailing header); stop iterating.
            nextLocation = -1;
            try {
                bamInputChannel.close();
            }
            catch(IOException ex) {
                throw new StingException("Could not close input file",ex);
            }
            return;
        }
        headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
        // BSIZE is an unsigned 16-bit value; mask so that block sizes > 32767
        // bytes (common for BGZF, which allows up to 65536) don't go negative.
        int blockSize = headerByteBuffer.getShort() & 0xFFFF;
        try {
            // BSIZE stores (total block size - 1), so the next block begins at
            // current position + blockSize - BGZF_HEADER_SIZE + 1.
            bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
            nextLocation = bamInputChannel.position();
        }
        catch(IOException ex) {
            throw new StingException("Could not reposition input stream",ex);
        }
    }

    /** Debugging harness: prints the block start offsets of the BAM file named on the command line. */
    public static void main(String argv[]) throws IOException {
        BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File(argv[0]));
        int i = 0;
        while(blockStartIterator.hasNext())
            System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
    }
}

View File

@ -1,195 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.LinearIndex;
import java.util.*;
/**
* Represents the contents of a bam index file for one reference.
* A BAM index (.bai) file contains information for all references in the bam file.
* This class describes the data present in the index file for one of these references;
* including the bins, chunks, and linear index.
*/
class BAMIndexContent {
    /**
     * The reference sequence for the data currently loaded.
     */
    private final int mReferenceSequence;

    /**
     * A list of all bins in the above reference sequence.
     */
    private final BinList mBinList;

    /**
     * The linear index for the reference sequence above.
     */
    private final LinearIndex mLinearIndex;

    /**
     * @param referenceSequence Content corresponds to this reference.
     * @param bins Array of bins represented by this content, possibly sparse
     * @param numberOfBins Number of non-null bins
     * @param linearIndex Additional index used to optimize queries
     */
    BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
        this.mReferenceSequence = referenceSequence;
        this.mBinList = new BinList(bins, numberOfBins);
        this.mLinearIndex = linearIndex;
    }

    /**
     * Reference for this Content
     */
    public int getReferenceSequence() {
        return mReferenceSequence;
    }

    /**
     * Does this content have anything in this bin?
     */
    public boolean containsBin(final GATKBin bin) {
        return mBinList.getBin(bin.getBinNumber()) != null;
    }

    /**
     * @return iterable list of bins represented by this content
     */
    public BinList getBins() {
        return mBinList;
    }

    /**
     * @return the number of non-null bins represented by this content
     */
    int getNumberOfNonNullBins() {
        return mBinList.getNumberOfNonNullBins();
    }

    /**
     * Flattens the chunk lists of every non-null bin into a single list.
     * @return an unmodifiable list of all chunks associated with all bins in this content
     */
    public List<GATKChunk> getAllChunks() {
        List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
        // The BinList iterator only yields non-null bins, but a bin may still
        // carry a null chunk list; skip those.
        for (GATKBin b : mBinList)
            if (b.getChunkList() != null) {
                allChunks.addAll(Arrays.asList(b.getChunkList()));
            }
        return Collections.unmodifiableList(allChunks);
    }

    /**
     * @return the linear index represented by this content
     */
    public LinearIndex getLinearIndex() {
        return mLinearIndex;
    }

    /**
     * This class is used to encapsulate the list of Bins store in the BAMIndexContent
     * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
     */
    class BinList implements Iterable<GATKBin> {
        // Sparse storage: the index into the array is the bin number, so most
        // entries are expected to be null.
        private final GATKBin[] mBinArray;

        // Count of populated (non-null) entries in mBinArray.
        public final int numberOfNonNullBins;

        public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based

        /**
         * @param binArray a sparse array representation of the bins. The index into the array is the bin number.
         * @param numberOfNonNullBins number of populated (non-null) entries in binArray
         */
        BinList(GATKBin[] binArray, int numberOfNonNullBins) {
            this.mBinArray = binArray;
            this.numberOfNonNullBins = numberOfNonNullBins;
            this.maxBinNumber = mBinArray.length - 1;
        }

        /**
         * Looks up a bin by number.
         * @return the bin, or null if the number is out of range or the slot is empty
         */
        GATKBin getBin(int binNumber) {
            if (binNumber > maxBinNumber) return null;
            return mBinArray[binNumber];
        }

        int getNumberOfNonNullBins() {
            return numberOfNonNullBins;
        }

        /**
         * Gets an iterator over all non-null bins.
         *
         * @return An iterator over all bins.
         */
        public Iterator<GATKBin> iterator() {
            return new BinIterator();
        }

        private class BinIterator implements Iterator<GATKBin> {
            /**
             * Stores the bin # of the Bin currently in use.
             */
            private int nextBin;

            public BinIterator() {
                nextBin = 0;
            }

            /**
             * Are there more bins in this set, waiting to be returned?
             * NOTE: this method advances nextBin past empty slots as a side
             * effect, so next() relies on hasNext() having positioned it on a
             * populated entry.
             *
             * @return True if more bins are remaining.
             */
            public boolean hasNext() {
                while (nextBin <= maxBinNumber) {
                    if (getBin(nextBin) != null) return true;
                    nextBin++;
                }
                return false;
            }

            /**
             * Gets the next bin in the provided BinList.
             *
             * @return the next available bin in the BinList.
             */
            public GATKBin next() {
                if (!hasNext())
                    throw new NoSuchElementException("This BinIterator is currently empty");
                GATKBin result = getBin(nextBin);
                nextBin++;
                return result;
            }

            public void remove() {
                throw new UnsupportedOperationException("Unable to remove from a bin iterator");
            }
        }
    }
}

View File

@ -1,29 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.Bin;
import java.util.HashMap;
import java.util.Map;
/**
* Models a bin at which all BAM files in the merged input stream overlap.
*/
/**
 * Models a region over which every BAM file in the merged input stream has
 * overlapping data, tracking the index bin each reader assigns to that region.
 */
class BAMOverlap {
    /** Start coordinate of the overlapping region. */
    public final int start;

    /** Stop coordinate of the overlapping region. */
    public final int stop;

    /** The bin covering this region, keyed by the reader that owns it. */
    private final Map<SAMReaderID,Bin> binsByReader;

    public BAMOverlap(final int start, final int stop) {
        this.start = start;
        this.stop = stop;
        this.binsByReader = new HashMap<SAMReaderID,Bin>();
    }

    /**
     * Records the bin that the given reader associates with this region.
     */
    public void addBin(final SAMReaderID id, final Bin bin) {
        binsByReader.put(id, bin);
    }

    /**
     * Looks up the bin recorded for the given reader.
     * @return the reader's bin, or null if none has been added
     */
    public Bin getBin(final SAMReaderID id) {
        return binsByReader.get(id);
    }
}

View File

@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
/** /**
* Create a new BAM schedule based on the given index. * Create a new BAM schedule based on the given index.
* @param indexFiles Index files. * @param dataSource The SAM data source to use.
* @param intervals List of * @param intervals List of
*/ */
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) { public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
if(intervals.isEmpty()) if(intervals.isEmpty())
throw new ReviewedStingException("Tried to write schedule for empty interval list."); throw new ReviewedStingException("Tried to write schedule for empty interval list.");
referenceSequence = intervals.get(0).getContigIndex(); referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
createScheduleFile(); createScheduleFile();
readerIDs.addAll(indexFiles.keySet()); readerIDs.addAll(dataSource.getReaderIDs());
for(final SAMReaderID reader: readerIDs) { for(final SAMReaderID reader: readerIDs) {
final GATKBAMIndex index = indexFiles.get(reader); final GATKBAMIndex index = dataSource.getIndex(reader);
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
if(selectedIterators.isEmpty()) if(selectedIterators.isEmpty())
return; return;
// Create the target schedule entry
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
// For each schedule entry with data, load the data into the merged schedule.
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader); PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
scheduleIterator.next(); scheduleIterator.next();
} }
// For each schedule entry without data, add a blank entry.
for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
}
nextScheduleEntry = mergedScheduleEntry; nextScheduleEntry = mergedScheduleEntry;
} }

View File

@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk; import net.sf.samtools.GATKChunk;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*; import java.util.*;
@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator<FilePointer> {
private FilePointer nextFilePointer = null; private FilePointer nextFilePointer = null;
private final GenomeLocSortedSet loci; private GenomeLocSortedSet loci;
private PeekableIterator<GenomeLoc> locusIterator;
private final PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus; private GenomeLoc currentLocus;
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
// Match only on sequence name; trust startup validation to make sure all the sequences match.
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
}
scheduler.populateFilteredIntervalList(intervals);
return scheduler;
}
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
scheduler.populateUnfilteredIntervalList(parser);
return scheduler;
}
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
scheduler.populateFilteredIntervalList(loci);
return scheduler;
}
private BAMScheduler(final SAMDataSource dataSource) {
this.dataSource = dataSource; this.dataSource = dataSource;
for(SAMReaderID reader: dataSource.getReaderIDs()) for(SAMReaderID reader: dataSource.getReaderIDs()) {
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); GATKBAMIndex index = dataSource.getIndex(reader);
if(index != null)
indexFiles.put(reader,dataSource.getIndex(reader));
}
}
/**
* The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
* @param loci The list of locations to search and iterate over.
*/
private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
this.loci = loci; this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator()); if(!indexFiles.isEmpty()) {
if(locusIterator.hasNext()) // If index data is available, start up the iterator.
currentLocus = locusIterator.next(); locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
advance(); if(locusIterator.hasNext())
currentLocus = locusIterator.next();
advance();
}
else {
// Otherwise, seed the iterator with a single file pointer over the entire region.
nextFilePointer = generatePointerOverEntireFileset();
for(GenomeLoc locus: loci)
nextFilePointer.addLocation(locus);
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
}
}
/**
* The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
* from just before the start of the region to the end of the region.
*/
private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
this.loci = new GenomeLocSortedSet(parser);
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
nextFilePointer = generatePointerOverEntireFileset();
}
/**
* Generate a span that runs from the end of the BAM header to the end of the fle.
* @return A file pointer over the specified region.
*/
private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer();
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
for(SAMReaderID reader: dataSource.getReaderIDs())
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
return filePointer;
} }
public boolean hasNext() { public boolean hasNext() {
@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(!hasNext()) if(!hasNext())
throw new NoSuchElementException("No next element available in interval sharder"); throw new NoSuchElementException("No next element available in interval sharder");
FilePointer currentFilePointer = nextFilePointer; FilePointer currentFilePointer = nextFilePointer;
nextFilePointer = null;
advance(); advance();
return currentFilePointer; return currentFilePointer;
} }
@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(loci.isEmpty()) if(loci.isEmpty())
return; return;
nextFilePointer = null;
while(nextFilePointer == null && currentLocus != null) { while(nextFilePointer == null && currentLocus != null) {
// special case handling of the unmapped shard. // special case handling of the unmapped shard.
if(currentLocus == GenomeLoc.UNMAPPED) { if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs()) for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
currentLocus = null; currentLocus = null;
continue; continue;
} }
@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
int coveredRegionStop = Integer.MAX_VALUE; int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null; GenomeLoc coveredRegion = null;
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
// No overlapping data at all. // No overlapping data at all.
if(scheduleEntry != null) { if(scheduleEntry != null) {
@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator<FilePointer> {
} }
else { else {
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
for(SAMReaderID reader: indexFiles.keySet()) for(SAMReaderID reader: indexFiles.keySet())
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
} }
@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
// Early exit if no bins were found. // Early exit if no bins were found.
if(coveredRegion == null) { if(coveredRegion == null) {
// for debugging only: maximum split is 16384. // for debugging only: maximum split is 16384.
if(currentLocus.size() > 16384) { nextFilePointer.addLocation(currentLocus);
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
nextFilePointer.addLocation(splitContigs[0]);
currentLocus = splitContigs[1];
}
else {
nextFilePointer.addLocation(currentLocus);
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
}
continue; continue;
} }
// Early exit if only part of the first interval was found. // Early exit if only part of the first interval was found.
if(currentLocus.startsBefore(coveredRegion)) { if(currentLocus.startsBefore(coveredRegion)) {
// for debugging only: maximum split is 16384.
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
GenomeLoc[] splitContigs = currentLocus.split(splitPoint); GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
nextFilePointer.addLocation(splitContigs[0]); nextFilePointer.addLocation(splitContigs[0]);
@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator<FilePointer> {
/** /**
* Get the next overlapping tree of bins associated with the given BAM file. * Get the next overlapping tree of bins associated with the given BAM file.
* @param indices BAM indices.
* @param currentLocus The actual locus for which to check overlap. * @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci. * @return The next schedule entry overlapping with the given list of loci.
*/ */
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) { private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
// Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
// we'll be using the correct contig index for the BAMs.
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
// Stale reference sequence or first invocation. (Re)create the binTreeIterator. // Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
if(bamScheduleIterator != null) if(bamScheduleIterator != null)
bamScheduleIterator.close(); bamScheduleIterator.close();
lastReferenceSequenceLoaded = currentLocus.getContigIndex(); lastReferenceSequenceLoaded = currentContigIndex;
// Naive algorithm: find all elements in current contig for proper schedule creation. // Naive algorithm: find all elements in current contig for proper schedule creation.
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>(); List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
for(GenomeLoc locus: loci) { for(GenomeLoc locus: loci) {
if(locus.getContigIndex() == lastReferenceSequenceLoaded) if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
lociInContig.add(locus); lociInContig.add(locus);
} }
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig)); bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
} }
if(!bamScheduleIterator.hasNext()) if(!bamScheduleIterator.hasNext())
@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
} }
/**
* Create a span from the given start point to the end of the file.
* @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset).
* @return A file span from the given point to the end of the file.
*/
private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
}
} }

View File

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.LinkedList;
import java.util.Queue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Preloads BGZF blocks in preparation for unzipping and data processing.
* TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
* TODO: be replaced with a central thread management strategy.
*/
/**
 * Preloads BGZF blocks in preparation for unzipping and data processing.
 * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
 * TODO: be replaced with a central thread management strategy.
 */
public class BGZFBlockLoadingDispatcher {
    /**
     * The file handle cache, used when allocating blocks from the dispatcher.
     */
    private final FileHandleCache fileHandleCache;

    /** Worker pool in which block loaders run. */
    private final ExecutorService threadPool;

    /** Pending block-load requests; all access is guarded by the queue's own monitor. */
    private final Queue<SAMReaderPosition> inputQueue;

    public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
        threadPool = Executors.newFixedThreadPool(numThreads);
        fileHandleCache = new FileHandleCache(numFileHandles);
        inputQueue = new LinkedList<SAMReaderPosition>();

        // NOTE(review): only a single BlockLoader is submitted even when
        // numThreads > 1 — confirm whether one loader per thread was intended.
        threadPool.execute(new BlockLoader(this,fileHandleCache,true));
    }

    /**
     * Initiates a request for a new block load.
     * @param readerPosition Position at which to load.
     */
    void queueBlockLoad(final SAMReaderPosition readerPosition) {
        synchronized(inputQueue) {
            inputQueue.add(readerPosition);
            // A single notify() is sufficient while only one BlockLoader consumes the queue.
            inputQueue.notify();
        }
    }

    /**
     * Claims the next work request from the queue, blocking until one is available.
     * @return The next work request.
     */
    SAMReaderPosition claimNextWorkRequest() {
        synchronized(inputQueue) {
            while(inputQueue.isEmpty()) {
                try {
                    inputQueue.wait();
                }
                catch(InterruptedException ex) {
                    // Restore the thread's interrupt status so callers further up
                    // the stack can observe it, and preserve the original
                    // exception as the cause rather than discarding it.
                    Thread.currentThread().interrupt();
                    throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item",ex);
                }
            }
            return inputQueue.poll();
        }
    }
}

View File

@ -0,0 +1,436 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.BAMInputStream;
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.RuntimeEOFException;
import net.sf.samtools.util.SeekableStream;
import org.broad.tribble.util.BlockCompressedStreamConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.LinkedList;
/**
* Presents decompressed blocks to the SAMFileReader.
*/
public class BlockInputStream extends SeekableStream implements BAMInputStream {
    /**
     * Mechanism for triggering block loads.
     */
    private final BGZFBlockLoadingDispatcher dispatcher;

    /**
     * The reader whose data is supplied by this input stream.
     */
    private final SAMReaderID reader;

    /**
     * Length of the input stream (the size of the underlying BAM file, in bytes).
     */
    private final long length;

    /**
     * The latest error reported by an asynchronous block load; rethrown on the
     * reader thread by checkForErrors().
     */
    private Throwable error;

    /**
     * Current position.
     */
    private SAMReaderPosition position;

    /**
     * A stream of compressed data blocks.  Outside of the lock, the buffer is always
     * prepared for reading: valid data appears in [position,limit) and spare capacity
     * in [limit,capacity).
     */
    private final ByteBuffer buffer;

    /**
     * Offsets of the given blocks in the buffer (parallel to blockPositions).
     */
    private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();

    /**
     * Source positions of the given blocks in the buffer (parallel to blockOffsets).
     */
    private LinkedList<Long> blockPositions = new LinkedList<Long>();

    /**
     * Provides a lock to wait for more data to arrive.
     */
    private final Object lock = new Object();

    /**
     * An input stream to use when comparing data back to what it should look like.
     * Null when validation is disabled.
     */
    private final BlockCompressedInputStream validatingInputStream;

    /**
     * Has the buffer been filled since last request?
     */
    private boolean bufferFilled = false;

    /**
     * Create a new block presenting input stream with a dedicated buffer.
     * @param dispatcher the block loading messenger.
     * @param reader the reader for which to load data.
     * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
     */
    BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
        this.reader = reader;
        this.length = reader.samFile.length();

        buffer = ByteBuffer.wrap(new byte[64*1024]);
        buffer.order(ByteOrder.LITTLE_ENDIAN);

        // The state of the buffer assumes that the range of data written into the buffer appears in the range
        // [position,limit), while extra capacity exists in the range [limit,capacity)
        buffer.limit(0);

        this.dispatcher = dispatcher;
        // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
        this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));

        try {
            if(validate) {
                System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
                validatingInputStream = new BlockCompressedInputStream(reader.samFile);
                // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
                // Poke the stream to start reading data.
                validatingInputStream.available();
            }
            else
                validatingInputStream = null;
        }
        catch(IOException ex) {
            throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
        }
    }

    /**
     * @return the total length of the underlying file, in bytes.
     */
    public long length() {
        return length;
    }

    /**
     * Reports the current virtual file pointer (BGZF block address << 16 | offset within block).
     * @return the current virtual file offset.
     */
    public long getFilePointer() {
        long filePointer;
        synchronized(lock) {
            if(buffer.remaining() > 0) {
                // If there's data in the buffer, figure out from whence it came.
                final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
                final int blockOffset = buffer.position();
                filePointer = blockAddress << 16 | blockOffset;
            }
            else {
                // Otherwise, find the next position to load.
                filePointer = position.getBlockAddress() << 16;
            }
        }

        if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
            throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
                    BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
                    BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));

        return filePointer;
    }

    /**
     * Seeks to the given virtual file offset, discarding any buffered data.
     * @param target virtual file offset (block address << 16 | block offset).
     */
    public void seek(long target) {
        // TODO: Validate the seek point.
        //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
        synchronized(lock) {
            clearBuffers();
            position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
            waitForBufferFill();
            buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
            if(validatingInputStream != null) {
                try {
                    validatingInputStream.seek(target);
                }
                catch(IOException ex) {
                    throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
                }
            }
        }
    }

    /**
     * Resets the stream position and empties the buffer and its block bookkeeping.
     * Must be called while holding the lock.
     */
    private void clearBuffers() {
        this.position.reset();

        // Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
        // Indicate no data to be read.
        buffer.clear();
        buffer.limit(0);

        blockOffsets.clear();
        blockPositions.clear();
    }

    /**
     * @return true if the next block to load lies at or past the end of the file.
     */
    public boolean eof() {
        synchronized(lock) {
            // TODO: Handle multiple empty BGZF blocks at end of the file.
            return position != null && position.getBlockAddress() >= length;
        }
    }

    public void setCheckCrcs(final boolean check) {
        // TODO: Implement
    }

    /**
     * Submits a new access plan for the given dataset.
     * @param position The next seek point for BAM data in this reader.
     */
    public void submitAccessPlan(final SAMReaderPosition position) {
        //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
        synchronized(lock) {
            // Assume that the access plan is going to tell us to start where we are and move forward.
            // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
            if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
                position.advancePosition(this.position.getBlockAddress());
            // Publish the new position while holding the lock so the loader thread
            // never observes a partially-updated plan.
            this.position = position;
        }
    }

    /**
     * Compacts the buffer, discarding fully-consumed blocks from the front so that
     * the maximum amount of capacity is available for incoming data.
     * Must be called while holding the lock.
     */
    private void compactBuffer() {
        // Compact buffer to maximize storage space.
        int bytesToRemove = 0;

        // Look ahead to see if we can compact away the first block in the series.
        while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
            bytesToRemove += blockOffsets.remove();
            blockPositions.remove();
        }

        // If we end up with an empty block at the end of the series, compact this as well.
        if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
            bytesToRemove += buffer.position();
            blockOffsets.remove();
            blockPositions.remove();
        }

        int finalBufferStart = buffer.position() - bytesToRemove;
        int finalBufferSize = buffer.remaining();

        buffer.position(bytesToRemove);
        buffer.compact();

        buffer.position(finalBufferStart);
        buffer.limit(finalBufferStart+finalBufferSize);
    }

    /**
     * Push contents of incomingBuffer into the end of this buffer.
     * MUST be called from a thread that is NOT the reader thread.
     * @param incomingBuffer The data being pushed into this input stream.
     * @param position target position for the data.
     * @param filePosition the file position just past the block being copied.
     */
    public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
        synchronized(lock) {
            try {
                compactBuffer();
                // Open up the buffer for more reading.
                buffer.limit(buffer.capacity());

                // Advance the position to take the most recent read into account.
                long lastReadPosition = position.getBlockAddress();

                byte[] validBytes = null;
                if(validatingInputStream != null) {
                    validBytes = new byte[incomingBuffer.remaining()];

                    byte[] currentBytes = new byte[incomingBuffer.remaining()];
                    int pos = incomingBuffer.position();
                    int lim = incomingBuffer.limit();
                    incomingBuffer.get(currentBytes);

                    incomingBuffer.limit(lim);
                    incomingBuffer.position(pos);

                    long currentFilePointer = validatingInputStream.getFilePointer();
                    validatingInputStream.seek(lastReadPosition << 16);
                    validatingInputStream.read(validBytes);
                    validatingInputStream.seek(currentFilePointer);

                    if(!Arrays.equals(validBytes,currentBytes))
                        throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
                }

                this.position = position;
                position.advancePosition(filePosition);

                if(buffer.remaining() < incomingBuffer.remaining()) {
                    //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
                    lock.wait();
                    //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
                }

                // Queue list of block offsets / block positions.
                blockOffsets.add(buffer.position());
                blockPositions.add(lastReadPosition);

                buffer.put(incomingBuffer);

                // Set up the buffer for reading.
                buffer.flip();
                bufferFilled = true;

                lock.notify();
            }
            catch(Exception ex) {
                reportException(ex);
                lock.notify();
            }
        }
    }

    /**
     * Records an exception raised during an asynchronous block load so that it can be
     * rethrown on the reader thread, and wakes any waiting reader.
     * @param t the error encountered by the loader thread.
     */
    void reportException(Throwable t) {
        synchronized(lock) {
            this.error = t;
            lock.notify();
        }
    }

    /**
     * Rethrows (wrapped) any error previously reported by a loader thread.
     */
    private void checkForErrors() {
        synchronized(lock) {
            if(error != null) {
                ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
                toThrow.setStackTrace(error.getStackTrace());
                throw toThrow;
            }
        }
    }

    /**
     * Reads the next byte of data from the input stream.
     * @return Next byte of data, from 0->255, as an int, or -1 if no more data is available.
     */
    @Override
    public int read() {
        byte[] singleByte = new byte[1];
        final int bytesRead = read(singleByte);
        // Mask off the sign extension so bytes >= 0x80 come back in the range 128-255,
        // and signal end-of-stream with -1 per the InputStream contract.
        return bytesRead > 0 ? (singleByte[0] & 0xFF) : -1;
    }

    /**
     * Fills the given byte array to the extent possible.
     * @param bytes byte array to be filled.
     * @return The number of bytes actually read.
     */
    @Override
    public int read(byte[] bytes) {
        return read(bytes,0,bytes.length);
    }

    /**
     * Fills bytes[offset..offset+length) to the extent possible, blocking for block
     * loads as required.
     * @param bytes destination array.
     * @param offset first index to fill.
     * @param length maximum number of bytes to read.
     * @return The number of bytes actually read; may be less than length at end of stream.
     */
    @Override
    public int read(byte[] bytes, final int offset, final int length) {
        int remaining = length;
        synchronized(lock) {
            while(remaining > 0) {
                // Check for error conditions during last read.
                checkForErrors();

                // If completely out of space, queue up another buffer fill.
                waitForBufferFill();

                // Couldn't manage to load any data at all; abort and return what's available.
                if(buffer.remaining() == 0)
                    break;

                int numBytesToCopy = Math.min(buffer.remaining(),remaining);
                buffer.get(bytes,length-remaining+offset,numBytesToCopy);
                remaining -= numBytesToCopy;

                //if(remaining > 0)
                //    System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
                // TODO: Assert that we don't copy across a block boundary
            }

            // Notify any waiting threads that some of the contents of the buffer were removed.
            if(length-remaining > 0)
                lock.notify();
        }

        if(validatingInputStream != null) {
            byte[] validBytes = new byte[length];
            try {
                validatingInputStream.read(validBytes,offset,length);
                for(int i = offset; i < offset+length; i++) {
                    if(bytes[i] != validBytes[i]) {
                        System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
                        throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
                    }
                }
            }
            catch(IOException ex) {
                throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
            }
        }

        return length - remaining;
    }

    /**
     * Closes the validating stream, if present.  The shared file handles themselves are
     * owned by the FileHandleCache, not this stream.
     */
    public void close() {
        if(validatingInputStream != null) {
            try {
                validatingInputStream.close();
            }
            catch(IOException ex) {
                throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
            }
        }
    }

    /**
     * @return the path of the BAM file backing this stream.
     */
    public String getSource() {
        return reader.getSamFilePath();
    }

    /**
     * If the buffer is exhausted and more data remains in the file, queue an asynchronous
     * block load and wait for it to complete.  Throws RuntimeEOFException if a load
     * completed but produced no data.
     */
    private void waitForBufferFill() {
        synchronized(lock) {
            bufferFilled = false;
            if(buffer.remaining() == 0 && !eof()) {
                //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
                dispatcher.queueBlockLoad(position);
                try {
                    lock.wait();
                }
                catch(InterruptedException ex) {
                    // TODO: handle me.
                    throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
                }

                if(bufferFilled && buffer.remaining() == 0)
                    throw new RuntimeEOFException("No more data left in InputStream");
            }
        }
    }
}

View File

@ -0,0 +1,188 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broad.tribble.util.BlockCompressedStreamConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
/**
* An engine for loading blocks.
*/
class BlockLoader implements Runnable {
    /**
     * Coordinates the input queue.
     */
    private BGZFBlockLoadingDispatcher dispatcher;

    /**
     * A cache from which to retrieve open file handles.
     */
    private final FileHandleCache fileHandleCache;

    /**
     * Whether asynchronous decompression should happen.
     */
    private final boolean decompress;

    /**
     * A direct input buffer for incoming data from disk.  Reused across blocks; sized
     * to hold one maximal BGZF block plus a trailing empty block header.
     */
    private final ByteBuffer inputBuffer;

    public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
        this.dispatcher = dispatcher;
        this.fileHandleCache = fileHandleCache;
        this.decompress = decompress;

        this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
        inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Worker loop: repeatedly claims a load request from the dispatcher, reads (and
     * optionally decompresses) one BGZF block, and pushes it into the requesting stream.
     * Errors are forwarded to the requesting stream rather than killing the worker.
     */
    public void run() {
        for(;;) {
            SAMReaderPosition readerPosition = null;
            try {
                readerPosition = dispatcher.claimNextWorkRequest();
                FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());

                long blockAddress = readerPosition.getBlockAddress();
                //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());

                ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
                long nextBlockAddress = position(inputStream);
                fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);

                ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
                int bytesCopied = block.remaining();

                BlockInputStream bamInputStream = readerPosition.getInputStream();
                // copyIntoBuffer consumes the buffer synchronously, so reusing inputBuffer
                // on the next iteration is safe.
                bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);

                //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
            }
            catch(Throwable error) {
                if(readerPosition != null && readerPosition.getInputStream() != null)
                    readerPosition.getInputStream().reportException(error);
            }
        }
    }

    /**
     * Reads a single compressed BGZF block starting at the given address, skipping over
     * zero-length blocks in the interior of the file.
     * @param inputStream stream whose channel supplies the data.
     * @param blockAddress file offset of the block to read.
     * @return the shared input buffer, flipped and ready for reading.
     */
    private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
        FileChannel channel = inputStream.getChannel();

        // Read the block header
        channel.position(blockAddress);

        int uncompressedDataSize = 0;
        int bufferSize = 0;

        do {
            inputBuffer.clear();
            inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
            channel.read(inputBuffer);

            // Read out the size of the full BGZF block as an unsigned 16-bit value.
            inputBuffer.flip();
            if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
                throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass.");

            // Verify that the file was read at a valid point.
            if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
                    unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
                    unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
                    unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
                    unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
                    unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
                throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
            }

            inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
            // BSIZE is stored as (total block size - 1).
            bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;

            // Adjust buffer limits and finish reading the block.  Also read the next header, just in case there's a 0-byte block.
            inputBuffer.limit(bufferSize);
            inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
            channel.read(inputBuffer);

            // Check the uncompressed length.  If 0 and not at EOF, we'll want to check the next block.
            uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
            //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize);
        }
        while(uncompressedDataSize == 0 && channel.position() < channel.size());

        // Prepare the buffer for reading.
        inputBuffer.flip();

        return inputBuffer;
    }

    /**
     * Decompresses the CDATA section of a raw BGZF block.
     * @param bgzfBlock a complete compressed BGZF block, prepared for reading.
     * @return a freshly-allocated buffer holding the uncompressed contents.
     */
    private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
        final int compressedBufferSize = bgzfBlock.remaining();

        // Determine the uncompressed buffer size (stored in the last 4 bytes of the block footer).
        bgzfBlock.position(bgzfBlock.limit()-4);
        int uncompressedBufferSize = bgzfBlock.getInt();
        byte[] uncompressedContent = new byte[uncompressedBufferSize];

        // Bound the CDATA section of the buffer.
        bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
        bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        byte[] compressedContent = new byte[bgzfBlock.remaining()];
        ByteBuffer.wrap(compressedContent).put(bgzfBlock);

        // Decompress the buffer.  Release the inflater's native memory in all cases;
        // without end() each block load leaks zlib state until the GC finalizes it.
        final Inflater inflater = new Inflater(true);
        try {
            inflater.setInput(compressedContent);
            int bytesUncompressed = inflater.inflate(uncompressedContent);
            if(bytesUncompressed != uncompressedBufferSize)
                throw new ReviewedStingException("Error decompressing block");
        }
        finally {
            inflater.end();
        }

        return ByteBuffer.wrap(uncompressedContent);
    }

    /**
     * @return the current position of the stream's underlying channel.
     */
    private long position(final FileInputStream inputStream) throws IOException {
        return inputStream.getChannel().position();
    }

    /**
     * Reads an unsigned 8-bit value at the given buffer position.
     */
    private int unpackUByte8(final ByteBuffer buffer,final int position) {
        return buffer.get(position) & 0xFF;
    }

    /**
     * Reads an unsigned 16-bit little-endian value at the given buffer position.
     */
    private int unpackUInt16(final ByteBuffer buffer,final int position) {
        // Mask off the sign extension to recover the unsigned 16-bit value.
        return buffer.getShort(position) & 0xFFFF;
    }
}

View File

@ -0,0 +1,231 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
/**
* Caches frequently used file handles. Right now, caches only a single file handle.
* TODO: Generalize to support arbitrary file handle caches.
*/
public class FileHandleCache {
    /**
     * The underlying data structure storing file handles.
     */
    private final FileHandleStorage fileHandleStorage;

    /**
     * How many file handles should be kept open at once.
     */
    private final int cacheSize;

    /**
     * A uniquifier: assign a unique ID to every instance of a file handle.
     */
    private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();

    /**
     * A shared lock, private so that outside users cannot notify it.
     */
    private final Object lock = new Object();

    /**
     * Indicates how many file handles are outstanding at this point.
     */
    private int numOutstandingFileHandles = 0;

    /**
     * Create a new file handle cache of the given cache size.
     * @param cacheSize how many readers to hold open at once.
     */
    public FileHandleCache(final int cacheSize) {
        this.cacheSize = cacheSize;
        fileHandleStorage = new FileHandleStorage();
    }

    /**
     * Retrieves or opens a file handle for the given reader ID, blocking while the
     * cache is at capacity with all handles checked out.
     * @param key The reader whose file should be opened.
     * @return A file input stream from the cache, if available, or otherwise newly opened.
     */
    public FileInputStream claimFileInputStream(final SAMReaderID key) {
        synchronized(lock) {
            FileInputStream inputStream = findExistingEntry(key);
            // Wait in a loop: a wakeup may be spurious, and a released handle may belong
            // to a different reader.  Re-check the cache after every wakeup so a handle
            // returned for this reader is reused instead of opening a duplicate.
            while(inputStream == null && numOutstandingFileHandles >= cacheSize) {
                try {
                    lock.wait();
                }
                catch(InterruptedException ex) {
                    // Preserve the interrupt status and the original cause.
                    Thread.currentThread().interrupt();
                    throw new ReviewedStingException("Interrupted while waiting for a file handle",ex);
                }
                inputStream = findExistingEntry(key);
            }
            if(inputStream == null)
                inputStream = openInputStream(key);
            numOutstandingFileHandles++;

            //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId());
            return inputStream;
        }
    }

    /**
     * Releases the current reader and returns it to the cache.
     * @param key The reader.
     * @param inputStream The stream being used.
     */
    public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
        synchronized(lock) {
            numOutstandingFileHandles--;
            UniqueKey newID = allocateKey(key);
            fileHandleStorage.put(newID,inputStream);
            // Let any listeners know that another file handle has become available.
            lock.notify();
        }
    }

    /**
     * Finds an existing entry in the storage mechanism.  Must be called while holding the lock.
     * @param key Reader.
     * @return a cached stream, if available.  Otherwise, null.
     */
    private FileInputStream findExistingEntry(final SAMReaderID key) {
        int existingHandles = getMostRecentUniquifier(key);

        // See if any of the keys currently exist in the repository.
        for(int i = 0; i <= existingHandles; i++) {
            UniqueKey uniqueKey = new UniqueKey(key,i);
            if(fileHandleStorage.containsKey(uniqueKey))
                return fileHandleStorage.remove(uniqueKey);
        }
        return null;
    }

    /**
     * Gets the most recent uniquifier used for the given reader.
     * @param reader Reader for which to determine uniqueness.
     * @return the highest uniquifier assigned so far, or -1 if none.
     */
    private int getMostRecentUniquifier(final SAMReaderID reader) {
        if(keyCounter.containsKey(reader))
            return keyCounter.get(reader);
        else return -1;
    }

    /**
     * Allocates the next unique key for the given reader, recording the new uniquifier.
     */
    private UniqueKey allocateKey(final SAMReaderID reader) {
        int uniquifier = getMostRecentUniquifier(reader)+1;
        keyCounter.put(reader,uniquifier);
        return new UniqueKey(reader,uniquifier);
    }

    /**
     * Opens a new input stream to the given reader's BAM file.
     */
    private FileInputStream openInputStream(final SAMReaderID reader) {
        try {
            return new FileInputStream(reader.getSamFilePath());
        }
        catch(IOException ex) {
            throw new StingException("Unable to open input file",ex);
        }
    }

    /**
     * Closes the given input stream, e.g. when evicted from the cache.
     */
    private void closeInputStream(final FileInputStream inputStream) {
        try {
            inputStream.close();
        }
        catch(IOException ex) {
            throw new StingException("Unable to close input file",ex);
        }
    }

    /**
     * Actually contains the file handles, purging them as they get too old.
     */
    private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
        /**
         * Remove the oldest entry
         * @param entry Entry to consider removing.
         * @return True if the cache size has been exceeded.  False otherwise.
         */
        @Override
        protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
            synchronized (lock) {
                if(size() > cacheSize) {
                    keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
                    closeInputStream(entry.getValue());
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Uniquifies a key by adding a numerical uniquifier.  Static: holds no reference
     * to the enclosing cache.
     */
    private static class UniqueKey {
        /**
         * The file handle's key.
         */
        private final SAMReaderID key;

        /**
         * A uniquifier, so that multiple of the same reader can exist in the cache.
         */
        private final int uniqueID;

        public UniqueKey(final SAMReaderID reader, final int uniqueID) {
            this.key = reader;
            this.uniqueID = uniqueID;
        }

        @Override
        public boolean equals(Object other) {
            if(!(other instanceof UniqueKey))
                return false;
            UniqueKey otherUniqueKey = (UniqueKey)other;
            return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
        }

        @Override
        public int hashCode() {
            // Combine both components so distinct handles for the same reader don't collide.
            return 31 * key.hashCode() + uniqueID;
        }
    }
}

View File

@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.SAMFileSpan; import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.IntervalUtils;
@ -40,28 +41,25 @@ import java.util.*;
*/ */
public class FilePointer { public class FilePointer {
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>(); protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
protected final BAMOverlap overlap; protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
protected final List<GenomeLoc> locations;
/** /**
* Does this file pointer point into an unmapped region? * Does this file pointer point into an unmapped region?
*/ */
protected final boolean isRegionUnmapped; protected final boolean isRegionUnmapped;
public FilePointer() { public FilePointer(final GenomeLoc... locations) {
this((BAMOverlap)null); this.locations.addAll(Arrays.asList(locations));
} boolean foundMapped = false, foundUnmapped = false;
for(GenomeLoc location: locations) {
public FilePointer(final GenomeLoc location) { if(GenomeLoc.isUnmapped(location))
this.overlap = null; foundUnmapped = true;
this.locations = Collections.singletonList(location); else
this.isRegionUnmapped = GenomeLoc.isUnmapped(location); foundMapped = true;
} }
if(foundMapped && foundUnmapped)
public FilePointer(final BAMOverlap overlap) { throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
this.overlap = overlap; this.isRegionUnmapped = foundUnmapped;
this.locations = new ArrayList<GenomeLoc>();
this.isRegionUnmapped = false;
} }
/** /**
@ -217,4 +215,20 @@ public class FilePointer {
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
combined.addFileSpans(initialElement.getKey(),fileSpan); combined.addFileSpans(initialElement.getKey(),fileSpan);
} }
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("FilePointer:%n");
builder.append("\tlocations = {");
builder.append(Utils.join(";",locations));
builder.append("}%n\tregions = %n");
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
builder.append(entry.getKey());
builder.append("= {");
builder.append(entry.getValue());
builder.append("}");
}
return builder.toString();
}
} }

View File

@ -25,419 +25,58 @@
package org.broadinstitute.sting.gatk.datasources.reads; package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.AbstractBAMFileIndex; import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.Bin; import org.broadinstitute.sting.utils.GenomeLocParser;
import net.sf.samtools.BrowseableBAMIndex;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*; import java.util.Iterator;
/** /**
* Shard intervals based on position within the BAM file. * Handles the process of aggregating BAM intervals into individual shards.
* * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
* @author mhanna
* @version 0.1
*/ */
public class IntervalSharder { public class IntervalSharder implements Iterator<FilePointer> {
private static Logger logger = Logger.getLogger(IntervalSharder.class); /**
* The iterator actually laying out the data for BAM scheduling.
*/
private final PeekableIterator<FilePointer> wrappedIterator;
public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { /**
return new IntervalSharder.FilePointerIterator(dataSource,loci); * The parser, for interval manipulation.
*/
private final GenomeLocParser parser;
public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
}
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
}
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
}
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
this.parser = parser;
}
public boolean hasNext() {
return wrappedIterator.hasNext();
} }
/** /**
* A lazy-loading iterator over file pointers. * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
* @return The next file pointer to process.
*/ */
private static class FilePointerIterator implements Iterator<FilePointer> { public FilePointer next() {
final SAMDataSource dataSource; FilePointer current = wrappedIterator.next();
final GenomeLocSortedSet loci; while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
final PeekableIterator<GenomeLoc> locusIterator; current = current.combine(parser,wrappedIterator.next());
final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>(); return current;
public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
this.dataSource = dataSource;
this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
advance();
}
public boolean hasNext() {
return !cachedFilePointers.isEmpty();
}
public FilePointer next() {
if(!hasNext())
throw new NoSuchElementException("FilePointerIterator iteration is complete");
FilePointer filePointer = cachedFilePointers.remove();
if(cachedFilePointers.isEmpty())
advance();
return filePointer;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
}
        /**
         * Refill the cache with the file pointers for the next batch of loci.
         * A batch is a run of consecutive loci sharing a single contig (an unmapped
         * locus terminates the run); the batch is then sharded against the indexes.
         */
        private void advance() {
            GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
            String contig = null;
            // If the next section of the BAM to be processed is unmapped, handle this region separately.
            while(locusIterator.hasNext() && nextBatch.isEmpty()) {
                contig = null;
                // Accumulate loci until the contig changes or an unmapped locus is reached.
                while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
                    GenomeLoc nextLocus = locusIterator.next();
                    contig = nextLocus.getContig();
                    nextBatch.add(nextLocus);
                }
            }
            if(nextBatch.size() > 0) {
                // Convert the batch of loci into index-aligned file pointers.
                cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
            }
        }
} }
/** public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
* Merge / split intervals based on an awareness of the structure of the BAM file.
* @param dataSource
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
* @param loci
* @return
*/
    private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
        // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
        if(contig == null) {
            FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
            // A null span signals downstream consumers to scan the reader's unmapped region.
            for(SAMReaderID id: dataSource.getReaderIDs())
                filePointer.addFileSpans(id,null);
            return Collections.singletonList(filePointer);
        }
        // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
        List<FilePointer> filePointers = new ArrayList<FilePointer>();
        FilePointer lastFilePointer = null;      // pointer being accumulated across loop iterations
        BAMOverlap lastBAMOverlap = null;        // bin overlap backing lastFilePointer, when bin-based
        Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
        IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
        for(SAMReaderID id: dataSource.getReaderIDs()) {
            final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
            // If this contig can't be found in the reference, skip over it.
            // NOTE(review): contig is known non-null here (checked above), so the second clause is redundant.
            if(referenceSequence == null && contig != null)
                continue;
            final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
            // Feed every lowest-level bin overlapping the full contig into the merger.
            binMerger.addReader(id,
                                index,
                                referenceSequence.getSequenceIndex(),
                                index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
            // Cache the reader for later data lookup.
            readerToIndexMap.put(id,index);
        }
        PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
        for(GenomeLoc location: loci) {
            if(!location.getContig().equals(contig))
                throw new ReviewedStingException("Location outside bounds of contig");
            if(!binIterator.hasNext())
                break;
            int locationStart = location.getStart();
            final int locationStop = location.getStop();
            // Advance to first bin.
            while(binIterator.peek().stop < locationStart)
                binIterator.next();
            // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
            // sure the extending bin is not pruned from the list.
            List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
            while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
                bamOverlaps.add(binIterator.next());
            if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
                bamOverlaps.add(binIterator.peek());
            // Bins found; try to match bins with locations.
            Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
            while(locationStop >= locationStart) {
                int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
                int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
                // Skip bins ending before the current start, flushing the accumulated pointer.
                while(binStop < locationStart && bamOverlapIterator.hasNext()) {
                    if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
                        filePointers.add(lastFilePointer);
                    lastBAMOverlap = bamOverlapIterator.next();
                    lastFilePointer = new FilePointer(lastBAMOverlap);
                    binStart = lastFilePointer.overlap.start;
                    binStop = lastFilePointer.overlap.stop;
                }
                if(locationStart < binStart) {
                    // The region starts before the first bin in the sequence. Add the region occurring before the sequence.
                    if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
                        filePointers.add(lastFilePointer);
                        lastFilePointer = null;
                        lastBAMOverlap = null;
                    }
                    final int regionStop = Math.min(locationStop,binStart-1);
                    GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
                    lastFilePointer = new FilePointer(subset);
                    locationStart = regionStop + 1;
                }
                else if(locationStart > binStop) {
                    // The region starts after the last bin in the sequence. Add the region occurring after the sequence.
                    if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
                        filePointers.add(lastFilePointer);
                        lastFilePointer = null;
                        lastBAMOverlap = null;
                    }
                    GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
                    filePointers.add(new FilePointer(subset));
                    locationStart = locationStop + 1;
                }
                else {
                    if(lastFilePointer == null)
                        throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
                    // The start of the region overlaps the bin. Add the overlapping subset.
                    final int regionStop = Math.min(locationStop,binStop);
                    lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
                    locationStart = regionStop + 1;
                }
            }
        }
        // Flush any pointer still being accumulated when the loci ran out.
        if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
            filePointers.add(lastFilePointer);
        // Lookup the locations for every file pointer in the index.
        for(SAMReaderID id: readerToIndexMap.keySet()) {
            BrowseableBAMIndex index = readerToIndexMap.get(id);
            for(FilePointer filePointer: filePointers)
                filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
        }
        return filePointers;
    }
    /**
     * Merges the per-reader streams of lowest-level bins into a single ordered stream
     * of BAMOverlap windows, each window covering a region where the set of covering
     * bins is constant across readers.
     */
    private static class BinMergingIterator implements Iterator<BAMOverlap> {
        // Readers ordered by the genomic start of their current bin.
        private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
        // Overlap windows computed by the last advance() but not yet handed out.
        private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
        /** Register a reader's bin stream (filtered to lowest-level bins) for merging. */
        public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
            binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
        }
        public boolean hasNext() {
            return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
        }
        public BAMOverlap next() {
            if(!hasNext())
                throw new NoSuchElementException("No elements left in merging iterator");
            if(pendingOverlaps.isEmpty())
                advance();
            return pendingOverlaps.remove();
        }
        /**
         * Gather the next cluster of mutually-overlapping bins across all readers
         * and decompose it into disjoint [start,stop] windows, queueing one
         * BAMOverlap per window.
         */
        public void advance() {
            List<ReaderBin> bins = new ArrayList<ReaderBin>();
            int boundsStart, boundsStop;
            // Prime the pump
            if(binQueue.isEmpty())
                return;
            bins.add(getNextBin());
            boundsStart = bins.get(0).getStart();
            boundsStop = bins.get(0).getStop();
            // Accumulate all the bins that overlap the current bin, in sorted order.
            while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
                ReaderBin bin = getNextBin();
                bins.add(bin);
                boundsStart = Math.min(boundsStart,bin.getStart());
                boundsStop = Math.max(boundsStop,bin.getStop());
            }
            // Chop the cluster into maximal windows whose covering-bin set is constant.
            List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
            int start = bins.get(0).getStart();
            int stop = bins.get(0).getStop();
            while(start <= boundsStop) {
                // Find the next stopping point.
                for(ReaderBin bin: bins) {
                    stop = Math.min(stop,bin.getStop());
                    if(start < bin.getStart())
                        stop = Math.min(stop,bin.getStart()-1);
                }
                range.add(new Pair<Integer,Integer>(start,stop));
                // If the last entry added included the last element, stop.
                if(stop >= boundsStop)
                    break;
                // Find the next start.
                start = stop + 1;
                for(ReaderBin bin: bins) {
                    if(start >= bin.getStart() && start <= bin.getStop())
                        break;
                    else if(start < bin.getStart()) {
                        start = bin.getStart();
                        break;
                    }
                }
            }
            // Add the next series of BAM overlaps to the window.
            for(Pair<Integer,Integer> window: range) {
                BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
                for(ReaderBin bin: bins)
                    bamOverlap.addBin(bin.id,bin.bin);
                pendingOverlaps.add(bamOverlap);
            }
        }
        public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
        /** Peek at the earliest current bin across all readers without consuming it. */
        private ReaderBin peekNextBin() {
            if(binQueue.isEmpty())
                throw new NoSuchElementException("No more bins are available");
            BinQueueState current = binQueue.peek();
            return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
        }
        /** Consume the earliest current bin, re-queueing its reader if bins remain. */
        private ReaderBin getNextBin() {
            if(binQueue.isEmpty())
                throw new NoSuchElementException("No more bins are available");
            BinQueueState current = binQueue.remove();
            ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
            if(current.hasNextBin())
                binQueue.add(current);
            return readerBin;
        }
    }
/**
* Filters out bins not at the lowest level in the tree.
*/
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
private BrowseableBAMIndex index;
private Iterator<Bin> wrappedIterator;
private Bin nextBin;
public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
this.index = index;
this.wrappedIterator = iterator;
advance();
}
public boolean hasNext() {
return nextBin != null;
}
public Bin next() {
Bin bin = nextBin;
advance();
return bin;
}
public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
private void advance() {
nextBin = null;
while(wrappedIterator.hasNext() && nextBin == null) {
Bin bin = wrappedIterator.next();
if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
nextBin = bin;
}
}
}
}
class BinQueueState implements Comparable<org.broadinstitute.sting.gatk.datasources.reads.BinQueueState> {
private final SAMReaderID id;
private final BrowseableBAMIndex index;
private final int referenceSequence;
private final PeekableIterator<Bin> bins;
private int firstLocusInCurrentBin;
private int lastLocusInCurrentBin;
public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
this.id = id;
this.index = index;
this.referenceSequence = referenceSequence;
this.bins = new PeekableIterator<Bin>(bins);
refreshLocusInBinCache();
}
public SAMReaderID getReaderID() {
return id;
}
public BrowseableBAMIndex getIndex() {
return index;
}
public int getReferenceSequence() {
return referenceSequence;
}
public boolean hasNextBin() {
return bins.hasNext();
}
public Bin peekNextBin() {
return bins.peek();
}
public Bin nextBin() {
Bin nextBin = bins.next();
refreshLocusInBinCache();
return nextBin;
}
public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
if(!this.bins.hasNext()) return -1;
if(!this.bins.hasNext()) return 1;
// Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
}
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
}
private void refreshLocusInBinCache() {
firstLocusInCurrentBin = -1;
lastLocusInCurrentBin = -1;
if(bins.hasNext()) {
Bin bin = bins.peek();
firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
lastLocusInCurrentBin = index.getLastLocusInBin(bin);
}
}
} }

View File

@ -22,30 +22,34 @@
* OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
package net.sf.samtools; package org.broadinstitute.sting.gatk.datasources.reads;
import java.util.BitSet; import java.util.Iterator;
/** /**
* A temporary solution to work around Java access rights issues: * Batch granular file pointers into potentially larger shards.
* override chunk and make it public.
* TODO: Eliminate once we determine the final fate of the BAM index reading code.
*/ */
public class GATKBinList extends BinList { public class LocusShardBalancer extends ShardBalancer {
/** /**
* Create a new BinList over sequenceCount sequences, consisting of the given bins. * Convert iterators of file pointers into balanced iterators of shards.
* @param referenceSequence Reference sequence to which these bins are relevant. * @return An iterator over balanced shards.
* @param bins The given bins to include.
*/ */
public GATKBinList(final int referenceSequence, final BitSet bins) { public Iterator<Shard> iterator() {
super(referenceSequence,bins); return new Iterator<Shard>() {
} public boolean hasNext() {
return filePointers.hasNext();
}
/** public Shard next() {
* Retrieves the bins stored in this list. FilePointer current = filePointers.next();
* @return A bitset where a bin is present in the list if the bit is true. while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
*/ current = current.combine(parser,filePointers.next());
public BitSet getBins() { return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
return super.getBins(); }
public void remove() {
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
}
};
} }
} }

View File

@ -1,178 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* A sharding strategy for loci based on reading of the index.
*/
public class LocusShardStrategy implements ShardStrategy {
    /**
     * The data source to use when performing this sharding.
     */
    private final SAMDataSource reads;

    /**
     * The parser for creating shards.
     */
    private GenomeLocParser genomeLocParser;

    /**
     * An iterator through the available file pointers.
     */
    private final Iterator<FilePointer> filePointerIterator;

    /**
     * Construct the shard strategy from a sequence dictionary, a shard size, and genomeLocs.
     * @param reads Data source from which to load index data.
     * @param reference Reference used to bound/derive intervals when none are supplied.
     * @param genomeLocParser Parser for creating shards.
     * @param locations List of locations for which to load data; may be null to shard everything.
     */
    public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
        this.reads = reads;
        this.genomeLocParser = genomeLocParser;

        if(!reads.isEmpty()) {
            GenomeLocSortedSet intervals;
            if(locations == null) {
                // If no locations were passed in, shard the entire BAM file.
                SAMFileHeader header = reads.getHeader();
                intervals = new GenomeLocSortedSet(genomeLocParser);
                for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
                    // Check this sequence against the reference sequence dictionary.
                    // TODO: Do a better job of merging reads + reference.
                    SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
                    if(refSequenceRecord != null) {
                        final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
                        intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
                    }
                }
            }
            else
                intervals = locations;

            if(reads.isLowMemoryShardingEnabled())
                this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
            else
                this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
        }
        else {
            // No reads: fall back to fixed-size shards derived from the reference or the requested loci.
            final int maxShardSize = 100000;
            List<FilePointer> filePointers = new ArrayList<FilePointer>();
            if(locations == null) {
                // Tile the entire reference into maxShardSize chunks.
                for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
                    for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
                        final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
                        filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
                    }
                }
            }
            else {
                // Split each requested interval into chunks no larger than maxShardSize.
                for(GenomeLoc interval: locations) {
                    while(interval.size() > maxShardSize) {
                        filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
                        interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
                    }
                    filePointers.add(new FilePointer(interval));
                }
            }
            filePointerIterator = filePointers.iterator();
        }
    }

    /**
     * returns true if there are additional shards
     *
     * @return false if we're done processing shards
     */
    public boolean hasNext() {
        return filePointerIterator.hasNext();
    }

    // Retained for binary compatibility; formerly used only by removed debug output.
    public long shardNumber = 0;

    /**
     * gets the next Shard
     *
     * @return the next shard
     */
    public LocusShard next() {
        FilePointer nextFilePointer = filePointerIterator.next();
        // Note: the previous (fileSpans != null ? fileSpans : null) ternary was a no-op and was removed.
        return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,nextFilePointer.fileSpans);
    }

    /** we don't support the remove command */
    public void remove() {
        throw new UnsupportedOperationException("ShardStrategies don't support remove()");
    }

    /**
     * makes the IntervalShard iterable, i.e. usable in a for loop.
     *
     * @return this strategy, which serves as its own iterator
     */
    public Iterator<Shard> iterator() {
        return this;
    }
}

View File

@ -1,68 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.Iterator;
/**
* Handles the process of aggregating BAM intervals into individual shards.
*/
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
    /**
     * The iterator actually laying out the data for BAM scheduling.
     */
    private final PeekableIterator<FilePointer> wrappedIterator;

    /**
     * The parser, for interval manipulation.
     */
    private final GenomeLocParser parser;

    /**
     * Create a sharder over the given loci, pulling granular file pointers from a BAMScheduler.
     * @param dataSource Source of reads to shard.
     * @param loci Sorted intervals over which to shard.
     */
    public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
        wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
        parser = loci.getGenomeLocParser();
    }

    /** @return true if more file pointers remain to be handed out. */
    public boolean hasNext() {
        return wrappedIterator.hasNext();
    }

    /**
     * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
     * @return The next file pointer to process.
     */
    public FilePointer next() {
        FilePointer current = wrappedIterator.next();
        // Merge consecutive pointers sharing mapped/unmapped status that are zero distance apart.
        while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
            current = current.combine(parser,wrappedIterator.next());
        return current;
    }

    /** Removal is not supported. */
    public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
}

View File

@ -1,34 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.List;
/**
* A single, monolithic shard bridging all available data.
* @author mhanna
* @version 0.1
*/
public class MonolithicShard extends Shard {
    /**
     * Creates a new monolithic shard of the given type.
     * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
     * @param locs Intervals that this monolithic shard should process.
     */
    public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
        // Null file spans: a monolithic shard covers all data, so no index spans apply.
        super(parser, shardType, locs, readsDataSource, null, false);
        // Validation must follow super(); Java requires the super call first.
        if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
            throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
    }

    /**
     * String representation of this shard.
     * @return "entire genome".
     */
    @Override
    public String toString() {
        return "entire genome";
    }
}

View File

@ -1,77 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Create a giant shard representing all the data in the input BAM(s).
*
* @author mhanna
* @version 0.1
*/
public class MonolithicShardStrategy implements ShardStrategy {
    /**
     * The single shard associated with this sharding strategy; null once consumed.
     */
    private MonolithicShard shard;

    /**
     * Create a new shard strategy for shards of the given type.
     * @param shardType The shard type.
     */
    public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
        shard = new MonolithicShard(parser,readsDataSource,shardType,region);
    }

    /**
     * Convenience for using in a foreach loop.  Will NOT create a new, reset instance of the iterator;
     * will only return another copy of the active iterator.
     * @return A copy of this.
     */
    public Iterator<Shard> iterator() {
        return this;
    }

    /**
     * Returns true if the monolithic shard has not yet been consumed, or false otherwise.
     * @return True if shard has been consumed, false otherwise.
     */
    public boolean hasNext() {
        return shard != null;
    }

    /**
     * Returns the monolithic shard if it has not already been retrieved.
     * @return The monolithic shard.
     * @throws NoSuchElementException if no such data exists.
     */
    public Shard next() {
        if(shard == null)
            throw new NoSuchElementException("Monolithic shard has already been retrieved."); // fixed typo: "retrived"
        Shard working = shard;
        shard = null;
        return working;
    }

    /**
     * Mandated by the interface, but is unsupported in this context.  Will throw an exception always.
     */
    public void remove() {
        throw new UnsupportedOperationException("Cannot remove from a shard strategy");
    }

    /**
     * Mandated by the interface, but is unsupported in this context.  Will throw an exception always.
     * @param size adjust the next size to this
     */
    public void adjustNextShardSize( long size ) {
        throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
    }
}

View File

@ -35,10 +35,15 @@ import java.util.Map;
* @version 0.1 * @version 0.1
*/ */
public class ReadShard extends Shard { public class ReadShard extends Shard {
/**
* What is the maximum number of reads which should go into a read shard.
*/
public static final int MAX_READS = 10000;
/** /**
* The reads making up this shard. * The reads making up this shard.
*/ */
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS); private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) { public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
@ -66,7 +71,7 @@ public class ReadShard extends Shard {
* @return True if this shard's buffer is full (and the shard can buffer reads). * @return True if this shard's buffer is full (and the shard can buffer reads).
*/ */
public boolean isBufferFull() { public boolean isBufferFull() {
return reads.size() > ReadShardStrategy.MAX_READS; return reads.size() > ReadShard.MAX_READS;
} }
/** /**

View File

@ -0,0 +1,115 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.SAMFileSpan;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
/**
* Divide up large file pointers containing reads into more manageable subcomponents.
*/
public class ReadShardBalancer extends ShardBalancer {
    /**
     * Convert iterators of file pointers into balanced iterators of shards.
     * @return An iterator over balanced shards.
     */
    public Iterator<Shard> iterator() {
        return new Iterator<Shard>() {
            /**
             * The cached shard to be returned next.  Prefetched in the peekable iterator style.
             */
            private Shard nextShard = null;

            /**
             * The file pointer currently being processed.
             */
            private FilePointer currentFilePointer;

            /**
             * Ending position of the last shard in the file.
             */
            private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();

            // Instance initializer: prime the first file pointer and prefetch the first shard.
            {
                if(filePointers.hasNext())
                    currentFilePointer = filePointers.next();
                advance();
            }

            public boolean hasNext() {
                return nextShard != null;
            }

            public Shard next() {
                if(!hasNext())
                    throw new NoSuchElementException("No next read shard available");
                Shard currentShard = nextShard;
                advance();
                return currentShard;
            }

            public void remove() {
                throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
            }

            // Prefetch the next non-empty shard, advancing through file pointers as needed.
            private void advance() {
                Map<SAMReaderID,SAMFileSpan> shardPosition;
                nextShard = null;
                Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
                while(selectedReaders.size() == 0 && currentFilePointer != null) {
                    shardPosition = currentFilePointer.fileSpans;
                    // Trim each reader's span to the portion not yet consumed by prior shards.
                    for(SAMReaderID id: shardPosition.keySet()) {
                        SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
                        if(!fileSpan.isEmpty())
                            selectedReaders.put(id,fileSpan);
                    }
                    if(selectedReaders.size() > 0) {
                        // Build a shard over the remaining spans and let the data source fill it with reads.
                        Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
                        readsDataSource.fillShard(shard);
                        if(!shard.isBufferEmpty()) {
                            nextShard = shard;
                            break;
                        }
                    }
                    // Nothing usable in this file pointer; move on to the next.
                    selectedReaders.clear();
                    currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
                }
                // Remember where we stopped so the next shard resumes from here.
                position = readsDataSource.getCurrentPosition();
            }
        };
    }
}

View File

@ -1,183 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*;
/**
 * The sharding strategy for reads using a simple counting mechanism.  Each read shard
 * holds at most a fixed number of reads (see {@link #MAX_READS}).
 * @author aaron
 * @version 1.0
 * @date Apr 14, 2009
 */
public class ReadShardStrategy implements ShardStrategy {
    /** Maximum number of reads which should go into a single read shard. */
    protected static final int MAX_READS = 10000;

    /** The data source used to shard. */
    private final SAMDataSource dataSource;

    /** The intervals to be processed, or null to shard over the entire input. */
    private final GenomeLocSortedSet locations;

    /** The cached shard to be returned next.  Prefetched in the peekable-iterator style. */
    private Shard nextShard = null;

    /** Backing storage for the file-pointer iterator when no interval list is supplied. */
    private final List<FilePointer> filePointers = new ArrayList<FilePointer>();

    /** Iterator over the list of file pointers. */
    private final Iterator<FilePointer> filePointerIterator;

    /** The file pointer currently being processed. */
    private FilePointer currentFilePointer;

    /** Ending position of the last shard handed out, per reader. */
    private Map<SAMReaderID,SAMFileSpan> position;

    /**
     * An indicator whether the strategy has sharded into the unmapped region.
     * NOTE(review): never read or written anywhere in this class -- candidate for removal.
     */
    private boolean isIntoUnmappedRegion = false;

    /** Parser used to create genome locations. */
    private final GenomeLocParser parser;

    /**
     * Create a new read shard strategy, loading read shards from the given BAM file.
     * @param parser Genome loc parser for creating locations.
     * @param dataSource Data source from which to load shards.
     * @param locations intervals to use for sharding; may be null for whole-input sharding.
     */
    public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
        this.dataSource = dataSource;
        this.parser = parser;
        this.position = this.dataSource.getCurrentPosition();
        this.locations = locations;

        if(locations != null)
            filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
        else
            filePointerIterator = filePointers.iterator();

        if(filePointerIterator.hasNext())
            currentFilePointer = filePointerIterator.next();

        advance();
    }

    /**
     * do we have another read shard?
     * @return True if any more data is available.  False otherwise.
     */
    public boolean hasNext() {
        return nextShard != null;
    }

    /**
     * Retrieves the next shard, if available.
     * @return The next shard, if available.
     * @throws java.util.NoSuchElementException if no such shard is available.
     */
    public Shard next() {
        if(!hasNext())
            throw new NoSuchElementException("No next read shard available");
        Shard currentShard = nextShard;
        advance();
        return currentShard;
    }

    /**
     * Prefetches the next shard into {@link #nextShard}, or leaves it null when the
     * input is exhausted.  Also updates {@link #position} to the data source's
     * position after the fill.
     */
    public void advance() {
        nextShard = null;

        if(locations != null) {
            Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();

            // Walk file pointers until one yields a shard with buffered reads.
            while(selectedReaders.isEmpty() && currentFilePointer != null) {
                Map<SAMReaderID,SAMFileSpan> shardPosition = currentFilePointer.fileSpans;
                for(Map.Entry<SAMReaderID,SAMFileSpan> shardEntry: shardPosition.entrySet()) {
                    SAMReaderID id = shardEntry.getKey();
                    // Trim away the portion of the span already consumed by earlier shards.
                    SAMFileSpan fileSpan = shardEntry.getValue().removeContentsBefore(position.get(id));
                    if(!fileSpan.isEmpty())
                        selectedReaders.put(id,fileSpan);
                }

                if(!selectedReaders.isEmpty()) {
                    Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
                    dataSource.fillShard(shard);

                    if(!shard.isBufferEmpty()) {
                        nextShard = shard;
                        break;
                    }
                }

                selectedReaders.clear();
                currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
            }
        }
        else {
            // todo -- this nulling of intervals is a bit annoying since readwalkers without
            // todo -- any -L values need to be special cased throughout the code.
            Shard shard = new ReadShard(parser,dataSource,position,null,false);
            dataSource.fillShard(shard);
            nextShard = !shard.isBufferEmpty() ? shard : null;
        }

        this.position = dataSource.getCurrentPosition();
    }

    /**
     * @throws UnsupportedOperationException always.
     */
    public void remove() {
        throw new UnsupportedOperationException("Remove not supported");
    }

    /**
     * Convenience method for using ShardStrategy in a foreach loop.
     * @return An iterator over shards (this object itself).
     */
    public Iterator<Shard> iterator() {
        return this;
    }
}

View File

@ -1,33 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.Bin;
import net.sf.samtools.BrowseableBAMIndex;
/**
 * Associates a BAM index bin with the reader and reference sequence it came from,
 * exposing the bin's genomic extent via the browseable index.
 */
class ReaderBin {
    public final SAMReaderID id;
    public final BrowseableBAMIndex index;
    public final int referenceSequence;
    public final Bin bin;

    /**
     * Create a new ReaderBin tying together a bin and its source reader.
     * @param id ID of the reader owning this bin.
     * @param index Browseable index from which the bin was drawn.
     * @param referenceSequence Index of the reference sequence the bin covers.
     * @param bin The BAM index bin itself.
     */
    public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
        this.bin = bin;
        this.referenceSequence = referenceSequence;
        this.index = index;
        this.id = id;
    }

    /** @return First genomic locus covered by this bin, per the index. */
    public int getStart() {
        return this.index.getFirstLocusInBin(this.bin);
    }

    /** @return Last genomic locus covered by this bin, per the index. */
    public int getStop() {
        return this.index.getLastLocusInBin(this.bin);
    }
}

View File

@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -71,7 +73,7 @@ public class SAMDataSource {
/** /**
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
*/ */
private final GenomeLocParser genomeLocParser; protected final GenomeLocParser genomeLocParser;
/** /**
* Identifiers for the readers driving this data source. * Identifiers for the readers driving this data source.
@ -91,13 +93,18 @@ public class SAMDataSource {
/** /**
* How far along is each reader? * How far along is each reader?
*/ */
private final Map<SAMReaderID, SAMFileSpan> readerPositions = new HashMap<SAMReaderID,SAMFileSpan>(); private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
/** /**
* The merged header. * The merged header.
*/ */
private final SAMFileHeader mergedHeader; private final SAMFileHeader mergedHeader;
/**
* The constituent headers of the unmerged files.
*/
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
/** /**
* The sort order of the BAM files. Files without a sort order tag are assumed to be * The sort order of the BAM files. Files without a sort order tag are assumed to be
* in coordinate order. * in coordinate order.
@ -131,17 +138,24 @@ public class SAMDataSource {
private final SAMResourcePool resourcePool; private final SAMResourcePool resourcePool;
/** /**
* Whether to enable the new low-memory sharding mechanism. * Asynchronously loads BGZF blocks.
*/ */
private boolean enableLowMemorySharding = false; private final BGZFBlockLoadingDispatcher dispatcher;
/**
* How are threads allocated.
*/
private final ThreadAllocation threadAllocation;
/** /**
* Create a new SAM data source given the supplied read metadata. * Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files. * @param samFiles list of reads files.
*/ */
public SAMDataSource(Collection<SAMReaderID> samFiles,GenomeLocParser genomeLocParser) { public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
this( this(
samFiles, samFiles,
threadAllocation,
numFileHandles,
genomeLocParser, genomeLocParser,
false, false,
SAMFileReader.ValidationStringency.STRICT, SAMFileReader.ValidationStringency.STRICT,
@ -150,8 +164,7 @@ public class SAMDataSource {
new ValidationExclusion(), new ValidationExclusion(),
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
false, false,
false, false);
true);
} }
/** /**
@ -159,6 +172,8 @@ public class SAMDataSource {
*/ */
public SAMDataSource( public SAMDataSource(
Collection<SAMReaderID> samFiles, Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser, GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities, boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness, SAMFileReader.ValidationStringency strictness,
@ -167,9 +182,10 @@ public class SAMDataSource {
ValidationExclusion exclusionList, ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters, Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci, boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents, boolean generateExtendedEvents) {
boolean enableLowMemorySharding) {
this( samFiles, this( samFiles,
threadAllocation,
numFileHandles,
genomeLocParser, genomeLocParser,
useOriginalBaseQualities, useOriginalBaseQualities,
strictness, strictness,
@ -182,8 +198,7 @@ public class SAMDataSource {
BAQ.CalculationMode.OFF, BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY, BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ null, // no BAQ
(byte) -1, (byte) -1);
enableLowMemorySharding);
} }
/** /**
@ -205,6 +220,8 @@ public class SAMDataSource {
*/ */
public SAMDataSource( public SAMDataSource(
Collection<SAMReaderID> samFiles, Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser, GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities, boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness, SAMFileReader.ValidationStringency strictness,
@ -217,13 +234,19 @@ public class SAMDataSource {
BAQ.CalculationMode cmode, BAQ.CalculationMode cmode,
BAQ.QualityMode qmode, BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader, IndexedFastaSequenceFile refReader,
byte defaultBaseQualities, byte defaultBaseQualities) {
boolean enableLowMemorySharding) {
this.enableLowMemorySharding(enableLowMemorySharding);
this.readMetrics = new ReadMetrics(); this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser; this.genomeLocParser = genomeLocParser;
readerIDs = samFiles; readerIDs = samFiles;
this.threadAllocation = threadAllocation;
// TODO: Consider a borrowed-thread dispatcher implementation.
if(this.threadAllocation.getNumIOThreads() > 0)
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
else
dispatcher = null;
validationStringency = strictness; validationStringency = strictness;
for (SAMReaderID readerID : samFiles) { for (SAMReaderID readerID : samFiles) {
if (!readerID.samFile.canRead()) if (!readerID.samFile.canRead())
@ -235,10 +258,13 @@ public class SAMDataSource {
SAMReaders readers = resourcePool.getAvailableReaders(); SAMReaders readers = resourcePool.getAvailableReaders();
// Determine the sort order. // Determine the sort order.
for(SAMFileReader reader: readers.values()) { for(SAMReaderID readerID: readerIDs) {
// Get the sort order, forcing it to coordinate if unsorted. // Get the sort order, forcing it to coordinate if unsorted.
SAMFileReader reader = readers.getReader(readerID);
SAMFileHeader header = reader.getFileHeader(); SAMFileHeader header = reader.getFileHeader();
headers.put(readerID,header);
if ( header.getReadGroups().isEmpty() ) { if ( header.getReadGroups().isEmpty() ) {
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
@ -296,12 +322,10 @@ public class SAMDataSource {
originalToMergedReadGroupMappings.put(id,mappingToMerged); originalToMergedReadGroupMappings.put(id,mappingToMerged);
} }
if(enableLowMemorySharding) { for(SAMReaderID id: readerIDs) {
for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.samFile);
File indexFile = findIndexFile(id.samFile); if(indexFile != null)
if(indexFile != null) bamIndices.put(id,new GATKBAMIndex(indexFile));
bamIndices.put(id,new GATKBAMIndex(indexFile));
}
} }
resourcePool.releaseReaders(readers); resourcePool.releaseReaders(readers);
@ -314,22 +338,6 @@ public class SAMDataSource {
*/ */
public ReadProperties getReadsInfo() { return readProperties; } public ReadProperties getReadsInfo() { return readProperties; }
/**
* Enable experimental low-memory sharding.
* @param enable True to enable sharding. False otherwise.
*/
public void enableLowMemorySharding(final boolean enable) {
enableLowMemorySharding = enable;
}
/**
* Returns whether low-memory sharding is enabled.
* @return True if enabled, false otherwise.
*/
public boolean isLowMemoryShardingEnabled() {
return enableLowMemorySharding;
}
/** /**
* Checks to see whether any reads files are supplying data. * Checks to see whether any reads files are supplying data.
* @return True if no reads files are supplying data to the traversal; false otherwise. * @return True if no reads files are supplying data to the traversal; false otherwise.
@ -368,7 +376,7 @@ public class SAMDataSource {
* Retrieves the current position within the BAM file. * Retrieves the current position within the BAM file.
* @return A mapping of reader to current position. * @return A mapping of reader to current position.
*/ */
public Map<SAMReaderID,SAMFileSpan> getCurrentPosition() { public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
return readerPositions; return readerPositions;
} }
@ -381,7 +389,7 @@ public class SAMDataSource {
} }
public SAMFileHeader getHeader(SAMReaderID id) { public SAMFileHeader getHeader(SAMReaderID id) {
return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); return headers.get(id);
} }
/** /**
@ -404,45 +412,21 @@ public class SAMDataSource {
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
} }
/**
* No read group collisions at this time because only one SAM file is currently supported.
* @return False always.
*/
public boolean hasReadGroupCollisions() {
return hasReadGroupCollisions;
}
/** /**
* True if all readers have an index. * True if all readers have an index.
* @return True if all readers have an index. * @return True if all readers have an index.
*/ */
public boolean hasIndex() { public boolean hasIndex() {
if(enableLowMemorySharding) return readerIDs.size() == bamIndices.size();
return readerIDs.size() == bamIndices.size();
else {
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
if(!reader.hasIndex())
return false;
}
return true;
}
} }
/** /**
* Gets the index for a particular reader. Always preloaded. * Gets the index for a particular reader. Always preloaded.
* TODO: Should return object of type GATKBAMIndex, but cannot because there
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
* TODO: sharding system goes live.
* @param id Id of the reader. * @param id Id of the reader.
* @return The index. Will preload the index if necessary. * @return The index. Will preload the index if necessary.
*/ */
public Object getIndex(final SAMReaderID id) { public GATKBAMIndex getIndex(final SAMReaderID id) {
if(enableLowMemorySharding) return bamIndices.get(id);
return bamIndices.get(id);
else {
SAMReaders readers = resourcePool.getReadersWithoutLocking();
return readers.getReader(id).getBrowseableIndex();
}
} }
/** /**
@ -507,10 +491,6 @@ public class SAMDataSource {
} }
public StingSAMIterator seek(Shard shard) { public StingSAMIterator seek(Shard shard) {
// todo: refresh monolithic sharding implementation
if(shard instanceof MonolithicShard)
return seekMonolithic(shard);
if(shard.buffersReads()) { if(shard.buffersReads()) {
return shard.iterator(); return shard.iterator();
} }
@ -540,7 +520,7 @@ public class SAMDataSource {
*/ */
private void initializeReaderPositions(SAMReaders readers) { private void initializeReaderPositions(SAMReaders readers) {
for(SAMReaderID id: getReaderIDs()) for(SAMReaderID id: getReaderIDs())
readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads()); readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
} }
/** /**
@ -548,7 +528,6 @@ public class SAMDataSource {
* @param readers Readers from which to load data. * @param readers Readers from which to load data.
* @param shard The shard specifying the data limits. * @param shard The shard specifying the data limits.
* @param enableVerification True to verify. For compatibility with old sharding strategy. * @param enableVerification True to verify. For compatibility with old sharding strategy.
* TODO: Collapse this flag when the two sharding systems are merged.
* @return An iterator over the selected data. * @return An iterator over the selected data.
*/ */
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
@ -559,14 +538,20 @@ public class SAMDataSource {
for(SAMReaderID id: getReaderIDs()) { for(SAMReaderID id: getReaderIDs()) {
CloseableIterator<SAMRecord> iterator = null; CloseableIterator<SAMRecord> iterator = null;
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
continue; // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
iterator = shard.getFileSpans().get(id) != null ? // TODO: Kill this check once we've proven that the design elements are gone.
readers.getReader(id).iterator(shard.getFileSpans().get(id)) : if(shard.getFileSpans().get(id) == null)
readers.getReader(id).queryUnmapped(); throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id);
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
}
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
if(readProperties.getReadBufferSize() != null) if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
if(shard.getGenomeLocs() != null) if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
mergingIterator.addIterator(readers.getReader(id),iterator); mergingIterator.addIterator(readers.getReader(id),iterator);
} }
@ -584,33 +569,6 @@ public class SAMDataSource {
readProperties.defaultBaseQualities()); readProperties.defaultBaseQualities());
} }
/**
* A stopgap measure to handle monolithic sharding
* @param shard the (monolithic) shard.
* @return An iterator over the monolithic shard.
*/
private StingSAMIterator seekMonolithic(Shard shard) {
SAMReaders readers = resourcePool.getAvailableReaders();
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
for(SAMReaderID id: getReaderIDs())
mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
return applyDecoratingIterators(shard.getReadMetrics(),
shard instanceof ReadShard,
readProperties.useOriginalBaseQualities(),
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
readProperties.getDownsamplingMethod().toFraction,
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
readProperties.getSupplementalFilters(),
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.defaultBaseQualities());
}
/** /**
* Adds this read to the given shard. * Adds this read to the given shard.
* @param shard The shard to which to add the read. * @param shard The shard to which to add the read.
@ -618,7 +576,7 @@ public class SAMDataSource {
* @param read The read to add to the shard. * @param read The read to add to the shard.
*/ */
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing(); GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
shard.addRead(read); shard.addRead(read);
readerPositions.put(id,endChunk); readerPositions.put(id,endChunk);
} }
@ -689,19 +647,6 @@ public class SAMDataSource {
this.maxEntries = maxEntries; this.maxEntries = maxEntries;
} }
/**
* Dangerous internal method; retrieves any set of readers, whether in iteration or not.
* Used to handle non-exclusive, stateless operations, such as index queries.
* @return Any collection of SAMReaders, whether in iteration or not.
*/
protected SAMReaders getReadersWithoutLocking() {
synchronized(this) {
if(allResources.size() == 0)
createNewResource();
}
return allResources.get(0);
}
/** /**
* Choose a set of readers from the pool to use for this query. When complete, * Choose a set of readers from the pool to use for this query. When complete,
* @return * @return
@ -753,6 +698,11 @@ public class SAMDataSource {
*/ */
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>(); private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
/**
* The input streams backing the readers, keyed by reader ID.
*/
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
/** /**
* Derive a new set of readers from the Reads metadata. * Derive a new set of readers from the Reads metadata.
* @param readerIDs reads to load. * @param readerIDs reads to load.
@ -760,12 +710,20 @@ public class SAMDataSource {
*/ */
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) { public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
for(SAMReaderID readerID: readerIDs) { for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = new SAMFileReader(readerID.samFile); File indexFile = findIndexFile(readerID.samFile);
SAMFileReader reader = null;
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = new SAMFileReader(blockInputStream,indexFile,false);
inputStreams.put(readerID,blockInputStream);
}
else
reader = new SAMFileReader(readerID.samFile,indexFile,false);
reader.setSAMRecordFactory(factory); reader.setSAMRecordFactory(factory);
reader.enableFileSource(true); reader.enableFileSource(true);
reader.enableIndexMemoryMapping(false);
if(!enableLowMemorySharding)
reader.enableIndexCaching(true);
reader.setValidationStringency(validationStringency); reader.setValidationStringency(validationStringency);
final SAMFileHeader header = reader.getFileHeader(); final SAMFileHeader header = reader.getFileHeader();
@ -786,6 +744,15 @@ public class SAMDataSource {
return readers.get(id); return readers.get(id);
} }
/**
* Retrieve the input stream backing a reader.
* @param id The ID of the reader to retrieve.
* @return the reader associated with the given id.
*/
public BlockInputStream getInputStream(final SAMReaderID id) {
return inputStreams.get(id);
}
/** /**
* Searches for the reader id of this reader. * Searches for the reader id of this reader.
* @param reader Reader for which to search. * @param reader Reader for which to search.
@ -1070,6 +1037,40 @@ public class SAMDataSource {
return indexFile; return indexFile;
} }
/**
* Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
* will be as granular as possible given our current knowledge of the best ways to split up BAM files.
* @return An iterator that spans all reads in all BAM files.
*/
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
* read that has been assigned
* @return
*/
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
* Create a schedule for processing the initialized BAM file using the given interval list.
* The returned schedule should be as granular as possible.
* @param intervals The list of intervals for which to create the schedule.
* @return A granular iterator over file pointers.
*/
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
if(intervals == null)
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
return shardBalancer;
}
} }

View File

@ -0,0 +1,120 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.List;
/**
 * Tracks the position of a single reader within its BAM file as a sequence of
 * index chunks, exposing the next BGZF block address the async loader should read.
 */
class SAMReaderPosition {
    /** The reader whose position is being tracked. */
    private final SAMReaderID reader;

    /** The block input stream feeding this reader. */
    private final BlockInputStream inputStream;

    /** The chunks, in file order, that this reader should visit. */
    private final List<GATKChunk> positions;

    /** Iterator over {@link #positions}; re-created on {@link #reset()}. */
    private PeekableIterator<GATKChunk> positionIterator;

    /**
     * Stores the next block address to read, or -1 if no such block is available.
     */
    private long nextBlockAddress;

    SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
        this.reader = reader;
        this.inputStream = inputStream;
        this.positions = fileSpan.getGATKChunks();
        initialize();
    }

    public SAMReaderID getReader() {
        return reader;
    }

    public BlockInputStream getInputStream() {
        return inputStream;
    }

    /**
     * Retrieves the next block address to be read.
     * @return Next block address to be read, or -1 if exhausted.
     */
    public long getBlockAddress() {
        return nextBlockAddress;
    }

    public void reset() {
        initialize();
    }

    /**
     * Resets the SAM reader position to its original state: the iterator points at the
     * first chunk and the next block address is that chunk's start (or -1 if no chunks).
     */
    private void initialize() {
        this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
        if(positionIterator.hasNext())
            nextBlockAddress = positionIterator.peek().getBlockStart();
        else
            nextBlockAddress = -1;
    }

    /**
     * Advances the current position to the next block to read, given the current position in the file.
     * @param filePosition The current position within the file.
     */
    void advancePosition(final long filePosition) {
        nextBlockAddress = filePosition;

        // Check the current file position against the iterator; if the iterator is before the current file position,
        // draw the iterator forward.  Remember when performing the check that coordinates are half-open!
        try {
            while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
                positionIterator.next();

                // Check to see if the iterator has more data available; if the file position
                // landed in a gap between chunks, jump ahead to the next chunk's start.
                if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
                    nextBlockAddress = positionIterator.peek().getBlockStart();
                    break;
                }
            }
        }
        catch(Exception ex) {
            // Preserve the original failure and identify the reader; the previous code threw
            // a ReviewedStingException with an empty message and dropped the cause entirely.
            throw new ReviewedStingException("Unable to advance position of reader " + reader + " past file position " + filePosition, ex);
        }
    }

    /** @return True if filePosition lies past the end of the given (half-open) chunk. */
    private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) {
        return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
    }
}

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
/**
 * Balances maximally granular file pointers into shards of reasonable size.
 */
public abstract class ShardBalancer implements Iterable<Shard> {
    /** Source of reads for the shards produced by this balancer. */
    protected SAMDataSource readsDataSource;
    /** Peekable stream of granular file pointers to be coalesced into shards. */
    protected PeekableIterator<FilePointer> filePointers;
    /** Parser used to create genome locations. */
    protected GenomeLocParser parser;

    /**
     * Supplies the balancer with its inputs.  Must be called before iteration.
     * @param readsDataSource Source of reads.
     * @param filePointers Granular file pointers to balance.
     * @param parser Genome loc parser.
     */
    public void initialize(final SAMDataSource readsDataSource, final Iterator<FilePointer> filePointers, final GenomeLocParser parser) {
        this.parser = parser;
        this.readsDataSource = readsDataSource;
        this.filePointers = new PeekableIterator<FilePointer>(filePointers);
    }
}

View File

@ -1,31 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import java.util.Iterator;
/**
*
* User: aaron
* Date: Apr 10, 2009
* Time: 4:55:37 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Apr 10, 2009
 * <p/>
 * Interface ShardStrategy
 * <p/>
 * The base interface for the sharding strategy.  This was previously an abstract
 * base class, but is now an interface to accommodate read-based sharding.
 */
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
}

View File

@ -1,117 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 7:09:22 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 * <p/>
 * Class ShardStrategyFactory
 * <p/>
 * The Shard Strategy Factory: use this class to create shard strategies.
 * All overloads funnel into a single private dispatch, so the strategy
 * selection logic lives in exactly one place.
 */
public class ShardStrategyFactory {
    /**
     * The available techniques for dividing input data into shards.
     */
    public enum SHATTER_STRATEGY {
        MONOLITHIC, // Put all of the available data into one shard.
        LOCUS_EXPERIMENTAL,
        READS_EXPERIMENTAL
    }

    /**
     * get a new shatter strategy
     *
     * @param readsDataSource File pointer to BAM.
     * @param referenceDataSource File pointer to reference.
     * @param strat what's our strategy - SHATTER_STRATEGY type
     * @param dic the seq dictionary (currently unused; retained for interface compatibility)
     * @param startingSize the starting size (currently unused; retained for interface compatibility)
     * @param genomeLocParser parser used to create genomic intervals
     * @return a shard strategy capable of dividing input data into shards.
     */
    static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
        return createStrategy(readsDataSource, referenceDataSource, strat, genomeLocParser, null);
    }

    /**
     * get a new shatter strategy
     *
     * @param readsDataSource File pointer to BAM.
     * @param referenceDataSource File pointer to reference.
     * @param strat what's our strategy - SHATTER_STRATEGY type
     * @param dic the seq dictionary (currently unused; retained for interface compatibility)
     * @param startingSize the starting size (currently unused; retained for interface compatibility)
     * @param genomeLocParser parser used to create genomic intervals
     * @param limitByCount cap on data count (currently unused; retained for interface compatibility)
     * @return a shard strategy capable of dividing input data into shards.
     */
    static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
        return createStrategy(readsDataSource, referenceDataSource, strat, genomeLocParser, null);
    }

    /**
     * get a new shatter strategy
     *
     * @param readsDataSource File pointer to BAM.
     * @param referenceDataSource File pointer to reference.
     * @param strat what's our strategy - SHATTER_STRATEGY type
     * @param dic the seq dictionary (currently unused; retained for interface compatibility)
     * @param startingSize the starting size (currently unused; retained for interface compatibility)
     * @param genomeLocParser parser used to create genomic intervals
     * @param lst the set of intervals to restrict the strategy to
     * @return a shard strategy capable of dividing input data into shards.
     */
    static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
        return createStrategy(readsDataSource, referenceDataSource, strat, genomeLocParser, lst);
    }

    /**
     * get a new shatter strategy
     *
     * @param readsDataSource The reads used to shatter this file.
     * @param referenceDataSource The reference used to shatter this file.
     * @param strat what's our strategy - SHATTER_STRATEGY type
     * @param dic the seq dictionary (currently unused; retained for interface compatibility)
     * @param startingSize the starting size (currently unused; retained for interface compatibility)
     * @param genomeLocParser parser used to create genomic intervals
     * @param lst the set of intervals to restrict the strategy to
     * @param limitDataCount cap on data count (currently unused; retained for interface compatibility)
     * @return A strategy for shattering this data.
     */
    static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
        return createStrategy(readsDataSource, referenceDataSource, strat, genomeLocParser, lst);
    }

    /**
     * Single dispatch point for all shatter overloads.
     * @param readsDataSource reads used to shatter this file.
     * @param referenceDataSource reference used to shatter this file.
     * @param strat the requested sharding strategy.
     * @param genomeLocParser parser used to create genomic intervals.
     * @param intervals optional interval restriction; may be null for whole-input sharding.
     * @return a shard strategy capable of dividing input data into shards.
     * @throws ReviewedStingException if the strategy has no implementation.
     */
    private static ShardStrategy createStrategy(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, GenomeLocParser genomeLocParser, GenomeLocSortedSet intervals) {
        switch (strat) {
            case LOCUS_EXPERIMENTAL:
                return new LocusShardStrategy(readsDataSource, referenceDataSource, genomeLocParser, intervals);
            case READS_EXPERIMENTAL:
                return new ReadShardStrategy(genomeLocParser, readsDataSource, intervals);
            default:
                throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
        }
    }
}

View File

@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
// initialize reads // initialize reads
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser); SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
// intervals // intervals
GenomeLocSortedSet intervalSortedSet = null; GenomeLocSortedSet intervalSortedSet = null;
@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) { while(sharder.hasNext()) {
FilePointer filePointer = sharder.next(); FilePointer filePointer = sharder.next();
@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) { while(sharder.hasNext()) {
FilePointer filePointer = sharder.next(); FilePointer filePointer = sharder.next();

View File

@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex;
import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary; import net.sf.picard.sam.CreateSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/** /**
* Loads reference data from fasta file * Loads reference data from fasta file
* Looks for fai and dict files, and tries to create them if they don't exist * Looks for fai and dict files, and tries to create them if they don't exist
*/ */
public class ReferenceDataSource { public class ReferenceDataSource {
private IndexedFastaSequenceFile index; private IndexedFastaSequenceFile reference;
/** our log, which we want to capture anything from this class */ /** our log, which we want to capture anything from this class */
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
@ -173,7 +185,7 @@ public class ReferenceDataSource {
logger.info("Treating existing index file as complete."); logger.info("Treating existing index file as complete.");
} }
index = new CachingIndexedFastaSequenceFile(fastaFile); reference = new CachingIndexedFastaSequenceFile(fastaFile);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
@ -192,6 +204,52 @@ public class ReferenceDataSource {
* @return IndexedFastaSequenceFile that was created from file * @return IndexedFastaSequenceFile that was created from file
*/ */
public IndexedFastaSequenceFile getReference() { public IndexedFastaSequenceFile getReference() {
return this.index; return this.reference;
}
/**
 * Creates shards spanning the entire reference, contig by contig, each at most
 * maxShardSize bases long.
 * @param readsDataSource the reads datasource to embed in the locus shard.
 * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
 * @param maxShardSize The maximum shard size which can be used to create this list.
 * @return Creates a schedule for performing a traversal over the entire reference.
 */
public Iterable<Shard> createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
    final List<Shard> shards = new ArrayList<Shard>();
    for (SAMSequenceRecord sequenceRecord : reference.getSequenceDictionary().getSequences()) {
        final int contigLength = sequenceRecord.getSequenceLength();
        // Walk the contig in maxShardSize steps; the final piece is clamped to the contig end.
        int start = 1;
        while (start <= contigLength) {
            final int stop = Math.min(start + maxShardSize - 1, contigLength);
            shards.add(new LocusShard(parser,
                    readsDataSource,
                    Collections.singletonList(parser.createGenomeLoc(sequenceRecord.getSequenceName(), start, stop)),
                    null));
            start += maxShardSize;
        }
    }
    return shards;
}
/**
 * Creates shards over the supplied intervals, splitting any interval longer
 * than maxShardSize into consecutive maxShardSize-base pieces.
 * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
 * @param intervals the list of intervals to use when processing the reference.
 * @param maxShardSize The maximum shard size which can be used to create this list.
 * @return Creates a schedule for performing a traversal over the given intervals.
 */
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
List<Shard> shards = new ArrayList<Shard>();
for(GenomeLoc interval: intervals) {
// Peel maxShardSize-base chunks off the front of the interval until the remainder fits in one shard.
while(interval.size() > maxShardSize) {
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
null));
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
}
// Emit the final (or only) piece of the interval.
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(interval),
null));
}
return shards;
} }
} }

View File

@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
} }
public Object execute( Walker walker, ShardStrategy shardStrategy ) { public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
// Fast fail for walkers not supporting TreeReducible interface. // Fast fail for walkers not supporting TreeReducible interface.
if (!( walker instanceof TreeReducible )) if (!( walker instanceof TreeReducible ))
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");

View File

@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker;
@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param walker Computation to perform over dataset. * @param walker Computation to perform over dataset.
* @param shardStrategy A strategy for sharding the data. * @param shardStrategy A strategy for sharding the data.
*/ */
public Object execute(Walker walker, ShardStrategy shardStrategy) { public Object execute(Walker walker, Iterable<Shard> shardStrategy) {
walker.initialize(); walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker); Accumulator accumulator = Accumulator.create(engine,walker);

View File

@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @param reads the informations associated with the reads * @param reads the informations associated with the reads
* @param reference the reference file * @param reference the reference file
* @param rods the rods to include in the traversal * @param rods the rods to include in the traversal
* @param nThreadsToUse Number of threads to utilize. * @param threadAllocation Number of threads to utilize.
* *
* @return The best-fit microscheduler. * @return The best-fit microscheduler.
*/ */
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) { public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
if (walker instanceof TreeReducible && nThreadsToUse > 1) { if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
if(walker.isReduceByInterval()) if(walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
if(walker instanceof ReadWalker) if(walker instanceof ReadWalker)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse)); logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse); return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
} else { } else {
if(nThreadsToUse > 1) if(threadAllocation.getNumCPUThreads() > 1)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
return new LinearMicroScheduler(engine, walker, reads, reference, rods); return new LinearMicroScheduler(engine, walker, reads, reference, rods);
} }
@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* *
* @return the return type of the walker * @return the return type of the walker
*/ */
public abstract Object execute(Walker walker, ShardStrategy shardStrategy); public abstract Object execute(Walker walker, Iterable<Shard> shardStrategy);
/** /**
* Retrieves the object responsible for tracking and managing output. * Retrieves the object responsible for tracking and managing output.

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.resourcemanagement;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
 * Models how threads are distributed between various components of the GATK.
 */
public class ThreadAllocation {
    /**
     * The number of CPU threads to be used by the GATK.
     */
    private final int numCPUThreads;

    /**
     * Number of threads to devote exclusively to IO. Default is 0,
     * meaning IO runs inline on the CPU thread.
     */
    private final int numIOThreads;

    public int getNumCPUThreads() {
        return numCPUThreads;
    }

    public int getNumIOThreads() {
        return numIOThreads;
    }

    /**
     * Construct the default thread allocation: 1 CPU thread, 0 dedicated IO threads.
     */
    public ThreadAllocation() {
        this(1, null, null);
    }

    /**
     * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
     * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
     * @param totalThreads Complete number of threads to allocate.
     * @param numCPUThreads Total number of threads allocated to the traversal; null to derive from the other values.
     * @param numIOThreads Total number of threads allocated exclusively to IO; null to derive from the other values.
     */
    public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
        final boolean cpuSpecified = numCPUThreads != null;
        final boolean ioSpecified = numIOThreads != null;

        if (!cpuSpecified && !ioSpecified) {
            // No explicit split: every thread goes to the CPU side.
            this.numCPUThreads = totalThreads;
            this.numIOThreads = 0;
        }
        else if (cpuSpecified && !ioSpecified) {
            // CPU count given: the remainder (possibly 0) is dedicated to IO.
            if (numCPUThreads > totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads", totalThreads, numCPUThreads));
            this.numCPUThreads = numCPUThreads;
            this.numIOThreads = totalThreads - numCPUThreads;
        }
        else if (!cpuSpecified) {
            // IO count given: the remainder goes to CPU, with a floor of 1 CPU thread.
            if (numIOThreads > totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads", totalThreads, numIOThreads));
            this.numCPUThreads = Math.max(1, totalThreads - numIOThreads);
            this.numIOThreads = numIOThreads;
        }
        else {
            // Both given: they must account for exactly the total.
            if (numCPUThreads + numIOThreads != totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match", totalThreads, numCPUThreads, numIOThreads));
            this.numCPUThreads = numCPUThreads;
            this.numIOThreads = numIOThreads;
        }
    }
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest {
SAMRecordIterator iterator = new SAMRecordIterator(); SAMRecordIterator iterator = new SAMRecordIterator();
GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5);
Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap()); Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.<SAMReaderID>emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap());
WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs());
WindowMaker.WindowMakerIterator window = windowMaker.next(); WindowMaker.WindowMakerIterator window = windowMaker.next();
LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null);

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -42,7 +43,7 @@ import java.util.Collections;
public class MockLocusShard extends LocusShard { public class MockLocusShard extends LocusShard {
public MockLocusShard(final GenomeLocParser genomeLocParser,final List<GenomeLoc> intervals) { public MockLocusShard(final GenomeLocParser genomeLocParser,final List<GenomeLoc> intervals) {
super( genomeLocParser, super( genomeLocParser,
new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser), new SAMDataSource(Collections.<SAMReaderID>emptyList(),new ThreadAllocation(),null,genomeLocParser),
intervals, intervals,
null); null);
} }

View File

@ -1,223 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import static org.testng.Assert.fail;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
*
* User: aaron
* Date: Apr 8, 2009
* Time: 8:14:23 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMBAMDataSourceUnitTest
* <p/>
* The test of the SAMBAM simple data source.
*/
/**
 * Tests of the simple SAM/BAM data source: sharding a BAM file and
 * iterating over the reads in each shard, both for a single input and
 * for two merged inputs.
 */
public class SAMBAMDataSourceUnitTest extends BaseTest {
    // Reader IDs for the BAM inputs under test; rebuilt before each test method.
    private List<SAMReaderID> readers;
    // Reference sequence backing the genome-location parser.
    private IndexedFastaSequenceFile seq;
    private GenomeLocParser genomeLocParser;

    /**
     * Sets up the reader list, reference sequence, and location parser.
     * Called before every test case method.
     *
     * @throws FileNotFoundException if the reference FASTA cannot be opened
     */
    @BeforeMethod
    public void doForEachTest() throws FileNotFoundException {
        readers = new ArrayList<SAMReaderID>();

        // sequence
        seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
        genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
    }

    /**
     * Tears down the test fixture after every test case method.
     */
    @AfterMethod
    public void undoForEachTest() {
        seq = null;
        readers.clear();
    }

    /** Test that we can shard the file and iterate over every read. */
    @Test
    public void testLinearBreakIterateAll() {
        logger.warn("Executing testLinearBreakIterateAll");

        // setup the data
        readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));

        // the sharding strategy
        SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
        ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);
        int count = 0;

        try {
            for (Shard sh : strat) {
                int readCount = 0;
                count++;

                GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
                logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
                logger.debug("count = " + count);
                StingSAMIterator datum = data.seek(sh);

                // for the first couple of shards make sure we can see the reads
                if (count < 5) {
                    for (SAMRecord r : datum) {
                        // BUGFIX: count each read here; previously readCount++ sat
                        // after this loop and counted shards rather than reads.
                        readCount++;
                    }
                }
                datum.close();

                // if we're over 100 shards, break out
                if (count > 100) {
                    break;
                }
            }
        }
        catch (UserException.CouldNotReadInputFile e) {
            e.printStackTrace();
            // BUGFIX: reaching this catch block is a failure; the old message
            // claimed we *should* get the exception.
            fail("testLinearBreakIterateAll: we should NOT get a UserException.CouldNotReadInputFile exception");
        }
    }

    /** Test that sharding two merged copies of a BAM yields per-shard read counts. */
    @Test
    public void testMergingTwoBAMFiles() {
        logger.warn("Executing testMergingTwoBAMFiles");

        // setup the test files
        readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));

        // the sharding strategy
        SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
        ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);

        ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
        ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();

        // count up the first hundred shards
        int shardsToCount = 100;
        int count = 0;

        try {
            for (Shard sh : strat) {
                int readCount = 0;
                count++;

                if (count > shardsToCount) {
                    break;
                }

                StingSAMIterator datum = data.seek(sh);

                for (SAMRecord r : datum) {
                    readCount++;
                }
                readcountPerShard.add(readCount);
                logger.debug("read count = " + readCount);
                datum.close();
            }
        }
        catch (UserException.CouldNotReadInputFile e) {
            e.printStackTrace();
            // BUGFIX: correct test name in the message (was testLinearBreakIterateAll)
            // and correct sense — hitting this catch is a failure.
            fail("testMergingTwoBAMFiles: we should NOT get a UserException.CouldNotReadInputFile exception");
        }

        // setup the data and the counter before our second run
        readers.clear();
        readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
        readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
        count = 0;

        // the sharding strategy
        data = new SAMDataSource(readers,genomeLocParser);
        strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser);

        logger.debug("Pile two:");
        try {
            for (Shard sh : strat) {
                int readCount = 0;
                count++;

                // can we leave?
                if (count > shardsToCount) {
                    break;
                }

                StingSAMIterator datum = data.seek(sh);

                for (SAMRecord r : datum) {
                    readCount++;
                }
                readcountPerShard2.add(readCount);
                logger.debug("read count = " + readCount);
                datum.close();
            }
        }
        catch (UserException.CouldNotReadInputFile e) {
            e.printStackTrace();
            fail("testMergingTwoBAMFiles: we should NOT get a UserException.CouldNotReadInputFile exception");
        }

        // TODO(review): the shard-by-shard comparison of the single-input and
        // merged-input read counts is disabled; re-enable once the merged
        // counts are expected to match.
        /*int pos = 0;
        for (; pos < 100; pos++) {
            if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
                fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos));
            }
        } */
    }
}

View File

@ -0,0 +1,147 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import static org.testng.Assert.fail;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMDataSourceUnitTest
* <p/>
 * Tests of the SAMDataSource read data source.
*/
/**
 * Tests of SAMDataSource: builds a data source over a BAM file, shards it
 * over mapped reads, and iterates the reads in the first few shards.
 */
public class SAMDataSourceUnitTest extends BaseTest {
    // Reader IDs for the BAM inputs under test; rebuilt before each test method.
    private List<SAMReaderID> readers;
    // Reference sequence backing the genome-location parser.
    private IndexedFastaSequenceFile seq;
    private GenomeLocParser genomeLocParser;

    /**
     * Sets up the reader list, reference sequence, and location parser.
     * Called before every test case method.
     *
     * @throws FileNotFoundException if the reference FASTA cannot be opened
     */
    @BeforeMethod
    public void doForEachTest() throws FileNotFoundException {
        readers = new ArrayList<SAMReaderID>();

        // sequence
        seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
        genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
    }

    /**
     * Tears down the test fixture after every test case method.
     */
    @AfterMethod
    public void undoForEachTest() {
        seq = null;
        readers.clear();
    }

    /** Test that we can shard the file and iterate over every read. */
    @Test
    public void testLinearBreakIterateAll() {
        logger.warn("Executing testLinearBreakIterateAll");

        // setup the data
        readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));

        // the sharding strategy
        SAMDataSource data = new SAMDataSource(readers,
                new ThreadAllocation(),
                null,
                genomeLocParser,
                false,
                SAMFileReader.ValidationStringency.SILENT,
                null,
                null,
                new ValidationExclusion(),
                new ArrayList<ReadFilter>(),
                false,
                false);

        Iterable<Shard> strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer());
        int count = 0;

        try {
            for (Shard sh : strat) {
                int readCount = 0;
                count++;

                GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
                logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
                logger.debug("count = " + count);
                StingSAMIterator datum = data.seek(sh);

                // for the first couple of shards make sure we can see the reads
                if (count < 5) {
                    for (SAMRecord r : datum) {
                        // BUGFIX: count each read here; previously readCount++ sat
                        // after this loop and counted shards rather than reads.
                        readCount++;
                    }
                }
                datum.close();

                // if we're over 100 shards, break out
                if (count > 100) {
                    break;
                }
            }
        }
        catch (UserException.CouldNotReadInputFile e) {
            e.printStackTrace();
            // BUGFIX: reaching this catch block is a failure; the old message
            // claimed we *should* get the exception.
            fail("testLinearBreakIterateAll: we should NOT get a UserException.CouldNotReadInputFile exception");
        }
    }
}

View File

@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest {
private List<SAMReaderID> bamList; private List<SAMReaderID> bamList;
private Walker countReadWalker; private Walker countReadWalker;
private File output; private File output;
private long readSize = 100000;
private TraverseReads traversalEngine = null; private TraverseReads traversalEngine = null;
private IndexedFastaSequenceFile ref = null; private IndexedFastaSequenceFile ref = null;
@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */ /** Test out that we can shard the file and iterate over every read */
@Test @Test
public void testUnmappedReadCount() { public void testUnmappedReadCount() {
SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser); SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser);
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, Iterable<Shard> shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
ref.getSequenceDictionary(),
readSize,
genomeLocParser);
countReadWalker.initialize(); countReadWalker.initialize();
Object accumulator = countReadWalker.reduceInit(); Object accumulator = countReadWalker.reduceInit();
while (shardStrategy.hasNext()) { for(Shard shard: shardStrategy) {
traversalEngine.startTimersIfNecessary(); traversalEngine.startTimersIfNecessary();
Shard shard = shardStrategy.next();
if (shard == null) { if (shard == null) {
fail("Shard == null"); fail("Shard == null");