First pass of the asynchronous block loader.

Block loads are triggered only when the queue is empty at this point. Disabled by
default (enable with nt:io=?).
This commit is contained in:
Matt Hanna 2011-09-13 10:49:16 -04:00
parent 6459784351
commit 8bb4d4dca3
45 changed files with 3292 additions and 2037 deletions

View File

@ -0,0 +1,762 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools;
import net.sf.samtools.util.*;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Internal class for reading and querying BAM files.
*/
class BAMFileReader extends SAMFileReader.ReaderImplementation {
// True if reading from a File rather than an InputStream
private boolean mIsSeekable = false;
// For converting bytes into other primitive types
private BinaryCodec mStream = null;
// Underlying compressed data stream.
private final BAMInputStream mInputStream;
private SAMFileHeader mFileHeader = null;
// Populated if the file is seekable and an index exists
private File mIndexFile;
private BAMIndex mIndex = null;
private long mFirstRecordPointer = 0;
private CloseableIterator<SAMRecord> mCurrentIterator = null;
// If true, all SAMRecords are fully decoded as they are read.
private final boolean eagerDecode;
// For error-checking.
private ValidationStringency mValidationStringency;
// For creating BAMRecords
private SAMRecordFactory samRecordFactory;
/**
* Use the caching index reader implementation rather than the disk-hit-per-file model.
*/
private boolean mEnableIndexCaching = false;
/**
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
*/
private boolean mEnableIndexMemoryMapping = true;
/**
* Add information about the origin (reader and position) to SAM records.
*/
private SAMFileReader mFileReader = null;
/**
* Prepare to read BAM from a stream (not seekable)
* @param stream source of bytes.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalidate reads or header lines.
*/
BAMFileReader(final InputStream stream,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
mIndexFile = indexFile;
mIsSeekable = false;
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
readHeader(null);
}
/**
* Prepare to read BAM from a file (seekable)
* @param file source of bytes.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalidate reads or header lines.
*/
BAMFileReader(final File file,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
" is older than BAM " + file.getAbsolutePath());
}
}
BAMFileReader(final SeekableStream strm,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
indexFile,
eagerDecode,
strm.getSource(),
validationStringency,
factory);
}
private BAMFileReader(final BAMInputStream inputStream,
final File indexFile,
final boolean eagerDecode,
final String source,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
mIndexFile = indexFile;
mIsSeekable = true;
mInputStream = inputStream;
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
readHeader(source);
mFirstRecordPointer = inputStream.getFilePointer();
}
/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
*/
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
this.mFileReader = enabled ? reader : null;
}
/**
* If true, uses the caching version of the index reader.
* @param enabled true to write source information into each SAMRecord.
*/
public void enableIndexCaching(final boolean enabled) {
if(mIndex != null)
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
this.mEnableIndexCaching = enabled;
}
/**
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
* @param enabled True to use memory mapping, false to use regular I/O.
*/
public void enableIndexMemoryMapping(final boolean enabled) {
if (mIndex != null) {
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
}
this.mEnableIndexMemoryMapping = enabled;
}
@Override void enableCrcChecking(final boolean enabled) {
this.mInputStream.setCheckCrcs(enabled);
}
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
/**
* @return true if ths is a BAM file, and has an index
*/
public boolean hasIndex() {
return (mIndexFile != null);
}
/**
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
* @return An index of the given type.
*/
public BAMIndex getIndex() {
if(mIndexFile == null)
throw new SAMException("No index is available for this BAM file.");
if(mIndex == null)
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
return mIndex;
}
void close() {
if (mStream != null) {
mStream.close();
}
if (mIndex != null) {
mIndex.close();
}
mStream = null;
mFileHeader = null;
mIndex = null;
}
SAMFileHeader getFileHeader() {
return mFileHeader;
}
/**
* Set error-checking level for subsequent SAMRecord reads.
*/
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
this.mValidationStringency = validationStringency;
}
SAMFileReader.ValidationStringency getValidationStringency() {
return this.mValidationStringency;
}
/**
* Prepare to iterate through the SAMRecords in file order.
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
* that iterator must be closed before getIterator() can be called again.
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
* getIterator() begins its iteration where the last one left off. That is the best that can be
* done in that situation.
*/
CloseableIterator<SAMRecord> getIterator() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (mIsSeekable) {
try {
mInputStream.seek(mFirstRecordPointer);
} catch (IOException exc) {
throw new RuntimeException(exc.getMessage(), exc);
}
}
mCurrentIterator = new BAMFileIterator();
return mCurrentIterator;
}
@Override
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!(chunks instanceof BAMFileSpan)) {
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
}
// Create an iterator over the given chunk boundaries.
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
return mCurrentIterator;
}
/**
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
* the rightmost bound is guaranteed to be after the last read in the file.
* @return An unbounded pointer to the first record in the BAM file.
*/
@Override
SAMFileSpan getFilePointerSpanningReads() {
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
}
/**
* Prepare to iterate through the SAMRecords that match the given interval.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
* A value of zero implies the start of the reference sequence.
* @param end A value of zero implies the end of the reference sequence.
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
* specified by start and end. If false, the SAMRecords need only overlap the interval.
* @return Iterator for the matching SAMRecords
*/
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
return mCurrentIterator;
}
/**
* Prepare to iterate through the SAMRecords with the given alignment start.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Alignment start sought.
* @return Iterator for the matching SAMRecords.
*/
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
return mCurrentIterator;
}
public CloseableIterator<SAMRecord> queryUnmapped() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
try {
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
if (startOfLastLinearBin != -1) {
mInputStream.seek(startOfLastLinearBin);
} else {
// No mapped reads in file, just start at the first read in file.
mInputStream.seek(mFirstRecordPointer);
}
mCurrentIterator = new BAMFileIndexUnmappedIterator();
return mCurrentIterator;
} catch (IOException e) {
throw new RuntimeException("IOException seeking to unmapped reads", e);
}
}
/**
* Reads the header from the file or stream
* @param source Note that this is used only for reporting errors.
*/
private void readHeader(final String source)
throws IOException {
final byte[] buffer = new byte[4];
mStream.readBytes(buffer);
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
throw new IOException("Invalid BAM file header");
}
final int headerTextLength = mStream.readInt();
final String textHeader = mStream.readString(headerTextLength);
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
headerCodec.setValidationStringency(mValidationStringency);
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
source);
final int sequenceCount = mStream.readInt();
if (mFileHeader.getSequenceDictionary().size() > 0) {
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
throw new SAMFormatException("Number of sequences in text header (" +
mFileHeader.getSequenceDictionary().size() +
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
}
for (int i = 0; i < sequenceCount; i++) {
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
source);
}
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
source);
}
}
} else {
// If only binary sequences are present, copy them into mFileHeader
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
for (int i = 0; i < sequenceCount; i++) {
sequences.add(readSequenceRecord(source));
}
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
}
}
/**
* Reads a single binary sequence record from the file or stream
* @param source Note that this is used only for reporting errors.
*/
private SAMSequenceRecord readSequenceRecord(final String source) {
final int nameLength = mStream.readInt();
if (nameLength <= 1) {
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
}
final String sequenceName = mStream.readString(nameLength - 1);
// Skip the null terminator
mStream.readByte();
final int sequenceLength = mStream.readInt();
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
}
/**
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
* Starting point of iteration is wherever current file position is when the iterator is constructed.
*/
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
private SAMRecord mNextRecord = null;
private final BAMRecordCodec bamRecordCodec;
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
BAMFileIterator() {
this(true);
}
/**
* @param advance Trick to enable subclass to do more setup before advancing
*/
BAMFileIterator(final boolean advance) {
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
if (advance) {
advance();
}
}
public void close() {
if (mCurrentIterator != null && this != mCurrentIterator) {
throw new IllegalStateException("Attempt to close non-current iterator");
}
mCurrentIterator = null;
}
public boolean hasNext() {
return (mNextRecord != null);
}
public SAMRecord next() {
final SAMRecord result = mNextRecord;
advance();
return result;
}
public void remove() {
throw new UnsupportedOperationException("Not supported: remove");
}
void advance() {
try {
mNextRecord = getNextRecord();
if (mNextRecord != null) {
++this.samRecordIndex;
// Because some decoding is done lazily, the record needs to remember the validation stringency.
mNextRecord.setValidationStringency(mValidationStringency);
if (mValidationStringency != ValidationStringency.SILENT) {
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
SAMUtils.processValidationErrors(validationErrors,
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
}
}
if (eagerDecode && mNextRecord != null) {
mNextRecord.eagerDecode();
}
} catch (IOException exc) {
throw new RuntimeException(exc.getMessage(), exc);
}
}
/**
* Read the next record from the input stream.
*/
SAMRecord getNextRecord() throws IOException {
final long startCoordinate = mInputStream.getFilePointer();
final SAMRecord next = bamRecordCodec.decode();
final long stopCoordinate = mInputStream.getFilePointer();
if(mFileReader != null && next != null)
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
return next;
}
/**
* @return The record that will be return by the next call to next()
*/
protected SAMRecord peek() {
return mNextRecord;
}
}
/**
* Prepare to iterate through SAMRecords matching the target interval.
* @param sequence Desired reference sequence.
* @param start 1-based start of target interval, inclusive.
* @param end 1-based end of target interval, inclusive.
* @param queryType contained, overlapping, or starting-at query.
*/
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
final int start,
final int end,
final QueryType queryType) {
long[] filePointers = null;
// Hit the index to determine the chunk boundaries for the required data.
final SAMFileHeader fileHeader = getFileHeader();
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMIndex fileIndex = getIndex();
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
}
// Create an iterator over the above chunk boundaries.
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
// Add some preprocessing filters for edge-case reads that don't fit into this
// query type.
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
}
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
/**
* Look for BAM index file according to standard naming convention.
*
* @param dataFile BAM file name.
* @return Index file name, or null if not found.
*/
private static File findIndexFile(final File dataFile) {
// If input is foo.bam, look for foo.bai
final String bamExtension = ".bam";
File indexFile;
final String fileName = dataFile.getName();
if (fileName.endsWith(bamExtension)) {
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
indexFile = new File(dataFile.getParent(), bai);
if (indexFile.exists()) {
return indexFile;
}
}
// If foo.bai doesn't exist look for foo.bam.bai
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
if (indexFile.exists()) {
return indexFile;
} else {
return null;
}
}
private class BAMFileIndexIterator extends BAMFileIterator {
private long[] mFilePointers = null;
private int mFilePointerIndex = 0;
private long mFilePointerLimit = -1;
/**
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
* @param filePointers the block / offset combination, stored in chunk format.
*/
BAMFileIndexIterator(final long[] filePointers) {
super(false); // delay advance() until after construction
mFilePointers = filePointers;
advance();
}
SAMRecord getNextRecord()
throws IOException {
// Advance to next file block if necessary
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
if (mFilePointers == null ||
mFilePointerIndex >= mFilePointers.length) {
return null;
}
final long startOffset = mFilePointers[mFilePointerIndex++];
final long endOffset = mFilePointers[mFilePointerIndex++];
mInputStream.seek(startOffset);
mFilePointerLimit = endOffset;
}
// Pull next record from stream
return super.getNextRecord();
}
}
/**
* A decorating iterator that filters out records that are outside the bounds of the
* given query parameters.
*/
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private final CloseableIterator<SAMRecord> wrappedIterator;
/**
* The next record to be returned. Will be null if no such record exists.
*/
private SAMRecord mNextRecord;
private final int mReferenceIndex;
private final int mRegionStart;
private final int mRegionEnd;
private final QueryType mQueryType;
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
this.wrappedIterator = iterator;
final SAMFileHeader fileHeader = getFileHeader();
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
mRegionStart = start;
if (queryType == QueryType.STARTING_AT) {
mRegionEnd = mRegionStart;
} else {
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
}
mQueryType = queryType;
mNextRecord = advance();
}
/**
* Returns true if a next element exists; false otherwise.
*/
public boolean hasNext() {
return mNextRecord != null;
}
/**
* Gets the next record from the given iterator.
* @return The next SAM record in the iterator.
*/
public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
final SAMRecord currentRead = mNextRecord;
mNextRecord = advance();
return currentRead;
}
/**
* Closes down the existing iterator.
*/
public void close() {
if (this != mCurrentIterator) {
throw new IllegalStateException("Attempt to close non-current iterator");
}
mCurrentIterator = null;
}
/**
* @throws UnsupportedOperationException always.
*/
public void remove() {
throw new UnsupportedOperationException("Not supported: remove");
}
SAMRecord advance() {
while (true) {
// Pull next record from stream
if(!wrappedIterator.hasNext())
return null;
final SAMRecord record = wrappedIterator.next();
// If beyond the end of this reference sequence, end iteration
final int referenceIndex = record.getReferenceIndex();
if (referenceIndex != mReferenceIndex) {
if (referenceIndex < 0 ||
referenceIndex > mReferenceIndex) {
return null;
}
// If before this reference sequence, continue
continue;
}
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
// Quick exit to avoid expensive alignment end calculation
return record;
}
final int alignmentStart = record.getAlignmentStart();
// If read is unmapped but has a coordinate, return it if the coordinate is within
// the query region, regardless of whether the mapped mate will be returned.
final int alignmentEnd;
if (mQueryType == QueryType.STARTING_AT) {
alignmentEnd = -1;
} else {
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
record.getAlignmentEnd(): alignmentStart);
}
if (alignmentStart > mRegionEnd) {
// If scanned beyond target region, end iteration
return null;
}
// Filter for overlap with region
if (mQueryType == QueryType.CONTAINED) {
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
return record;
}
} else if (mQueryType == QueryType.OVERLAPPING) {
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
return record;
}
} else {
if (alignmentStart == mRegionStart) {
return record;
}
}
}
}
}
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
private BAMFileIndexUnmappedIterator() {
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
advance();
}
}
}
}

View File

@ -25,6 +25,7 @@
package net.sf.samtools;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
super();
}
/**
* Create a new GATKBAMFileSpan from an existing BAMFileSpan.
* @param sourceFileSpan
*/
public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
if(!(sourceFileSpan instanceof BAMFileSpan))
throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
for(Chunk chunk: sourceBAMFileSpan.getChunks())
add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
}
/**
* Convenience constructor to construct a BAM file span from
* a single chunk.

View File

@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
super.setChunkEnd(value);
}
public long getBlockStart() {
return getChunkStart() >>> 16;
}
public int getBlockOffsetStart() {
return (int)(getChunkStart() & 0xFFFF);
}
public long getBlockEnd() {
return getChunkEnd() >>> 16;
}
public int getBlockOffsetEnd() {
return ((int)getChunkEnd() & 0xFFFF);
}
/**
* Computes an approximation of the uncompressed size of the
* chunk, in bytes. Can be used to determine relative weights

View File

@ -0,0 +1,72 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.IOException;
/**
* An input stream formulated for use reading BAM files. Supports
*/
public interface BAMInputStream {
/**
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
* not an actual byte offset.
*
* @param pos virtual file pointer
*/
public void seek(final long pos) throws IOException;
/**
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
* the two.
*/
public long getFilePointer();
/**
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
* and check it against the value stored in the GZIP header. CRC checking is an expensive
* operation and should be used accordingly.
*/
public void setCheckCrcs(final boolean check);
public int read() throws java.io.IOException;
public int read(byte[] bytes) throws java.io.IOException;
public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
public long skip(long l) throws java.io.IOException;
public int available() throws java.io.IOException;
public void close() throws java.io.IOException;
public void mark(int i);
public void reset() throws java.io.IOException;
public boolean markSupported();
}

View File

@ -0,0 +1,483 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import net.sf.samtools.FileTruncatedException;
/*
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
*
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
*/
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
    /** Underlying source when constructed from a non-seekable InputStream; mutually exclusive with mFile. */
    private InputStream mStream = null;
    /** Underlying seekable source when constructed from a File, URL or SeekableStream; mutually exclusive with mStream. */
    private SeekableStream mFile = null;
    /** Scratch buffer holding the raw (compressed) bytes of one BGZF block. */
    private byte[] mFileBuffer = null;
    /** Uncompressed contents of the current block; null until the first block is read. */
    private byte[] mCurrentBlock = null;
    /** Read position within mCurrentBlock. */
    private int mCurrentOffset = 0;
    /** Offset in the compressed stream of the start of the current block. */
    private long mBlockAddress = 0;
    /** Compressed length of the most recently read block; 0 before any block is read. */
    private int mLastBlockLength = 0;
    /** Decompresses individual BGZF blocks; optionally verifies CRCs. */
    private final BlockGunzipper blockGunzipper = new BlockGunzipper();
    /**
     * Note that seek() is not supported if this ctor is used.
     */
    public BlockCompressedInputStream(final InputStream stream) {
        mStream = IOUtil.toBufferedStream(stream);
        mFile = null;
    }
    /**
     * Use this ctor if you wish to call seek()
     */
    public BlockCompressedInputStream(final File file)
        throws IOException {
        mFile = new SeekableFileStream(file);
        mStream = null;
    }
    public BlockCompressedInputStream(final URL url) {
        mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
        mStream = null;
    }
    /**
     * For providing some arbitrary data source.  No additional buffering is
     * provided, so if the underlying source is not buffered, wrap it in a
     * SeekableBufferedStream before passing to this ctor.
     */
    public BlockCompressedInputStream(final SeekableStream strm) {
        mFile = strm;
        mStream = null;
    }
    /**
     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
     * and check it against the value stored in the GZIP header.  CRC checking is an expensive
     * operation and should be used accordingly.
     */
    public void setCheckCrcs(final boolean check) {
        this.blockGunzipper.setCheckCrcs(check);
    }
    /**
     * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
     * next caller of a method for this input stream. The next caller might be the same thread or another thread.
     * Note that although the next caller can read this many bytes without blocking, the available() method call itself
     * may block in order to fill an internal buffer if it has been exhausted.
     */
    public int available()
        throws IOException {
        // Refill when the current block is exhausted (or before the first read).
        if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
            readBlock();
        }
        if (mCurrentBlock == null) {
            return 0;
        }
        return mCurrentBlock.length - mCurrentOffset;
    }
    /**
     * Closes the underlying InputStream or RandomAccessFile
     */
    public void close()
        throws IOException {
        if (mFile != null) {
            mFile.close();
            mFile = null;
        } else if (mStream != null) {
            mStream.close();
            mStream = null;
        }
        // Encourage garbage collection
        mFileBuffer = null;
        mCurrentBlock = null;
    }
    /**
     * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
     * If no byte is available because the end of the stream has been reached, the value -1 is returned.
     * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
     * @return the next byte of data, or -1 if the end of the stream is reached.
     */
    public int read()
        throws IOException {
        // Mask with 0xFF: the InputStream contract requires a value in the range
        // 0-255.  Without the mask, a byte >= 0x80 sign-extends to a negative int
        // that callers (e.g. DataInputStream) would misinterpret as EOF.
        return (available() > 0) ? (mCurrentBlock[mCurrentOffset++] & 0xFF) : -1;
    }
    /**
     * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
     * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
     * or an exception is thrown.
     *
     * read(buf) has the same effect as read(buf, 0, buf.length).
     *
     * @param buffer the buffer into which the data is read.
     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer)
        throws IOException {
        return read(buffer, 0, buffer.length);
    }
    /** Lazily-created scratch buffer for readLine(); this class is not safe for concurrent use. */
    private volatile ByteArrayOutputStream buf = null;
    private static final byte eol = '\n';
    private static final byte eolCr = '\r';
    /**
     * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
     * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
     *
     * @return A String containing the contents of the line, excluding the line terminating
     *  character, or null if the end of the stream has been reached
     *
     * @exception IOException If an I/O error occurs
     */
    public String readLine() throws IOException {
        int available = available();
        if (available == 0) {
            return null;
        }
        if(null == buf){ // lazy initialisation
            buf = new ByteArrayOutputStream(8192);
        }
        buf.reset();
        boolean done = false;
        boolean foundCr = false; // \r found flag
        while (!done) {
            int linetmpPos = mCurrentOffset;
            int bCnt = 0;
            while((available-- > 0)){
                final byte c = mCurrentBlock[linetmpPos++];
                if(c == eol){ // found \n
                    done = true;
                    break;
                } else if(foundCr){  // previous char was \r
                    --linetmpPos; // current char is not \n so put it back
                    done = true;
                    break;
                } else if(c == eolCr){ // found \r
                    foundCr = true;
                    continue; // no ++bCnt
                }
                ++bCnt;
            }
            if(mCurrentOffset < linetmpPos){
                buf.write(mCurrentBlock, mCurrentOffset, bCnt);
                mCurrentOffset = linetmpPos;
            }
            available = available();    // crosses block boundaries for lines spanning blocks
            if(available == 0){
                // EOF
                done = true;
            }
        }
        // NOTE(review): converts with the platform default charset; confirm callers
        // only read ASCII/UTF-8-compatible text here.
        return buf.toString();
    }
    /**
     * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
     * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
     *
     * This method blocks until input data is available, end of file is detected, or an exception is thrown.
     *
     * @param buffer buffer into which data is read.
     * @param offset the start offset in array b at which the data is written.
     * @param length the maximum number of bytes to read.
     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer, int offset, int length)
        throws IOException {
        final int originalLength = length;
        while (length > 0) {
            final int available = available();
            if (available == 0) {
                // Signal EOF to caller only when nothing at all was copied.
                if (originalLength == length) {
                    return -1;
                }
                break;
            }
            final int copyLength = Math.min(length, available);
            System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
            mCurrentOffset += copyLength;
            offset += copyLength;
            length -= copyLength;
        }
        return originalLength - length;
    }
    /**
     * Seek to the given position in the file.  Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     */
    public void seek(final long pos)
        throws IOException {
        if (mFile == null) {
            throw new IOException("Cannot seek on stream based file");
        }
        // Decode virtual file pointer
        // Upper 48 bits is the byte offset into the compressed stream of a block.
        // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
        final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
        final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
        final int available;
        if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
            // Already positioned in the target block; avoid re-reading it.
            available = mCurrentBlock.length;
        } else {
            mFile.seek(compressedOffset);
            mBlockAddress = compressedOffset;
            mLastBlockLength = 0;
            readBlock();
            available = available();
        }
        // An offset exactly at the end of a block is only legal at physical EOF.
        if (uncompressedOffset > available ||
                (uncompressedOffset == available && !eof())) {
            throw new IOException("Invalid file pointer: " + pos);
        }
        mCurrentOffset = uncompressedOffset;
    }
    private boolean eof() throws IOException {
        if (mFile.eof()) {
            return true;
        }
        // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
        return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
    }
    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position.  This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    public long getFilePointer() {
        // NOTE(review): throws NPE if called before the first read/seek populates
        // mCurrentBlock -- confirm callers always read first.
        if (mCurrentOffset == mCurrentBlock.length) {
            // If current offset is at the end of the current block, file pointer should point
            // to the beginning of the next block.
            return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
        }
        return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
    }
    public static long getFileBlock(final long bgzfOffset) {
        return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
    }
    /**
     * @param stream Must be at start of file.  Throws RuntimeException if !stream.markSupported().
     * @return true if the given file looks like a valid BGZF file.
     */
    public static boolean isValidFile(final InputStream stream)
        throws IOException {
        if (!stream.markSupported()) {
            throw new RuntimeException("Cannot test non-buffered stream");
        }
        stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
        final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        stream.reset();
        return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
    }
    private static boolean isValidBlockHeader(final byte[] buffer) {
        // Checks the fixed gzip magic bytes, the FLG.FEXTRA bit, XLEN, and the
        // BGZF subfield identifier ("BC") at its fixed position.
        return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
                (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
                (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
                buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
                buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
                buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
    }
    /**
     * Reads and inflates the next compressed block, updating mCurrentBlock,
     * mCurrentOffset, mBlockAddress and mLastBlockLength.
     */
    private void readBlock()
        throws IOException {
        if (mFileBuffer == null) {
            mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
        }
        int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        if (count == 0) {
            // Handle case where there is no empty gzip block at end.
            mCurrentOffset = 0;
            mBlockAddress += mLastBlockLength;
            mCurrentBlock = new byte[0];
            return;
        }
        if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
            throw new IOException("Premature end of file");
        }
        // BSIZE field stores (total block size - 1).
        final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
        if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
            throw new IOException("Unexpected compressed block length: " + blockLength);
        }
        final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
        count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
        if (count != remaining) {
            throw new FileTruncatedException("Premature end of file");
        }
        inflateBlock(mFileBuffer, blockLength);
        mCurrentOffset = 0;
        mBlockAddress += mLastBlockLength;
        mLastBlockLength = blockLength;
    }
    /**
     * Decompresses compressedBlock into mCurrentBlock, reusing the previous
     * uncompressed buffer when it happens to be the right size.
     */
    private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
        throws IOException {
        // ISIZE (uncompressed length) is the last 4 bytes of the gzip member.
        final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
        byte[] buffer = mCurrentBlock;
        mCurrentBlock = null;
        if (buffer == null || buffer.length != uncompressedLength) {
            try {
                buffer = new byte[uncompressedLength];
            } catch (NegativeArraySizeException e) {
                throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
            }
        }
        blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
        mCurrentBlock = buffer;
    }
    /** Dispatches to whichever underlying source this stream was constructed with. */
    private int readBytes(final byte[] buffer, final int offset, final int length)
        throws IOException {
        if (mFile != null) {
            return readBytes(mFile, buffer, offset, length);
        } else if (mStream != null) {
            return readBytes(mStream, buffer, offset, length);
        } else {
            return 0;
        }
    }
    /** Loops until length bytes are read or the stream is exhausted; returns the count actually read. */
    private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }
    /** Loops until length bytes are read or the stream is exhausted; returns the count actually read. */
    private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }
    /** Little-endian unsigned 16-bit decode. */
    private int unpackInt16(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8));
    }
    /** Little-endian 32-bit decode. */
    private int unpackInt32(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8) |
                ((buffer[offset+2] & 0xFF) << 16) |
                ((buffer[offset+3] & 0xFF) << 24));
    }
    public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
    /**
     * Classifies how the given BGZF file ends: with the standard empty
     * terminator block, with a structurally healthy final block, or defective.
     */
    public static FileTermination checkTermination(final File file)
        throws IOException {
        final long fileSize = file.length();
        if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
            return FileTermination.DEFECTIVE;
        }
        final RandomAccessFile raFile = new RandomAccessFile(file, "r");
        try {
            raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
            byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
            raFile.readFully(buf);
            if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
                return FileTermination.HAS_TERMINATOR_BLOCK;
            }
            final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
            buf = new byte[bufsize];
            raFile.seek(fileSize - bufsize);
            // readFully rather than read: a short read here would leave stale zeros
            // in buf and misclassify the file.  bufsize <= fileSize, so the full
            // read is always satisfiable.
            raFile.readFully(buf);
            // Scan backwards for the start of the last gzip member.
            for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
                    i >= 0; --i) {
                if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
                        buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
                    continue;
                }
                final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
                // BSIZE is unsigned 16-bit; mask to avoid sign extension.
                final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
                if (buf.length - i == totalBlockSizeMinusOne + 1) {
                    return FileTermination.HAS_HEALTHY_LAST_BLOCK;
                } else {
                    return FileTermination.DEFECTIVE;
                }
            }
            return FileTermination.DEFECTIVE;
        } finally {
            raFile.close();
        }
    }
    /** @return true if buf matches preamble byte-for-byte starting at startOffset. */
    private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
        for (int i = 0; i < length; ++i) {
            if (preamble[i] != buf[i + startOffset]) {
                return false;
            }
        }
        return true;
    }
}

View File

@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
* used to indicate an error occurred
*
* @param msg the message
* @param e the error
* @param t the error
*/
public static void exitSystemWithError(String msg, final Exception e) {
public static void exitSystemWithError(String msg, final Throwable t) {
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("stack trace %n");
e.printStackTrace();
t.printStackTrace();
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
@ -394,8 +394,8 @@ public abstract class CommandLineProgram {
*
* @param e the exception that occurred
*/
public static void exitSystemWithError(Exception e) {
exitSystemWithError(e.getMessage(), e);
public static void exitSystemWithError(Throwable t) {
exitSystemWithError(t.getMessage(), t);
}
/**

View File

@ -99,8 +99,8 @@ public class CommandLineGATK extends CommandLineExecutable {
} catch (net.sf.samtools.SAMException e) {
// Let's try this out and see how it is received by our users
exitSystemWithSamError(e);
} catch (Exception e) {
exitSystemWithError(e);
} catch (Throwable t) {
exitSystemWithError(t);
}
}

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
*/
private Collection<ReadFilter> filters;
/**
* Controls the allocation of threads between CPU vs IO.
*/
private ThreadAllocation threadAllocation;
/**
* A currently hacky unique name for this GATK instance
*/
@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();
// Prepare the data for traversal.
initializeDataSources();
@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
// create the output streams "
initializeOutputStreams(microScheduler.getOutputTracker());
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results
return microScheduler.execute(this.walker, shardStrategy);
@ -266,6 +275,16 @@ public class GenomeAnalysisEngine {
return Collections.unmodifiableList(filters);
}
/**
* Parse out the thread allocation from the given command-line argument.
*/
private void determineThreadAllocation() {
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null;
Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null;
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
}
/**
* Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package.
@ -286,7 +305,7 @@ public class GenomeAnalysisEngine {
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
}
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
}
protected DownsamplingMethod getDownsamplingMethod() {
@ -397,103 +416,49 @@ public class GenomeAnalysisEngine {
* @param intervals intervals
* @return the sharding strategy
*/
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
// sharding system; it's required with the new sharding system only for locus walkers.
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
if(!readsDataSource.isEmpty()) {
if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
Shard.ShardType shardType;
if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
shardType = Shard.ShardType.LOCUS;
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.
if(walker instanceof ReadPairWalker) {
if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
if(intervals != null && !intervals.isEmpty())
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
}
if(intervals == null)
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
shardType = Shard.ShardType.READ;
else
throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
List<GenomeLoc> region;
if(intervals != null)
region = intervals.toList();
else {
region = new ArrayList<GenomeLoc>();
for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
}
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
}
else {
final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000;
if(intervals == null)
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
else
return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
}
ShardStrategy shardStrategy;
ShardStrategyFactory.SHATTER_STRATEGY shardType;
long SHARD_SIZE = 100000L;
if (walker instanceof LocusWalker) {
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
if (intervals != null && !intervals.isEmpty()) {
if (readsDataSource == null)
throw new IllegalArgumentException("readsDataSource is null");
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser(),
intervals);
} else
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,getGenomeLocParser());
} else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
if (intervals != null && !intervals.isEmpty()) {
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser(),
intervals);
} else {
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser());
}
} else if (walker instanceof ReadPairWalker) {
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
if(intervals != null && !intervals.isEmpty())
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
getGenomeLocParser());
} else
throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
return shardStrategy;
}
protected boolean flashbackData() {
@ -751,6 +716,8 @@ public class GenomeAnalysisEngine {
return new SAMDataSource(
samReaderIDs,
threadAllocation,
argCollection.numberOfBAMFileHandles,
genomeLocParser,
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
@ -763,8 +730,7 @@ public class GenomeAnalysisEngine {
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
argCollection.defaultBaseQualities,
!argCollection.disableLowMemorySharding);
argCollection.defaultBaseQualities);
}
/**

View File

@ -194,10 +194,14 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
public int numberOfThreads = 1;
/** How many threads should be allocated to this analysis. */
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
public Integer numberOfThreads = 1;
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false)
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
public Integer numberOfBAMFileHandles = null;
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
public List<String> readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
@ -292,9 +296,6 @@ public class GATKArgumentCollection {
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
public boolean disableLowMemorySharding = false;
// --------------------------------------------------------------------------------------------------------------
//
// methods
@ -365,7 +366,11 @@ public class GATKArgumentCollection {
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
return false;
}
if (other.numberOfThreads != this.numberOfThreads) {
if (!other.numberOfThreads.equals(this.numberOfThreads)) {
return false;
}
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
return false;
}
if (other.intervalMerging != this.intervalMerging) {
@ -389,9 +394,6 @@ public class GATKArgumentCollection {
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;
if (disableLowMemorySharding != other.disableLowMemorySharding)
return false;
return true;
}

View File

@ -1,128 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 7, 2011
* Time: 2:46:34 PM
* To change this template use File | Settings | File Templates.
*/
public class BAMBlockStartIterator implements Iterator<Long> {
    /**
     * How large is a BGZF header?
     */
    private static final int BGZF_HEADER_SIZE = 18;
    /**
     * Where within the header does the BLOCKSIZE actually live?
     */
    private static final int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
    /** Channel over the BAM file; closed once the last block start has been emitted. */
    private FileChannel bamInputChannel;
    /** Reusable little-endian buffer sized to hold exactly one BGZF header. */
    private ByteBuffer headerByteBuffer;
    /** File offset of the next block start to return; -1 once exhausted. */
    private long nextLocation = 0;
    public BAMBlockStartIterator(File bamFile) {
        try {
            FileInputStream bamInputStream = new FileInputStream(bamFile);
            bamInputChannel = bamInputStream.getChannel();
            headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
            headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
        }
        catch(IOException ex) {
            throw new StingException("Could not open file",ex);
        }
    }
    public boolean hasNext() {
        return nextLocation != -1;
    }
    public Long next() {
        // Honor the Iterator contract: fail fast once exhausted, rather than
        // returning -1 and then attempting to read from a closed channel.
        if(nextLocation == -1)
            throw new java.util.NoSuchElementException("BAMBlockStartIterator is exhausted");
        long currentLocation = nextLocation;
        advance();
        return currentLocation;
    }
    public void remove() {
        throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
    }
    /** Reads the header at the current position and skips to the next block, or marks EOF. */
    private void advance() {
        int readStatus;
        headerByteBuffer.clear();
        try {
            readStatus = bamInputChannel.read(headerByteBuffer);
        }
        catch(IOException ex) {
            throw new StingException("Could not read header data",ex);
        }
        if(readStatus == -1) {
            nextLocation = -1;
            try {
                bamInputChannel.close();
            }
            catch(IOException ex) {
                throw new StingException("Could not close input file",ex);
            }
            return;
        }
        headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
        // BSIZE is an unsigned 16-bit value (total block size - 1); mask to avoid
        // sign extension for blocks larger than 32767 bytes -- BGZF blocks can be
        // up to 64KB, so a bare getShort() would yield a negative size and
        // corrupt the channel positioning below.
        int blockSize = headerByteBuffer.getShort() & 0xFFFF;
        try {
            bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
            nextLocation = bamInputChannel.position();
        }
        catch(IOException ex) {
            throw new StingException("Could not reposition input stream",ex);
        }
    }
    public static void main(String argv[]) throws IOException {
        BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
        int i = 0;
        while(blockStartIterator.hasNext())
            System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
    }
}

View File

@ -1,195 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.LinearIndex;
import java.util.*;
/**
* Represents the contents of a bam index file for one reference.
* A BAM index (.bai) file contains information for all references in the bam file.
* This class describes the data present in the index file for one of these references;
* including the bins, chunks, and linear index.
*/
class BAMIndexContent {
    /**
     * The reference sequence for the data currently loaded.
     */
    private final int mReferenceSequence;

    /**
     * A list of all bins in the above reference sequence.
     */
    private final BinList mBinList;

    /**
     * The linear index for the reference sequence above.
     */
    private final LinearIndex mLinearIndex;

    /**
     * @param referenceSequence Content corresponds to this reference.
     * @param bins Array of bins represented by this content, possibly sparse
     * @param numberOfBins Number of non-null bins
     * @param linearIndex Additional index used to optimize queries
     */
    BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
        this.mReferenceSequence = referenceSequence;
        this.mBinList = new BinList(bins, numberOfBins);
        this.mLinearIndex = linearIndex;
    }

    /**
     * Reference for this Content
     * @return index of the reference sequence whose index data is held here.
     */
    public int getReferenceSequence() {
        return mReferenceSequence;
    }

    /**
     * Does this content have anything in this bin?
     * Only the bin's number is consulted; the bin's own contents are ignored.
     */
    public boolean containsBin(final GATKBin bin) {
        return mBinList.getBin(bin.getBinNumber()) != null;
    }

    /**
     * @return iterable list of bins represented by this content
     */
    public BinList getBins() {
        return mBinList;
    }

    /**
     * @return the number of non-null bins represented by this content
     */
    int getNumberOfNonNullBins() {
        return mBinList.getNumberOfNonNullBins();
    }

    /**
     * @return all chunks associated with all bins in this content, as an unmodifiable list
     */
    public List<GATKChunk> getAllChunks() {
        List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
        // The BinList iterator yields only non-null bins, but a bin's chunk list may still be null.
        for (GATKBin b : mBinList)
            if (b.getChunkList() != null) {
                allChunks.addAll(Arrays.asList(b.getChunkList()));
            }
        return Collections.unmodifiableList(allChunks);
    }

    /**
     * @return the linear index represented by this content
     */
    public LinearIndex getLinearIndex() {
        return mLinearIndex;
    }

    /**
     * This class is used to encapsulate the list of Bins store in the BAMIndexContent
     * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
     */
    class BinList implements Iterable<GATKBin> {

        // Sparse array of bins, indexed by bin number; absent bins are null.
        private final GATKBin[] mBinArray;

        public final int numberOfNonNullBins;
        public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based

        /**
         * @param binArray a sparse array representation of the bins. The index into the array is the bin number.
         * @param numberOfNonNullBins number of entries in binArray that are non-null.
         */
        BinList(GATKBin[] binArray, int numberOfNonNullBins) {
            this.mBinArray = binArray;
            this.numberOfNonNullBins = numberOfNonNullBins;
            this.maxBinNumber = mBinArray.length - 1;
        }

        /**
         * Looks up a bin by number.
         * @return the bin at binNumber, or null if binNumber is out of range or absent from the sparse array.
         */
        GATKBin getBin(int binNumber) {
            if (binNumber > maxBinNumber) return null;
            return mBinArray[binNumber];
        }

        int getNumberOfNonNullBins() {
            return numberOfNonNullBins;
        }

        /**
         * Gets an iterator over all non-null bins.
         *
         * @return An iterator over all bins.
         */
        public Iterator<GATKBin> iterator() {
            return new BinIterator();
        }

        private class BinIterator implements Iterator<GATKBin> {
            /**
             * Stores the bin # of the Bin currently in use.
             */
            private int nextBin;

            public BinIterator() {
                nextBin = 0;
            }

            /**
             * Are there more bins in this set, waiting to be returned?
             * Note: advances nextBin past null entries as a side effect; next() relies on this.
             *
             * @return True if more bins are remaining.
             */
            public boolean hasNext() {
                while (nextBin <= maxBinNumber) {
                    if (getBin(nextBin) != null) return true;
                    nextBin++;
                }
                return false;
            }

            /**
             * Gets the next bin in the provided BinList.
             *
             * @return the next available bin in the BinList.
             */
            public GATKBin next() {
                if (!hasNext())
                    throw new NoSuchElementException("This BinIterator is currently empty");
                GATKBin result = getBin(nextBin);
                nextBin++;
                return result;
            }

            public void remove() {
                throw new UnsupportedOperationException("Unable to remove from a bin iterator");
            }
        }
    }
}

View File

@ -1,29 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.Bin;
import java.util.HashMap;
import java.util.Map;
/**
* Models a bin at which all BAM files in the merged input stream overlap.
*/
class BAMOverlap {
    /** Inclusive start of the overlapping region. */
    public final int start;

    /** Inclusive stop of the overlapping region. */
    public final int stop;

    /** Bin covering this region, keyed by the reader it belongs to. */
    private final Map<SAMReaderID,Bin> binsByReader = new HashMap<SAMReaderID,Bin>();

    public BAMOverlap(final int start, final int stop) {
        this.start = start;
        this.stop = stop;
    }

    /** Records the bin that the given reader uses over this region. */
    public void addBin(final SAMReaderID id, final Bin bin) {
        binsByReader.put(id, bin);
    }

    /** Retrieves the bin previously recorded for the given reader, or null. */
    public Bin getBin(final SAMReaderID id) {
        return binsByReader.get(id);
    }
}

View File

@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
/**
* Create a new BAM schedule based on the given index.
* @param indexFiles Index files.
* @param dataSource The SAM data source to use.
* @param intervals List of
*/
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
if(intervals.isEmpty())
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
referenceSequence = intervals.get(0).getContigIndex();
referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
createScheduleFile();
readerIDs.addAll(indexFiles.keySet());
readerIDs.addAll(dataSource.getReaderIDs());
for(final SAMReaderID reader: readerIDs) {
final GATKBAMIndex index = indexFiles.get(reader);
final GATKBAMIndex index = dataSource.getIndex(reader);
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
if(selectedIterators.isEmpty())
return;
// Create the target schedule entry
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
// For each schedule entry with data, load the data into the merged schedule.
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
scheduleIterator.next();
}
// For each schedule entry without data, add a blank entry.
for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
}
nextScheduleEntry = mergedScheduleEntry;
}

View File

@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*;
@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator<FilePointer> {
private FilePointer nextFilePointer = null;
private final GenomeLocSortedSet loci;
private GenomeLocSortedSet loci;
private PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus;
private final PeekableIterator<GenomeLoc> locusIterator;
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
// Match only on sequence name; trust startup validation to make sure all the sequences match.
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
}
scheduler.populateFilteredIntervalList(intervals);
return scheduler;
}
private GenomeLoc currentLocus;
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
scheduler.populateUnfilteredIntervalList(parser);
return scheduler;
}
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
scheduler.populateFilteredIntervalList(loci);
return scheduler;
}
private BAMScheduler(final SAMDataSource dataSource) {
this.dataSource = dataSource;
for(SAMReaderID reader: dataSource.getReaderIDs())
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
for(SAMReaderID reader: dataSource.getReaderIDs()) {
GATKBAMIndex index = dataSource.getIndex(reader);
if(index != null)
indexFiles.put(reader,dataSource.getIndex(reader));
}
}
/**
* The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
* @param loci The list of locations to search and iterate over.
*/
private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
if(locusIterator.hasNext())
currentLocus = locusIterator.next();
advance();
if(!indexFiles.isEmpty()) {
// If index data is available, start up the iterator.
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
if(locusIterator.hasNext())
currentLocus = locusIterator.next();
advance();
}
else {
// Otherwise, seed the iterator with a single file pointer over the entire region.
nextFilePointer = generatePointerOverEntireFileset();
for(GenomeLoc locus: loci)
nextFilePointer.addLocation(locus);
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
}
}
/**
* The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
* from just before the start of the region to the end of the region.
*/
private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
this.loci = new GenomeLocSortedSet(parser);
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
nextFilePointer = generatePointerOverEntireFileset();
}
/**
* Generate a span that runs from the end of the BAM header to the end of the fle.
* @return A file pointer over the specified region.
*/
private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer();
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
for(SAMReaderID reader: dataSource.getReaderIDs())
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
return filePointer;
}
public boolean hasNext() {
@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(!hasNext())
throw new NoSuchElementException("No next element available in interval sharder");
FilePointer currentFilePointer = nextFilePointer;
nextFilePointer = null;
advance();
return currentFilePointer;
}
@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(loci.isEmpty())
return;
nextFilePointer = null;
while(nextFilePointer == null && currentLocus != null) {
// special case handling of the unmapped shard.
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
currentLocus = null;
continue;
}
@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null;
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
// No overlapping data at all.
if(scheduleEntry != null) {
@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator<FilePointer> {
}
else {
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
for(SAMReaderID reader: indexFiles.keySet())
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
}
@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
// Early exit if no bins were found.
if(coveredRegion == null) {
// for debugging only: maximum split is 16384.
if(currentLocus.size() > 16384) {
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
nextFilePointer.addLocation(splitContigs[0]);
currentLocus = splitContigs[1];
}
else {
nextFilePointer.addLocation(currentLocus);
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
}
nextFilePointer.addLocation(currentLocus);
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
continue;
}
// Early exit if only part of the first interval was found.
if(currentLocus.startsBefore(coveredRegion)) {
// for debugging only: maximum split is 16384.
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
nextFilePointer.addLocation(splitContigs[0]);
@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator<FilePointer> {
/**
* Get the next overlapping tree of bins associated with the given BAM file.
* @param indices BAM indices.
* @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci.
*/
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
// Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
// we'll be using the correct contig index for the BAMs.
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
if(bamScheduleIterator != null)
bamScheduleIterator.close();
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
lastReferenceSequenceLoaded = currentContigIndex;
// Naive algorithm: find all elements in current contig for proper schedule creation.
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
for(GenomeLoc locus: loci) {
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
lociInContig.add(locus);
}
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
}
if(!bamScheduleIterator.hasNext())
@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
}
/**
* Create a span from the given start point to the end of the file.
* @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset).
* @return A file span from the given point to the end of the file.
*/
private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
}
}

View File

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.LinkedList;
import java.util.Queue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Preloads BGZF blocks in preparation for unzipping and data processing.
 * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
 * TODO: be replaced with a central thread management strategy.
 */
public class BGZFBlockLoadingDispatcher {
    /**
     * The file handle cache, used when allocating blocks from the dispatcher.
     */
    private final FileHandleCache fileHandleCache;

    /** Pool on which the block loader runs. */
    private final ExecutorService threadPool;

    /** Pending block-load requests; all access is guarded by the queue's own monitor. */
    private final Queue<SAMReaderPosition> inputQueue;

    public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
        threadPool = Executors.newFixedThreadPool(numThreads);
        fileHandleCache = new FileHandleCache(numFileHandles);
        inputQueue = new LinkedList<SAMReaderPosition>();

        threadPool.execute(new BlockLoader(this,fileHandleCache,true));
    }

    /**
     * Initiates a request for a new block load.
     * @param readerPosition Position at which to load.
     */
    void queueBlockLoad(final SAMReaderPosition readerPosition) {
        synchronized(inputQueue) {
            inputQueue.add(readerPosition);
            inputQueue.notify();
        }
    }

    /**
     * Claims the next work request from the queue, blocking until one becomes available.
     * @return The next work request.
     * @throws ReviewedStingException if the waiting thread is interrupted.
     */
    SAMReaderPosition claimNextWorkRequest() {
        synchronized(inputQueue) {
            while(inputQueue.isEmpty()) {
                try {
                    inputQueue.wait();
                }
                catch(InterruptedException ex) {
                    // Restore the interrupt flag so owners of this thread can still observe
                    // the interrupt, and preserve the original exception as the cause.
                    Thread.currentThread().interrupt();
                    throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item",ex);
                }
            }
            return inputQueue.poll();
        }
    }
}

View File

@ -0,0 +1,436 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.BAMInputStream;
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.RuntimeEOFException;
import net.sf.samtools.util.SeekableStream;
import org.broad.tribble.util.BlockCompressedStreamConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.LinkedList;
/**
* Presents decompressed blocks to the SAMFileReader.
*/
public class BlockInputStream extends SeekableStream implements BAMInputStream {
/**
* Mechanism for triggering block loads.
*/
private final BGZFBlockLoadingDispatcher dispatcher;
/**
* The reader whose data is supplied by this input stream.
*/
private final SAMReaderID reader;
/**
* Length of the input stream.
*/
private final long length;
/**
* The latest error reported by an asynchronous block load.
*/
private Throwable error;
/**
* Current position.
*/
private SAMReaderPosition position;
/**
* A stream of compressed data blocks.
*/
private final ByteBuffer buffer;
/**
* Offsets of the given blocks in the buffer.
*/
private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();
/**
* Source positions of the given blocks in the buffer.
*/
private LinkedList<Long> blockPositions = new LinkedList<Long>();
/**
* Provides a lock to wait for more data to arrive.
*/
private final Object lock = new Object();
/**
* An input stream to use when comparing data back to what it should look like.
*/
private final BlockCompressedInputStream validatingInputStream;
/**
* Has the buffer been filled since last request?
*/
private boolean bufferFilled = false;
/**
 * Create a new block presenting input stream with a dedicated buffer.
 * @param dispatcher the block loading messenger.
 * @param reader the reader for which to load data.
 * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
 * @throws ReviewedStingException if the validating input stream cannot be opened.
 */
BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
    this.reader = reader;
    this.length = reader.samFile.length();
    // 64KB backing array; presumably sized to hold at least one maximum-size BGZF block -- TODO confirm sizing intent.
    buffer = ByteBuffer.wrap(new byte[64*1024]);
    buffer.order(ByteOrder.LITTLE_ENDIAN);

    // The state of the buffer assumes that the range of data written into the buffer appears in the range
    // [position,limit), while extra capacity exists in the range [limit,capacity)
    buffer.limit(0);

    this.dispatcher = dispatcher;
    // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
    this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));

    try {
        if(validate) {
            System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
            validatingInputStream = new BlockCompressedInputStream(reader.samFile);
            // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
            // Poke the stream to start reading data.
            validatingInputStream.available();
        }
        else
            validatingInputStream = null;
    }
    catch(IOException ex) {
        throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
    }
}
/**
 * @return Length of the underlying BAM file, in compressed bytes.
 */
public long length() {
    return length;
}
/**
 * Reports the current position as a BGZF virtual file pointer: the compressed
 * block's file address shifted into the upper bits, with the offset within
 * that block in the low 16 bits.
 * @return The current virtual file pointer.
 */
public long getFilePointer() {
    long filePointer;
    synchronized(lock) {
        if(buffer.remaining() > 0) {
            // If there's data in the buffer, figure out from whence it came.
            // NOTE(review): this assumes the first queued block starts at buffer offset 0,
            // so that buffer.position() equals the offset within that block -- confirm
            // against compactBuffer's post-conditions.
            final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
            final int blockOffset = buffer.position();
            filePointer = blockAddress << 16 | blockOffset;
        }
        else {
            // Otherwise, find the next position to load.
            filePointer = position.getBlockAddress() << 16;
        }
    }

    // In validation mode, cross-check our notion of the position against Picard's.
    if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
        throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
                BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
                BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));

    return filePointer;
}
/**
 * Repositions the stream to the given BGZF virtual file pointer, discarding
 * any buffered data and blocking until the target block has been loaded.
 * @param target Virtual file pointer: block address in the upper bits, block offset in the low 16 bits.
 */
public void seek(long target) {
    // TODO: Validate the seek point.
    //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
    synchronized(lock) {
        clearBuffers();
        position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
        // Block until the target block's data has been copied into the buffer,
        // then jump to the requested offset within it.
        waitForBufferFill();
        buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));

        // Keep the validating stream in lock-step when validation is active.
        if(validatingInputStream != null) {
            try {
                validatingInputStream.seek(target);
            }
            catch(IOException ex) {
                throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
            }
        }
    }
}
/**
 * Resets the reader position and empties the data buffer together with its
 * block bookkeeping lists. Called from seek() with the lock held.
 */
private void clearBuffers() {
    this.position.reset();

    // Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
    // Indicate no data to be read.
    buffer.clear();
    buffer.limit(0);

    // Drop the block offset/position bookkeeping alongside the buffered bytes.
    blockOffsets.clear();
    blockPositions.clear();
}
/**
 * Determines whether this stream is exhausted: true once the pending load
 * position has reached or passed the end of the underlying file.
 */
public boolean eof() {
    synchronized(lock) {
        // TODO: Handle multiple empty BGZF blocks at end of the file.
        if(position == null)
            return false;
        return position.getBlockAddress() >= length;
    }
}
/**
 * Requested CRC checking behavior; currently ignored.
 * @param check whether CRCs should be validated on read.
 */
public void setCheckCrcs(final boolean check) {
    // TODO: Implement
}
/**
 * Submits a new access plan for the given dataset.
 * @param position The next seek point for BAM data in this reader.
 */
public void submitAccessPlan(final SAMReaderPosition position) {
    //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
    synchronized(lock) {
        // Assume that the access plan is going to tell us to start where we are and move forward.
        // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
        if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
            position.advancePosition(this.position.getBlockAddress());
        // Publish the new plan while still holding the lock; assigning it outside the
        // synchronized block (as before) left the write unordered with respect to the
        // lock-guarded readers of this.position.
        this.position = position;
    }
}
/**
 * Slides unread data toward the front of the buffer, discarding fully-consumed
 * blocks to reclaim capacity for incoming data. Called with the lock held.
 */
private void compactBuffer() {
    // Compact buffer to maximize storage space.
    int bytesToRemove = 0;

    // Look ahead to see if we can compact away the first block in the series.
    // NOTE(review): the guard reads buffer.position() < blockOffsets.get(1), i.e. the first
    // block is discarded while the read position is still before the second block's start;
    // verify this comparison is not inverted.
    // NOTE(review): bytesToRemove accumulates block *offsets*, which equal byte counts only
    // when the first retained block starts at offset 0 -- confirm that invariant holds.
    while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
        bytesToRemove += blockOffsets.remove();
        blockPositions.remove();
    }

    // If we end up with an empty block at the end of the series, compact this as well.
    if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
        bytesToRemove += buffer.position();
        blockOffsets.remove();
        blockPositions.remove();
    }

    // Recompute the read window that must survive the shift below.
    int finalBufferStart = buffer.position() - bytesToRemove;
    int finalBufferSize = buffer.remaining();

    buffer.position(bytesToRemove);
    buffer.compact();

    // Restore read semantics: data occupies [position,limit).
    buffer.position(finalBufferStart);
    buffer.limit(finalBufferStart+finalBufferSize);
}
/**
 * Push contents of incomingBuffer into the end of this buffer.
 * MUST be called from a thread that is NOT the reader thread.
 * @param incomingBuffer The data being pushed into this input stream.
 * @param position target position for the data.
 * @param filePosition file offset to which the reader position is advanced once the copy is accepted.
 */
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
    synchronized(lock) {
        try {
            compactBuffer();
            // Open up the buffer for more reading.
            buffer.limit(buffer.capacity());

            // Advance the position to take the most recent read into account.
            long lastReadPosition = position.getBlockAddress();

            // In validation mode, re-read the same region through Picard and compare byte-for-byte.
            byte[] validBytes = null;
            if(validatingInputStream != null) {
                validBytes = new byte[incomingBuffer.remaining()];

                byte[] currentBytes = new byte[incomingBuffer.remaining()];
                // Snapshot the incoming bytes without disturbing the incoming buffer's read window.
                int pos = incomingBuffer.position();
                int lim = incomingBuffer.limit();
                incomingBuffer.get(currentBytes);

                incomingBuffer.limit(lim);
                incomingBuffer.position(pos);

                long currentFilePointer = validatingInputStream.getFilePointer();
                validatingInputStream.seek(lastReadPosition << 16);
                validatingInputStream.read(validBytes);
                validatingInputStream.seek(currentFilePointer);

                if(!Arrays.equals(validBytes,currentBytes))
                    throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
            }

            this.position = position;
            position.advancePosition(filePosition);

            // NOTE(review): a single non-looped wait() is vulnerable to spurious wakeups and
            // does not re-check available space before the put() below; consider a while loop
            // re-testing the condition after each wakeup.
            if(buffer.remaining() < incomingBuffer.remaining()) {
                //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
                lock.wait();
                //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
            }

            // Queue list of block offsets / block positions.
            blockOffsets.add(buffer.position());
            blockPositions.add(lastReadPosition);

            buffer.put(incomingBuffer);

            // Set up the buffer for reading.
            buffer.flip();
            bufferFilled = true;

            // Wake the reader thread, which may be blocked waiting for data to arrive.
            lock.notify();
        }
        catch(Exception ex) {
            // Forward failures (including InterruptedException) to the reader thread
            // rather than losing them on the loader thread.
            reportException(ex);
            lock.notify();
        }
    }
}
/**
 * Records an asynchronous failure so the reader thread can rethrow it,
 * and wakes any thread blocked on the lock.
 * @param t the failure observed by the loader thread.
 */
void reportException(Throwable t) {
    synchronized(lock) {
        this.error = t;
        lock.notify();
    }
}
/**
 * Rethrows, on the calling (consumer) thread, any error previously reported by
 * the block loader via {@link #reportException}.  No-op when no error is pending.
 */
private void checkForErrors() {
    synchronized(lock) {
        if(error == null)
            return;
        // Wrap with local context but keep the original stack trace so the
        // failure points at the loader-side frame that actually failed.
        final ReviewedStingException rethrown = new ReviewedStingException(
                String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),
                error);
        rethrown.setStackTrace(error.getStackTrace());
        throw rethrown;
    }
}
/**
 * Reads the next byte of data from the input stream.
 * @return Next byte of data, from 0->255, as an int, or -1 at end of stream.
 */
@Override
public int read() {
    byte[] singleByte = new byte[1];
    // Signal end-of-stream per the InputStream contract instead of returning
    // a stale array element when no byte could be read.
    if(read(singleByte) <= 0)
        return -1;
    // Mask to an unsigned value: the raw (signed) byte would be negative for
    // values >= 0x80, violating the documented 0->255 range.
    return singleByte[0] & 0xFF;
}
/**
 * Fills the given byte array to the extent possible.
 * @param bytes byte array to be filled.
 * @return The number of bytes actually read.
 */
@Override
public int read(byte[] bytes) {
    // Delegate to the offset/length variant over the whole array.
    final int bytesRead = read(bytes, 0, bytes.length);
    return bytesRead;
}
/**
 * Reads up to {@code length} bytes into {@code bytes} starting at {@code offset},
 * requesting asynchronous buffer fills as the shared buffer drains.
 * @param bytes destination array.
 * @param offset first index in {@code bytes} to write.
 * @param length maximum number of bytes to copy.
 * @return the number of bytes actually read; may be less than {@code length}
 *         when the end of the stream is reached.
 */
@Override
public int read(byte[] bytes, final int offset, final int length) {
    int remaining = length;
    synchronized(lock) {
        while(remaining > 0) {
            // Check for error conditions during last read.
            checkForErrors();

            // If completely out of space, queue up another buffer fill.
            waitForBufferFill();

            // Couldn't manage to load any data at all; abort and return what's available.
            if(buffer.remaining() == 0)
                break;

            int numBytesToCopy = Math.min(buffer.remaining(),remaining);
            buffer.get(bytes,length-remaining+offset,numBytesToCopy);
            remaining -= numBytesToCopy;

            //if(remaining > 0)
            //    System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
            // TODO: Assert that we don't copy across a block boundary
        }

        // Notify any waiting threads that some of the contents of the buffer were removed.
        // The producer (copyIntoBuffer) may be parked waiting for free space.
        if(length-remaining > 0)
            lock.notify();
    }

    // Debug-only cross-check against the trusted Picard stream, when enabled.
    if(validatingInputStream != null) {
        byte[] validBytes = new byte[length];
        try {
            // NOTE(review): validBytes is allocated with size 'length' but filled
            // starting at 'offset'; for a non-zero offset the comparison below
            // reads indices never written (or read() overruns the array).
            // Presumably validation is only exercised with offset == 0 — confirm.
            validatingInputStream.read(validBytes,offset,length);

            for(int i = offset; i < offset+length; i++) {
                if(bytes[i] != validBytes[i]) {
                    System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
                    throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
                }
            }
        }
        catch(IOException ex) {
            throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
        }
    }

    return length - remaining;
}
/**
 * Closes the debug validation stream, if one was opened.  The shared byte
 * buffer holds no closeable resources at this level.
 * NOTE(review): presumably the cached file handles are owned and closed by the
 * dispatcher/file handle cache rather than this stream — confirm.
 */
public void close() {
    if(validatingInputStream != null) {
        try {
            validatingInputStream.close();
        }
        catch(IOException ex) {
            // The original message ("Unable to validate...") was copy-pasted from
            // the read path; report what actually failed.
            throw new ReviewedStingException("Unable to close validating Picard input stream",ex);
        }
    }
}
/**
 * Identifies the data source backing this stream.
 * @return Path of the BAM file backing this reader.
 */
public String getSource() {
    return reader.getSamFilePath();
}
/**
 * If the shared buffer is exhausted and more data remains in the file, queues
 * an asynchronous block load with the dispatcher and blocks until the loader
 * delivers data (or reports a failure).
 * @throws RuntimeEOFException if a fill completed but produced no data.
 */
private void waitForBufferFill() {
    synchronized(lock) {
        bufferFilled = false;
        if(buffer.remaining() == 0 && !eof()) {
            dispatcher.queueBlockLoad(position);
            try {
                // Wait in a loop to guard against spurious wakeups: proceed only
                // once the loader has delivered data (bufferFilled) or reported
                // an error (checked by the caller via checkForErrors).
                while(!bufferFilled && error == null)
                    lock.wait();
            }
            catch(InterruptedException ex) {
                // Restore the interrupt status before converting to an unchecked error.
                Thread.currentThread().interrupt();
                throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
            }

            if(bufferFilled && buffer.remaining() == 0)
                throw new RuntimeEOFException("No more data left in InputStream");
        }
    }
}
}

View File

@ -0,0 +1,188 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broad.tribble.util.BlockCompressedStreamConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
/**
 * An engine for loading BGZF blocks.  Runs forever on a worker thread: claims a
 * request from the dispatcher, reads (and optionally decompresses) the block at
 * the requested address, and copies the result into the requesting stream.
 */
class BlockLoader implements Runnable {
    /**
     * Coordinates the input queue.
     */
    private final BGZFBlockLoadingDispatcher dispatcher;

    /**
     * A cache from which to retrieve open file handles.
     */
    private final FileHandleCache fileHandleCache;

    /**
     * Whether asynchronous decompression should happen.
     */
    private final boolean decompress;

    /**
     * A direct input buffer for incoming data from disk.  Sized for a maximal
     * 64K BGZF block plus a trailing empty terminator block.  Reused across
     * requests, so this loader must not be shared between threads.
     */
    private final ByteBuffer inputBuffer;

    public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
        this.dispatcher = dispatcher;
        this.fileHandleCache = fileHandleCache;
        this.decompress = decompress;

        this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
        // BGZF fields are little-endian per the gzip/BGZF specification.
        inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Worker loop.  Failures are routed back to the requesting stream (which
     * rethrows them on its reader thread) so they do not kill this worker.
     */
    public void run() {
        for(;;) {
            SAMReaderPosition readerPosition = null;
            try {
                readerPosition = dispatcher.claimNextWorkRequest();
                FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());

                ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
                long nextBlockAddress = position(inputStream);
                fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);

                ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;

                BlockInputStream bamInputStream = readerPosition.getInputStream();
                bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
            }
            catch(Throwable error) {
                // Deliver the failure to whoever asked for the block; if no
                // requester is known there is nowhere to report it.
                if(readerPosition != null && readerPosition.getInputStream() != null)
                    readerPosition.getInputStream().reportException(error);
            }
        }
    }

    /**
     * Reads one complete BGZF block starting at the given file offset into the
     * shared input buffer, skipping zero-length blocks until a data-bearing
     * block or EOF is reached.
     * @param inputStream stream whose channel is positioned and read.
     * @param blockAddress file offset of the block to read.
     * @return the shared input buffer, flipped and ready for reading.  The
     *         buffer is reused on every call; consume it before the next request.
     * @throws IOException if the underlying channel read fails.
     */
    private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
        FileChannel channel = inputStream.getChannel();

        // Read the block header
        channel.position(blockAddress);

        int uncompressedDataSize = 0;
        int bufferSize = 0;

        do {
            inputBuffer.clear();
            inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
            channel.read(inputBuffer);

            inputBuffer.flip();
            if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
                throw new ReviewedStingException("BUG: unable to read the complete block header in one pass.");

            // Verify that the file was read at a valid block boundary by checking
            // the gzip/BGZF magic numbers and flags.
            if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
                    unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
                    unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
                    unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
                    unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
                    unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
                throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
            }

            inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
            // The BSIZE field stores (total block size - 1).
            bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;

            // Adjust buffer limits and finish reading the block.
            inputBuffer.limit(bufferSize);
            inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
            channel.read(inputBuffer);

            // Check the uncompressed length (ISIZE, last 4 bytes of the block).
            // If 0 and not at EOF, loop around and check the next block.
            uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
        }
        while(uncompressedDataSize == 0 && channel.position() < channel.size());

        // Prepare the buffer for reading.
        inputBuffer.flip();

        return inputBuffer;
    }

    /**
     * Decompresses a single BGZF block.
     * @param bgzfBlock a full BGZF block (header + CDATA + footer), positioned
     *                  for reading; its position/limit are consumed.
     * @return a freshly allocated buffer holding the uncompressed contents.
     * @throws DataFormatException if the deflate payload is corrupt.
     */
    private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
        final int compressedBufferSize = bgzfBlock.remaining();

        // Determine the uncompressed size from the ISIZE footer field.
        bgzfBlock.position(bgzfBlock.limit()-4);
        int uncompressedBufferSize = bgzfBlock.getInt();
        byte[] uncompressedContent = new byte[uncompressedBufferSize];

        // Bound the CDATA section of the buffer and copy it out.
        bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
        bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        byte[] compressedContent = new byte[bgzfBlock.remaining()];
        bgzfBlock.get(compressedContent);

        // Decompress with a raw (headerless) inflater.  end() releases the
        // inflater's native memory eagerly; the original leaked it until
        // finalization, which adds up in a long-running loader thread.
        final Inflater inflater = new Inflater(true);
        try {
            inflater.setInput(compressedContent);
            int bytesUncompressed = inflater.inflate(uncompressedContent);
            if(bytesUncompressed != uncompressedBufferSize)
                throw new ReviewedStingException("Error decompressing block");
        }
        finally {
            inflater.end();
        }

        return ByteBuffer.wrap(uncompressedContent);
    }

    /**
     * @param inputStream stream to query.
     * @return the current position of the stream's underlying channel.
     * @throws IOException if the channel position cannot be read.
     */
    private long position(final FileInputStream inputStream) throws IOException {
        return inputStream.getChannel().position();
    }

    /**
     * Reads an unsigned 8-bit value at the given absolute buffer position.
     */
    private int unpackUByte8(final ByteBuffer buffer,final int position) {
        return buffer.get(position) & 0xFF;
    }

    /**
     * Reads an unsigned 16-bit little-endian value at the given absolute buffer
     * position, widening through a mask so the sign bit is not propagated.
     */
    private int unpackUInt16(final ByteBuffer buffer,final int position) {
        return buffer.getShort(position) & 0xFFFF;
    }
}

View File

@ -0,0 +1,231 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
/**
 * Caches frequently used file handles, bounding how many are open at once.
 * Handles are checked out with {@link #claimFileInputStream} and returned with
 * {@link #releaseFileInputStream}; callers block when the cache is saturated.
 * TODO: Generalize to support arbitrary file handle caches.
 */
public class FileHandleCache {
    /**
     * The underlying data structure storing file handles.
     */
    private final FileHandleStorage fileHandleStorage;

    /**
     * How many file handles should be kept open at once.
     */
    private final int cacheSize;

    /**
     * A uniquifier: assign a unique ID to every instance of a file handle.
     */
    private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();

    /**
     * A shared lock, private so that outside users cannot notify it.
     */
    private final Object lock = new Object();

    /**
     * Indicates how many file handles are outstanding at this point.
     */
    private int numOutstandingFileHandles = 0;

    /**
     * Create a new file handle cache of the given cache size.
     * @param cacheSize how many readers to hold open at once.
     */
    public FileHandleCache(final int cacheSize) {
        this.cacheSize = cacheSize;
        fileHandleStorage = new FileHandleStorage();
    }

    /**
     * Retrieves or opens a file handle for the given reader ID, blocking while
     * the cache is saturated.
     * @param key The reader whose file should be opened.
     * @return A file input stream from the cache, if available, or otherwise newly opened.
     */
    public FileInputStream claimFileInputStream(final SAMReaderID key) {
        synchronized(lock) {
            FileInputStream inputStream = findExistingEntry(key);
            if(inputStream == null) {
                try {
                    // If the cache is maxed out, wait for another handle to be
                    // released.  Loop rather than a single wait: a spurious
                    // wakeup would otherwise open a handle past the cap.
                    while(numOutstandingFileHandles >= cacheSize) {
                        lock.wait();
                        // A just-released handle for this reader may now be
                        // cached; prefer reusing it over opening a new stream.
                        inputStream = findExistingEntry(key);
                        if(inputStream != null)
                            break;
                    }
                }
                catch(InterruptedException ex) {
                    // Restore the interrupt status and preserve the cause.
                    Thread.currentThread().interrupt();
                    throw new ReviewedStingException("Interrupted while waiting for a file handle",ex);
                }
                if(inputStream == null)
                    inputStream = openInputStream(key);
            }
            numOutstandingFileHandles++;

            return inputStream;
        }
    }

    /**
     * Releases the current reader and returns it to the cache.
     * @param key The reader.
     * @param inputStream The stream being used.
     */
    public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
        synchronized(lock) {
            numOutstandingFileHandles--;
            UniqueKey newID = allocateKey(key);
            fileHandleStorage.put(newID,inputStream);
            // Let any listeners know that another file handle has become available.
            lock.notify();
        }
    }

    /**
     * Finds an existing entry in the storage mechanism, removing it so no other
     * caller can claim the same handle.
     * @param key Reader.
     * @return a cached stream, if available.  Otherwise, null.
     */
    private FileInputStream findExistingEntry(final SAMReaderID key) {
        int existingHandles = getMostRecentUniquifier(key);

        // See if any of the keys currently exist in the repository.
        for(int i = 0; i <= existingHandles; i++) {
            UniqueKey uniqueKey = new UniqueKey(key,i);
            if(fileHandleStorage.containsKey(uniqueKey))
                return fileHandleStorage.remove(uniqueKey);
        }
        return null;
    }

    /**
     * Gets the most recent uniquifier used for the given reader.
     * @param reader Reader for which to determine uniqueness.
     * @return the highest uniquifier issued so far, or -1 if none has been.
     */
    private int getMostRecentUniquifier(final SAMReaderID reader) {
        if(keyCounter.containsKey(reader))
            return keyCounter.get(reader);
        else return -1;
    }

    /**
     * Issues the next uniquified key for the given reader.
     */
    private UniqueKey allocateKey(final SAMReaderID reader) {
        int uniquifier = getMostRecentUniquifier(reader)+1;
        keyCounter.put(reader,uniquifier);
        return new UniqueKey(reader,uniquifier);
    }

    /**
     * Opens a new stream for the given reader's file.
     */
    private FileInputStream openInputStream(final SAMReaderID reader) {
        try {
            return new FileInputStream(reader.getSamFilePath());
        }
        catch(IOException ex) {
            // Preserve the cause; the original discarded it.
            throw new StingException("Unable to open input file",ex);
        }
    }

    /**
     * Closes a stream being evicted from the cache.
     */
    private void closeInputStream(final FileInputStream inputStream) {
        try {
            inputStream.close();
        }
        catch(IOException ex) {
            // The original message said "open" -- a copy-paste slip.
            throw new StingException("Unable to close input file",ex);
        }
    }

    /**
     * Actually contains the file handles, purging them as they get too old.
     */
    private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
        /**
         * Remove the oldest entry when the cache overflows, closing its stream
         * and rolling back the reader's uniquifier counter.
         * @param entry Entry to consider removing.
         * @return True if the cache size has been exceeded.  False otherwise.
         */
        @Override
        protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
            synchronized (lock) {
                if(size() > cacheSize) {
                    keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
                    closeInputStream(entry.getValue());
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Uniquifies a key by adding a numerical uniquifier, so multiple handles
     * for the same reader can coexist in the cache.
     */
    private class UniqueKey {
        /**
         * The file handle's key.
         */
        private final SAMReaderID key;

        /**
         * A uniquifier, so that multiple of the same reader can exist in the cache.
         */
        private final int uniqueID;

        public UniqueKey(final SAMReaderID reader, final int uniqueID) {
            this.key = reader;
            this.uniqueID = uniqueID;
        }

        @Override
        public boolean equals(Object other) {
            if(!(other instanceof UniqueKey))
                return false;
            UniqueKey otherUniqueKey = (UniqueKey)other;
            return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
        }

        @Override
        public int hashCode() {
            // Fold in the uniquifier; hashing on the key alone made every
            // uniquified variant of a reader collide into one bucket.
            return 31 * key.hashCode() + uniqueID;
        }
    }
}

View File

@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
@ -40,28 +41,25 @@ import java.util.*;
*/
public class FilePointer {
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
protected final BAMOverlap overlap;
protected final List<GenomeLoc> locations;
protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
/**
* Does this file pointer point into an unmapped region?
*/
protected final boolean isRegionUnmapped;
public FilePointer() {
this((BAMOverlap)null);
}
public FilePointer(final GenomeLoc location) {
this.overlap = null;
this.locations = Collections.singletonList(location);
this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
}
public FilePointer(final BAMOverlap overlap) {
this.overlap = overlap;
this.locations = new ArrayList<GenomeLoc>();
this.isRegionUnmapped = false;
public FilePointer(final GenomeLoc... locations) {
this.locations.addAll(Arrays.asList(locations));
boolean foundMapped = false, foundUnmapped = false;
for(GenomeLoc location: locations) {
if(GenomeLoc.isUnmapped(location))
foundUnmapped = true;
else
foundMapped = true;
}
if(foundMapped && foundUnmapped)
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
this.isRegionUnmapped = foundUnmapped;
}
/**
@ -217,4 +215,20 @@ public class FilePointer {
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
combined.addFileSpans(initialElement.getKey(),fileSpan);
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("FilePointer:%n");
builder.append("\tlocations = {");
builder.append(Utils.join(";",locations));
builder.append("}%n\tregions = %n");
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
builder.append(entry.getKey());
builder.append("= {");
builder.append(entry.getValue());
builder.append("}");
}
return builder.toString();
}
}

View File

@ -25,419 +25,58 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.AbstractBAMFileIndex;
import net.sf.samtools.Bin;
import net.sf.samtools.BrowseableBAMIndex;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
import java.util.Iterator;
/**
* Shard intervals based on position within the BAM file.
*
* @author mhanna
* @version 0.1
* Handles the process of aggregating BAM intervals into individual shards.
* TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
*/
public class IntervalSharder {
private static Logger logger = Logger.getLogger(IntervalSharder.class);
public class IntervalSharder implements Iterator<FilePointer> {
/**
* The iterator actually laying out the data for BAM scheduling.
*/
private final PeekableIterator<FilePointer> wrappedIterator;
public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
return new IntervalSharder.FilePointerIterator(dataSource,loci);
/**
* The parser, for interval manipulation.
*/
private final GenomeLocParser parser;
public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
}
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
}
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
}
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
this.parser = parser;
}
public boolean hasNext() {
return wrappedIterator.hasNext();
}
/**
* A lazy-loading iterator over file pointers.
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
* @return The next file pointer to process.
*/
private static class FilePointerIterator implements Iterator<FilePointer> {
final SAMDataSource dataSource;
final GenomeLocSortedSet loci;
final PeekableIterator<GenomeLoc> locusIterator;
final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>();
public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
this.dataSource = dataSource;
this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
advance();
}
public boolean hasNext() {
return !cachedFilePointers.isEmpty();
}
public FilePointer next() {
if(!hasNext())
throw new NoSuchElementException("FilePointerIterator iteration is complete");
FilePointer filePointer = cachedFilePointers.remove();
if(cachedFilePointers.isEmpty())
advance();
return filePointer;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
}
private void advance() {
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
String contig = null;
// If the next section of the BAM to be processed is unmapped, handle this region separately.
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
contig = null;
while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
GenomeLoc nextLocus = locusIterator.next();
contig = nextLocus.getContig();
nextBatch.add(nextLocus);
}
}
if(nextBatch.size() > 0) {
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
}
}
public FilePointer next() {
FilePointer current = wrappedIterator.next();
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
current = current.combine(parser,wrappedIterator.next());
return current;
}
/**
* Merge / split intervals based on an awareness of the structure of the BAM file.
* @param dataSource
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
* @param loci
* @return
*/
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
if(contig == null) {
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
filePointer.addFileSpans(id,null);
return Collections.singletonList(filePointer);
}
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
List<FilePointer> filePointers = new ArrayList<FilePointer>();
FilePointer lastFilePointer = null;
BAMOverlap lastBAMOverlap = null;
Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
for(SAMReaderID id: dataSource.getReaderIDs()) {
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
// If this contig can't be found in the reference, skip over it.
if(referenceSequence == null && contig != null)
continue;
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
binMerger.addReader(id,
index,
referenceSequence.getSequenceIndex(),
index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
// Cache the reader for later data lookup.
readerToIndexMap.put(id,index);
}
PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
for(GenomeLoc location: loci) {
if(!location.getContig().equals(contig))
throw new ReviewedStingException("Location outside bounds of contig");
if(!binIterator.hasNext())
break;
int locationStart = location.getStart();
final int locationStop = location.getStop();
// Advance to first bin.
while(binIterator.peek().stop < locationStart)
binIterator.next();
// Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
// sure the extending bin is not pruned from the list.
List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
bamOverlaps.add(binIterator.next());
if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
bamOverlaps.add(binIterator.peek());
// Bins found; try to match bins with locations.
Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
while(locationStop >= locationStart) {
int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
while(binStop < locationStart && bamOverlapIterator.hasNext()) {
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
filePointers.add(lastFilePointer);
lastBAMOverlap = bamOverlapIterator.next();
lastFilePointer = new FilePointer(lastBAMOverlap);
binStart = lastFilePointer.overlap.start;
binStop = lastFilePointer.overlap.stop;
}
if(locationStart < binStart) {
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
filePointers.add(lastFilePointer);
lastFilePointer = null;
lastBAMOverlap = null;
}
final int regionStop = Math.min(locationStop,binStart-1);
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
lastFilePointer = new FilePointer(subset);
locationStart = regionStop + 1;
}
else if(locationStart > binStop) {
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
filePointers.add(lastFilePointer);
lastFilePointer = null;
lastBAMOverlap = null;
}
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
filePointers.add(new FilePointer(subset));
locationStart = locationStop + 1;
}
else {
if(lastFilePointer == null)
throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
// The start of the region overlaps the bin. Add the overlapping subset.
final int regionStop = Math.min(locationStop,binStop);
lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
locationStart = regionStop + 1;
}
}
}
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
filePointers.add(lastFilePointer);
// Lookup the locations for every file pointer in the index.
for(SAMReaderID id: readerToIndexMap.keySet()) {
BrowseableBAMIndex index = readerToIndexMap.get(id);
for(FilePointer filePointer: filePointers)
filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
}
return filePointers;
}
private static class BinMergingIterator implements Iterator<BAMOverlap> {
private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
}
public boolean hasNext() {
return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
}
public BAMOverlap next() {
if(!hasNext())
throw new NoSuchElementException("No elements left in merging iterator");
if(pendingOverlaps.isEmpty())
advance();
return pendingOverlaps.remove();
}
public void advance() {
List<ReaderBin> bins = new ArrayList<ReaderBin>();
int boundsStart, boundsStop;
// Prime the pump
if(binQueue.isEmpty())
return;
bins.add(getNextBin());
boundsStart = bins.get(0).getStart();
boundsStop = bins.get(0).getStop();
// Accumulate all the bins that overlap the current bin, in sorted order.
while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
ReaderBin bin = getNextBin();
bins.add(bin);
boundsStart = Math.min(boundsStart,bin.getStart());
boundsStop = Math.max(boundsStop,bin.getStop());
}
List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
int start = bins.get(0).getStart();
int stop = bins.get(0).getStop();
while(start <= boundsStop) {
// Find the next stopping point.
for(ReaderBin bin: bins) {
stop = Math.min(stop,bin.getStop());
if(start < bin.getStart())
stop = Math.min(stop,bin.getStart()-1);
}
range.add(new Pair<Integer,Integer>(start,stop));
// If the last entry added included the last element, stop.
if(stop >= boundsStop)
break;
// Find the next start.
start = stop + 1;
for(ReaderBin bin: bins) {
if(start >= bin.getStart() && start <= bin.getStop())
break;
else if(start < bin.getStart()) {
start = bin.getStart();
break;
}
}
}
// Add the next series of BAM overlaps to the window.
for(Pair<Integer,Integer> window: range) {
BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
for(ReaderBin bin: bins)
bamOverlap.addBin(bin.id,bin.bin);
pendingOverlaps.add(bamOverlap);
}
}
public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
private ReaderBin peekNextBin() {
if(binQueue.isEmpty())
throw new NoSuchElementException("No more bins are available");
BinQueueState current = binQueue.peek();
return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
}
/**
 * Consumes the next bin from the head reader state, re-queueing that reader
 * state if it still has bins remaining.
 * @return a ReaderBin wrapping the consumed bin.
 * @throws NoSuchElementException if the bin queue is empty.
 */
private ReaderBin getNextBin() {
    if (binQueue.isEmpty()) {
        throw new NoSuchElementException("No more bins are available");
    }
    final BinQueueState state = binQueue.remove();
    final ReaderBin result = new ReaderBin(state.getReaderID(), state.getIndex(), state.getReferenceSequence(), state.nextBin());
    if (state.hasNextBin()) {
        binQueue.add(state);
    }
    return result;
}
}
/**
* Filters out bins not at the lowest level in the tree.
*/
/**
 * Filters out bins not at the lowest level in the tree.
 *
 * Wraps an arbitrary bin iterator and yields only bins whose level (per the
 * supplied index) equals the deepest index level.
 */
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
    /** Index used to determine the level of each candidate bin. */
    private final BrowseableBAMIndex index;
    /** Source of candidate bins. */
    private final Iterator<Bin> wrappedIterator;
    /** Next lowest-level bin to return, or null once the source is exhausted. */
    private Bin nextBin;

    public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, final Iterator<Bin> iterator) {
        this.index = index;
        this.wrappedIterator = iterator;
        advance();
    }

    public boolean hasNext() {
        return nextBin != null;
    }

    public Bin next() {
        // Honor the Iterator contract: fail loudly rather than returning null
        // once the underlying iterator is exhausted.
        if (nextBin == null)
            throw new NoSuchElementException("No more lowest-level bins are available");
        Bin bin = nextBin;
        advance();
        return bin;
    }

    public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }

    /** Scan forward until a bin at the lowest index level is found, or the source runs dry. */
    private void advance() {
        nextBin = null;
        while(wrappedIterator.hasNext() && nextBin == null) {
            Bin bin = wrappedIterator.next();
            if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
                nextBin = bin;
        }
    }
}
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
}
/**
 * Tracks one reader's progress through its sequence of index bins, ordering
 * reader states by the genomic extent (first locus, then last locus) of their
 * current bin so that a priority queue can merge bins across readers.
 */
class BinQueueState implements Comparable<org.broadinstitute.sting.gatk.datasources.reads.BinQueueState> {
    private final SAMReaderID id;
    private final BrowseableBAMIndex index;
    private final int referenceSequence;
    private final PeekableIterator<Bin> bins;

    // Cached genomic extent of the current (peeked) bin; -1 when exhausted.
    private int firstLocusInCurrentBin;
    private int lastLocusInCurrentBin;

    public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
        this.id = id;
        this.index = index;
        this.referenceSequence = referenceSequence;
        this.bins = new PeekableIterator<Bin>(bins);
        refreshLocusInBinCache();
    }

    public SAMReaderID getReaderID() {
        return id;
    }

    public BrowseableBAMIndex getIndex() {
        return index;
    }

    public int getReferenceSequence() {
        return referenceSequence;
    }

    public boolean hasNextBin() {
        return bins.hasNext();
    }

    public Bin peekNextBin() {
        return bins.peek();
    }

    /**
     * Consumes the current bin and refreshes the locus cache for the next one.
     * @return the consumed bin.
     */
    public Bin nextBin() {
        Bin nextBin = bins.next();
        refreshLocusInBinCache();
        return nextBin;
    }

    public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
        if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
        if(!this.bins.hasNext()) return -1;
        // BUGFIX: this previously re-tested this.bins.hasNext(), making the
        // comparison asymmetric and the "return 1" branch unreachable.
        if(!other.bins.hasNext()) return 1;

        // Both BinQueueStates have next bins.  Before proceeding, make sure the bin cache is valid.
        if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
           other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
            throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
        }

        // Straight integer subtraction works here because lhsStart, rhsStart always positive.
        if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
            return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;

        // Straight integer subtraction works here because lhsStop, rhsStop always positive.
        return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
    }

    /** Re-derive the cached first/last locus of the current bin; -1 marks "no bin left". */
    private void refreshLocusInBinCache() {
        firstLocusInCurrentBin = -1;
        lastLocusInCurrentBin = -1;
        if(bins.hasNext()) {
            Bin bin = bins.peek();
            firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
            lastLocusInCurrentBin = index.getLastLocusInBin(bin);
        }
    }
}

View File

@ -22,30 +22,34 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools;
package org.broadinstitute.sting.gatk.datasources.reads;
import java.util.BitSet;
import java.util.Iterator;
/**
* A temporary solution to work around Java access rights issues:
* override chunk and make it public.
* TODO: Eliminate once we determine the final fate of the BAM index reading code.
* Batch granular file pointers into potentially larger shards.
*/
public class GATKBinList extends BinList {
public class LocusShardBalancer extends ShardBalancer {
/**
* Create a new BinList over sequenceCount sequences, consisting of the given bins.
* @param referenceSequence Reference sequence to which these bins are relevant.
* @param bins The given bins to include.
* Convert iterators of file pointers into balanced iterators of shards.
* @return An iterator over balanced shards.
*/
public GATKBinList(final int referenceSequence, final BitSet bins) {
super(referenceSequence,bins);
}
public Iterator<Shard> iterator() {
return new Iterator<Shard>() {
public boolean hasNext() {
return filePointers.hasNext();
}
/**
* Retrieves the bins stored in this list.
* @return A bitset where a bin is present in the list if the bit is true.
*/
public BitSet getBins() {
return super.getBins();
public Shard next() {
FilePointer current = filePointers.next();
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
current = current.combine(parser,filePointers.next());
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
}
public void remove() {
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
}
};
}
}

View File

@ -1,178 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * A sharding strategy for loci based on reading of the index.
 *
 * When reads are present, shards are driven by index-derived file pointers;
 * otherwise fixed-size shards are synthesized from the reference or the
 * requested intervals.
 */
public class LocusShardStrategy implements ShardStrategy {
    /**
     * The data source to use when performing this sharding.
     */
    private final SAMDataSource reads;

    /**
     * The parser for creating shards.
     */
    private final GenomeLocParser genomeLocParser;

    /**
     * An iterator through the available file pointers.
     */
    private final Iterator<FilePointer> filePointerIterator;

    /**
     * Construct the shard strategy from a data source, reference, parser and interval set.
     * @param reads Data source from which to load index data.
     * @param reference Reference data; used to derive intervals when none are supplied.
     * @param genomeLocParser Parser used to build the genomic locations backing each shard.
     * @param locations List of locations for which to load data; null means "entire input".
     */
    public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
        this.reads = reads;
        this.genomeLocParser = genomeLocParser;

        if(!reads.isEmpty()) {
            GenomeLocSortedSet intervals;
            if(locations == null) {
                // If no locations were passed in, shard the entire BAM file.
                SAMFileHeader header = reads.getHeader();
                intervals = new GenomeLocSortedSet(genomeLocParser);
                for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
                    // Check this sequence against the reference sequence dictionary.
                    // TODO: Do a better job of merging reads + reference.
                    SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
                    if(refSequenceRecord != null) {
                        // Clamp to the shorter of the two dictionaries' lengths.
                        final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
                        intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
                    }
                }
            }
            else
                intervals = locations;

            if(reads.isLowMemoryShardingEnabled())
                this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
            else
                this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
        }
        else {
            // No reads available: synthesize fixed-size shards directly from the
            // reference (or the requested intervals) so traversal still proceeds.
            final int maxShardSize = 100000;
            List<FilePointer> filePointers = new ArrayList<FilePointer>();
            if(locations == null) {
                for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
                    for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
                        final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
                        filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
                    }
                }
            }
            else {
                for(GenomeLoc interval: locations) {
                    // Split oversized intervals into maxShardSize-sized pieces.
                    while(interval.size() > maxShardSize) {
                        filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
                        interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
                    }
                    filePointers.add(new FilePointer(interval));
                }
            }
            filePointerIterator = filePointers.iterator();
        }
    }

    /**
     * Returns true if there are additional shards.
     *
     * @return false if we're done processing shards
     */
    public boolean hasNext() {
        return filePointerIterator.hasNext();
    }

    // Retained for backward compatibility; appears to have served only the
    // (now removed) commented-out debug instrumentation.  TODO: confirm unused and delete.
    public long shardNumber = 0;

    /**
     * Gets the next shard.
     *
     * @return the next shard
     */
    public LocusShard next() {
        FilePointer nextFilePointer = filePointerIterator.next();
        // Note: fileSpans may legitimately be null when no index data backs this pointer.
        return new LocusShard(genomeLocParser, reads, nextFilePointer.locations, nextFilePointer.fileSpans);
    }

    /** We don't support the remove command. */
    public void remove() {
        throw new UnsupportedOperationException("ShardStrategies don't support remove()");
    }

    /**
     * Makes the shard strategy iterable, i.e. usable in a for loop.
     *
     * @return this strategy as an iterator over shards.
     */
    public Iterator<Shard> iterator() {
        return this;
    }
}

View File

@ -1,68 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.Iterator;
/**
 * Handles the process of aggregating BAM intervals into individual shards.
 */
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
    /** The iterator actually laying out the data for BAM scheduling. */
    private final PeekableIterator<FilePointer> wrappedIterator;

    /** The parser, for interval manipulation. */
    private final GenomeLocParser parser;

    public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
        wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
        parser = loci.getGenomeLocParser();
    }

    public boolean hasNext() {
        return wrappedIterator.hasNext();
    }

    /**
     * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
     * @return The next file pointer to process.
     */
    public FilePointer next() {
        FilePointer accumulated = wrappedIterator.next();
        while(wrappedIterator.hasNext()) {
            final FilePointer upcoming = wrappedIterator.peek();
            // Stop combining as soon as the mapped/unmapped state differs or the
            // pointers are no longer zero-distance apart.
            final boolean sameMappedState = accumulated.isRegionUnmapped == upcoming.isRegionUnmapped;
            if(!sameMappedState || accumulated.minus(upcoming) != 0)
                break;
            accumulated = accumulated.combine(parser,wrappedIterator.next());
        }
        return accumulated;
    }

    /** Removal is not supported by this iterator. */
    public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
}

View File

@ -1,34 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.List;
/**
 * A single, monolithic shard bridging all available data.
 * @author mhanna
 * @version 0.1
 */
public class MonolithicShard extends Shard {
    /**
     * Creates a new monolithic shard of the given type.
     * @param shardType Type of the shard.  Must be either read or locus; cannot be intervalic.
     * @param locs Intervals that this monolithic shard should process.
     */
    public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
        super(parser, shardType, locs, readsDataSource, null, false);
        // Only locus and read traversals can be represented monolithically.
        final boolean supportedType = (shardType == ShardType.LOCUS) || (shardType == ShardType.READ);
        if (!supportedType) {
            throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
        }
    }

    /**
     * String representation of this shard.
     * @return "entire genome".
     */
    @Override
    public String toString() {
        return "entire genome";
    }
}

View File

@ -1,77 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
 * Create a giant shard representing all the data in the input BAM(s).
 *
 * @author mhanna
 * @version 0.1
 */
public class MonolithicShardStrategy implements ShardStrategy {
    /**
     * The single shard associated with this sharding strategy; nulled out once consumed.
     */
    private MonolithicShard shard;

    /**
     * Create a new shard strategy for shards of the given type.
     * @param shardType The shard type.
     */
    public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
        shard = new MonolithicShard(parser,readsDataSource,shardType,region);
    }

    /**
     * Convenience for using in a foreach loop.  Will NOT create a new, reset instance of the iterator;
     * will only return another copy of the active iterator.
     * @return A copy of this.
     */
    public Iterator<Shard> iterator() {
        return this;
    }

    /**
     * Returns true if the monolithic shard has not yet been consumed, or false otherwise.
     * @return True if the shard has not yet been consumed, false otherwise.
     */
    public boolean hasNext() {
        return shard != null;
    }

    /**
     * Returns the monolithic shard if it has not already been retrieved.
     * @return The monolithic shard.
     * @throws NoSuchElementException if no such data exists.
     */
    public Shard next() {
        // FIX: corrected "retrived" typo in the exception message.
        if(shard == null)
            throw new NoSuchElementException("Monolithic shard has already been retrieved.");

        Shard working = shard;
        shard = null;
        return working;
    }

    /**
     * Mandated by the interface, but is unsupported in this context.  Will throw an exception always.
     */
    public void remove() {
        throw new UnsupportedOperationException("Cannot remove from a shard strategy");
    }

    /**
     * Mandated by the interface, but is unsupported in this context.  Will throw an exception always.
     * @param size adjust the next size to this
     */
    public void adjustNextShardSize( long size ) {
        throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
    }
}

View File

@ -35,10 +35,15 @@ import java.util.Map;
* @version 0.1
*/
public class ReadShard extends Shard {
/**
* What is the maximum number of reads which should go into a read shard.
*/
public static final int MAX_READS = 10000;
/**
* The reads making up this shard.
*/
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
@ -66,7 +71,7 @@ public class ReadShard extends Shard {
* @return True if this shard's buffer is full (and the shard can buffer reads).
*/
public boolean isBufferFull() {
return reads.size() > ReadShardStrategy.MAX_READS;
return reads.size() > ReadShard.MAX_READS;
}
/**

View File

@ -0,0 +1,115 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.SAMFileSpan;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
/**
 * Divide up large file pointers containing reads into more manageable subcomponents.
 */
public class ReadShardBalancer extends ShardBalancer {
    /**
     * Convert iterators of file pointers into balanced iterators of shards.
     * @return An iterator over balanced shards.
     */
    public Iterator<Shard> iterator() {
        return new Iterator<Shard>() {
            /**
             * The cached shard to be returned next.  Prefetched in the peekable iterator style.
             */
            private Shard nextShard = null;

            /**
             * The file pointer currently being processed.
             */
            private FilePointer currentFilePointer;

            /**
             * Ending position of the last shard in the file.
             */
            private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();

            // Instance initializer: prime with the first file pointer and prefetch the first shard.
            {
                if(filePointers.hasNext())
                    currentFilePointer = filePointers.next();
                advance();
            }

            public boolean hasNext() {
                return nextShard != null;
            }

            public Shard next() {
                if(!hasNext())
                    throw new NoSuchElementException("No next read shard available");
                Shard currentShard = nextShard;
                advance();
                return currentShard;
            }

            public void remove() {
                throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
            }

            // Prefetch the next non-empty shard, advancing through file pointers as needed.
            private void advance() {
                Map<SAMReaderID,SAMFileSpan> shardPosition;
                nextShard = null;

                Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();

                while(selectedReaders.size() == 0 && currentFilePointer != null) {
                    shardPosition = currentFilePointer.fileSpans;

                    // Trim away any file regions already consumed by the previous shard.
                    for(SAMReaderID id: shardPosition.keySet()) {
                        SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
                        if(!fileSpan.isEmpty())
                            selectedReaders.put(id,fileSpan);
                    }

                    if(selectedReaders.size() > 0) {
                        // Unconsumed data remains: build a shard over it and fill it with reads.
                        Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
                        readsDataSource.fillShard(shard);

                        // Only a shard that actually received reads is handed out.
                        if(!shard.isBufferEmpty()) {
                            nextShard = shard;
                            break;
                        }
                    }

                    // Nothing usable here; move on to the next file pointer (null when exhausted).
                    selectedReaders.clear();
                    currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
                }

                // Remember where the readers now stand so the next shard resumes from there.
                position = readsDataSource.getCurrentPosition();
            }
        };
    }
}

View File

@ -1,183 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*;
/**
 * The sharding strategy for reads using a simple counting mechanism.  Each read shard
 * has a specific number of reads (default to 10K) which is configured in the constructor.
 * @author aaron
 * @version 1.0
 * @date Apr 14, 2009
 */
public class ReadShardStrategy implements ShardStrategy {
    /**
     * What is the maximum number of reads which should go into a read shard.
     */
    protected static final int MAX_READS = 10000;

    /**
     * The data source used to shard.
     */
    private final SAMDataSource dataSource;

    /**
     * The intervals to be processed.
     */
    private final GenomeLocSortedSet locations;

    /**
     * The cached shard to be returned next.  Prefetched in the peekable iterator style.
     */
    private Shard nextShard = null;

    /** our storage of the genomic locations they'd like to shard over */
    private final List<FilePointer> filePointers = new ArrayList<FilePointer>();

    /**
     * Iterator over the list of file pointers.
     */
    private final Iterator<FilePointer> filePointerIterator;

    /**
     * The file pointer currently being processed.
     */
    private FilePointer currentFilePointer;

    /**
     * Ending position of the last shard in the file.
     */
    private Map<SAMReaderID,SAMFileSpan> position;

    /**
     * An indicator whether the strategy has sharded into the unmapped region.
     */
    private boolean isIntoUnmappedRegion = false;

    // Parser used to build genome locations for the shards being produced.
    private final GenomeLocParser parser;

    /**
     * Create a new read shard strategy, loading read shards from the given BAM file.
     * @param dataSource Data source from which to load shards.
     * @param locations intervals to use for sharding.
     */
    public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
        this.dataSource = dataSource;
        this.parser = parser;
        // Record the data source's current position so the first shard starts where the readers stand.
        this.position = this.dataSource.getCurrentPosition();
        this.locations = locations;

        // With intervals: drive sharding from the index.  Without: iterate the (empty) pointer list,
        // which routes advance() to the interval-less branch below.
        if(locations != null)
            filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
        else
            filePointerIterator = filePointers.iterator();

        if(filePointerIterator.hasNext())
            currentFilePointer = filePointerIterator.next();

        // Prefetch the first shard.
        advance();
    }

    /**
     * do we have another read shard?
     * @return True if any more data is available.  False otherwise.
     */
    public boolean hasNext() {
        return nextShard != null;
    }

    /**
     * Retrieves the next shard, if available.
     * @return The next shard, if available.
     * @throws java.util.NoSuchElementException if no such shard is available.
     */
    public Shard next() {
        if(!hasNext())
            throw new NoSuchElementException("No next read shard available");
        Shard currentShard = nextShard;
        advance();
        return currentShard;
    }

    /**
     * Prefetches the next non-empty shard into nextShard, or leaves it null when exhausted.
     */
    public void advance() {
        Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
        nextShard = null;

        if(locations != null) {
            Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();

            while(selectedReaders.size() == 0 && currentFilePointer != null) {
                shardPosition = currentFilePointer.fileSpans;

                // Trim off any file regions that precede the position consumed by the last shard.
                for(SAMReaderID id: shardPosition.keySet()) {
                    SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
                    if(!fileSpan.isEmpty())
                        selectedReaders.put(id,fileSpan);
                }

                if(selectedReaders.size() > 0) {
                    // Unconsumed data remains: build a shard over it and fill it with reads.
                    Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
                    dataSource.fillShard(shard);

                    // Only a shard that actually received reads is handed out.
                    if(!shard.isBufferEmpty()) {
                        nextShard = shard;
                        break;
                    }
                }

                // Nothing usable here; move on to the next file pointer (null when exhausted).
                selectedReaders.clear();
                currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
            }
        }
        else {
            // todo -- this nulling of intervals is a bit annoying since readwalkers without
            // todo -- any -L values need to be special cased throughout the code.
            Shard shard = new ReadShard(parser,dataSource,position,null,false);
            dataSource.fillShard(shard);
            nextShard = !shard.isBufferEmpty() ? shard : null;
        }

        // Remember where the readers now stand so the next shard resumes from there.
        this.position = dataSource.getCurrentPosition();
    }

    /**
     * @throws UnsupportedOperationException always.
     */
    public void remove() {
        throw new UnsupportedOperationException("Remove not supported");
    }

    /**
     * Convenience method for using ShardStrategy in an foreach loop.
     * @return A iterator over shards.
     */
    public Iterator<Shard> iterator() {
        return this;
    }
}

View File

@ -1,33 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.Bin;
import net.sf.samtools.BrowseableBAMIndex;
/**
 * Pairs a single BAM index bin with the reader and reference sequence it came
 * from, exposing the first/last locus of the bin via the associated index.
 */
class ReaderBin {
    // Identifier of the reader that produced this bin.
    public final SAMReaderID id;
    // Browseable index used to translate the bin into locus coordinates.
    public final BrowseableBAMIndex index;
    // Reference sequence index to which this bin applies.
    public final int referenceSequence;
    // The BAM index bin itself.
    public final Bin bin;

    public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
        this.id = id;
        this.index = index;
        this.referenceSequence = referenceSequence;
        this.bin = bin;
    }

    /** @return the first locus covered by this bin, per the index. */
    public int getStart() {
        return index.getFirstLocusInBin(bin);
    }

    /** @return the last locus covered by this bin, per the index. */
    public int getStop() {
        return index.getLastLocusInBin(bin);
    }
}

View File

@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -71,7 +73,7 @@ public class SAMDataSource {
/**
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
*/
private final GenomeLocParser genomeLocParser;
protected final GenomeLocParser genomeLocParser;
/**
* Identifiers for the readers driving this data source.
@ -91,13 +93,18 @@ public class SAMDataSource {
/**
* How far along is each reader?
*/
private final Map<SAMReaderID, SAMFileSpan> readerPositions = new HashMap<SAMReaderID,SAMFileSpan>();
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
/**
* The merged header.
*/
private final SAMFileHeader mergedHeader;
/**
* The constituent headers of the unmerged files.
*/
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
/**
* The sort order of the BAM files. Files without a sort order tag are assumed to be
* in coordinate order.
@ -131,17 +138,24 @@ public class SAMDataSource {
private final SAMResourcePool resourcePool;
/**
* Whether to enable the new low-memory sharding mechanism.
* Asynchronously loads BGZF blocks.
*/
private boolean enableLowMemorySharding = false;
private final BGZFBlockLoadingDispatcher dispatcher;
/**
* How are threads allocated.
*/
private final ThreadAllocation threadAllocation;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
*/
public SAMDataSource(Collection<SAMReaderID> samFiles,GenomeLocParser genomeLocParser) {
public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
this(
samFiles,
threadAllocation,
numFileHandles,
genomeLocParser,
false,
SAMFileReader.ValidationStringency.STRICT,
@ -150,8 +164,7 @@ public class SAMDataSource {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
false,
false,
true);
false);
}
/**
@ -159,6 +172,8 @@ public class SAMDataSource {
*/
public SAMDataSource(
Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness,
@ -167,9 +182,10 @@ public class SAMDataSource {
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents,
boolean enableLowMemorySharding) {
boolean generateExtendedEvents) {
this( samFiles,
threadAllocation,
numFileHandles,
genomeLocParser,
useOriginalBaseQualities,
strictness,
@ -182,8 +198,7 @@ public class SAMDataSource {
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
(byte) -1,
enableLowMemorySharding);
(byte) -1);
}
/**
@ -205,6 +220,8 @@ public class SAMDataSource {
*/
public SAMDataSource(
Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness,
@ -217,13 +234,19 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
byte defaultBaseQualities,
boolean enableLowMemorySharding) {
this.enableLowMemorySharding(enableLowMemorySharding);
byte defaultBaseQualities) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
readerIDs = samFiles;
this.threadAllocation = threadAllocation;
// TODO: Consider a borrowed-thread dispatcher implementation.
if(this.threadAllocation.getNumIOThreads() > 0)
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
else
dispatcher = null;
validationStringency = strictness;
for (SAMReaderID readerID : samFiles) {
if (!readerID.samFile.canRead())
@ -235,10 +258,13 @@ public class SAMDataSource {
SAMReaders readers = resourcePool.getAvailableReaders();
// Determine the sort order.
for(SAMFileReader reader: readers.values()) {
for(SAMReaderID readerID: readerIDs) {
// Get the sort order, forcing it to coordinate if unsorted.
SAMFileReader reader = readers.getReader(readerID);
SAMFileHeader header = reader.getFileHeader();
headers.put(readerID,header);
if ( header.getReadGroups().isEmpty() ) {
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
@ -275,7 +301,7 @@ public class SAMDataSource {
qmode,
refReader,
defaultBaseQualities);
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
@ -296,12 +322,10 @@ public class SAMDataSource {
originalToMergedReadGroupMappings.put(id,mappingToMerged);
}
if(enableLowMemorySharding) {
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.samFile);
if(indexFile != null)
bamIndices.put(id,new GATKBAMIndex(indexFile));
}
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.samFile);
if(indexFile != null)
bamIndices.put(id,new GATKBAMIndex(indexFile));
}
resourcePool.releaseReaders(readers);
@ -314,22 +338,6 @@ public class SAMDataSource {
*/
public ReadProperties getReadsInfo() { return readProperties; }
/**
* Enable experimental low-memory sharding.
* @param enable True to enable sharding. False otherwise.
*/
public void enableLowMemorySharding(final boolean enable) {
enableLowMemorySharding = enable;
}
/**
* Returns whether low-memory sharding is enabled.
* @return True if enabled, false otherwise.
*/
public boolean isLowMemoryShardingEnabled() {
return enableLowMemorySharding;
}
/**
* Checks to see whether any reads files are supplying data.
* @return True if no reads files are supplying data to the traversal; false otherwise.
@ -368,7 +376,7 @@ public class SAMDataSource {
* Retrieves the current position within the BAM file.
* @return A mapping of reader to current position.
*/
public Map<SAMReaderID,SAMFileSpan> getCurrentPosition() {
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
return readerPositions;
}
@ -381,7 +389,7 @@ public class SAMDataSource {
}
public SAMFileHeader getHeader(SAMReaderID id) {
return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader();
return headers.get(id);
}
/**
@ -404,45 +412,21 @@ public class SAMDataSource {
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
}
/**
* No read group collisions at this time because only one SAM file is currently supported.
* @return False always.
*/
public boolean hasReadGroupCollisions() {
return hasReadGroupCollisions;
}
/**
* True if all readers have an index.
* @return True if all readers have an index.
*/
public boolean hasIndex() {
if(enableLowMemorySharding)
return readerIDs.size() == bamIndices.size();
else {
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
if(!reader.hasIndex())
return false;
}
return true;
}
return readerIDs.size() == bamIndices.size();
}
/**
* Gets the index for a particular reader. Always preloaded.
* TODO: Should return object of type GATKBAMIndex, but cannot because there
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
* TODO: sharding system goes live.
* @param id Id of the reader.
* @return The index. Will preload the index if necessary.
*/
public Object getIndex(final SAMReaderID id) {
if(enableLowMemorySharding)
return bamIndices.get(id);
else {
SAMReaders readers = resourcePool.getReadersWithoutLocking();
return readers.getReader(id).getBrowseableIndex();
}
public GATKBAMIndex getIndex(final SAMReaderID id) {
return bamIndices.get(id);
}
/**
@ -454,7 +438,7 @@ public class SAMDataSource {
}
/**
* Gets the cumulative read metrics for shards already processed.
* Gets the cumulative read metrics for shards already processed.
* @return Cumulative read metrics.
*/
public ReadMetrics getCumulativeReadMetrics() {
@ -507,10 +491,6 @@ public class SAMDataSource {
}
public StingSAMIterator seek(Shard shard) {
// todo: refresh monolithic sharding implementation
if(shard instanceof MonolithicShard)
return seekMonolithic(shard);
if(shard.buffersReads()) {
return shard.iterator();
}
@ -540,7 +520,7 @@ public class SAMDataSource {
*/
private void initializeReaderPositions(SAMReaders readers) {
for(SAMReaderID id: getReaderIDs())
readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads());
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
}
/**
@ -548,7 +528,6 @@ public class SAMDataSource {
* @param readers Readers from which to load data.
* @param shard The shard specifying the data limits.
* @param enableVerification True to verify. For compatibility with old sharding strategy.
* TODO: Collapse this flag when the two sharding systems are merged.
* @return An iterator over the selected data.
*/
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
@ -559,14 +538,20 @@ public class SAMDataSource {
for(SAMReaderID id: getReaderIDs()) {
CloseableIterator<SAMRecord> iterator = null;
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
continue;
iterator = shard.getFileSpans().get(id) != null ?
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
readers.getReader(id).queryUnmapped();
// TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
// TODO: Kill this check once we've proven that the design elements are gone.
if(shard.getFileSpans().get(id) == null)
throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id);
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
}
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
if(shard.getGenomeLocs() != null)
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
mergingIterator.addIterator(readers.getReader(id),iterator);
}
@ -584,33 +569,6 @@ public class SAMDataSource {
readProperties.defaultBaseQualities());
}
/**
* A stopgap measure to handle monolithic sharding
* @param shard the (monolithic) shard.
* @return An iterator over the monolithic shard.
*/
private StingSAMIterator seekMonolithic(Shard shard) {
SAMReaders readers = resourcePool.getAvailableReaders();
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
for(SAMReaderID id: getReaderIDs())
mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
return applyDecoratingIterators(shard.getReadMetrics(),
shard instanceof ReadShard,
readProperties.useOriginalBaseQualities(),
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
readProperties.getDownsamplingMethod().toFraction,
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
readProperties.getSupplementalFilters(),
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.defaultBaseQualities());
}
/**
* Adds this read to the given shard.
* @param shard The shard to which to add the read.
@ -618,7 +576,7 @@ public class SAMDataSource {
* @param read The read to add to the shard.
*/
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
shard.addRead(read);
readerPositions.put(id,endChunk);
}
@ -689,19 +647,6 @@ public class SAMDataSource {
this.maxEntries = maxEntries;
}
/**
* Dangerous internal method; retrieves any set of readers, whether in iteration or not.
* Used to handle non-exclusive, stateless operations, such as index queries.
* @return Any collection of SAMReaders, whether in iteration or not.
*/
protected SAMReaders getReadersWithoutLocking() {
synchronized(this) {
if(allResources.size() == 0)
createNewResource();
}
return allResources.get(0);
}
/**
* Choose a set of readers from the pool to use for this query. When complete,
* @return
@ -753,6 +698,11 @@ public class SAMDataSource {
*/
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
/**
 * The input streams backing each reader, keyed by reader ID.
 */
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
/**
* Derive a new set of readers from the Reads metadata.
* @param readerIDs reads to load.
@ -760,12 +710,20 @@ public class SAMDataSource {
*/
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = new SAMFileReader(readerID.samFile);
File indexFile = findIndexFile(readerID.samFile);
SAMFileReader reader = null;
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = new SAMFileReader(blockInputStream,indexFile,false);
inputStreams.put(readerID,blockInputStream);
}
else
reader = new SAMFileReader(readerID.samFile,indexFile,false);
reader.setSAMRecordFactory(factory);
reader.enableFileSource(true);
reader.enableIndexMemoryMapping(false);
if(!enableLowMemorySharding)
reader.enableIndexCaching(true);
reader.setValidationStringency(validationStringency);
final SAMFileHeader header = reader.getFileHeader();
@ -786,6 +744,15 @@ public class SAMDataSource {
return readers.get(id);
}
/**
* Retrieve the input stream backing a reader.
* @param id The ID of the reader to retrieve.
* @return the reader associated with the given id.
*/
public BlockInputStream getInputStream(final SAMReaderID id) {
return inputStreams.get(id);
}
/**
* Searches for the reader id of this reader.
* @param reader Reader for which to search.
@ -883,7 +850,7 @@ public class SAMDataSource {
* Filters out reads that do not overlap the current GenomeLoc.
* Note the custom implementation: BAM index querying returns all reads that could
* possibly overlap the given region (and quite a few extras). In order not to drag
* down performance, this implementation is highly customized to its task.
* down performance, this implementation is highly customized to its task.
*/
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
/**
@ -903,7 +870,7 @@ public class SAMDataSource {
/**
* Custom representation of interval bounds.
* Makes it simpler to track current position.
* Makes it simpler to track current position.
*/
private int[] intervalContigIndices;
private int[] intervalStarts;
@ -941,7 +908,7 @@ public class SAMDataSource {
i++;
}
}
advance();
}
@ -1070,6 +1037,40 @@ public class SAMDataSource {
return indexFile;
}
/**
 * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped.  The outgoing stream
 * will be as granular as possible given our current knowledge of the best ways to split up BAM files.
 * @param shardBalancer groups the maximally-granular file pointers into shards of workable size.
 * @return An iterator that spans all reads in all BAM files.
 */
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
 * Creates a BAM schedule over all mapped reads in the BAM file.  A 'mapped' read is
 * presumably one assigned a position on a reference contig -- TODO confirm the exact
 * definition used by IntervalSharder.shardOverMappedReads.
 * @param sequenceDictionary sequence dictionary describing the contigs over which to shard.
 * @param shardBalancer groups the maximally-granular file pointers into shards of workable size.
 * @return An iterator over shards covering the mapped reads in all BAM files.
 */
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
 * Create a schedule for processing the initialized BAM file using the given interval list.
 * The returned schedule should be as granular as possible.
 * @param intervals The list of intervals for which to create the schedule.  Must not be null.
 * @param shardBalancer groups the maximally-granular file pointers into shards of workable size.
 * @return A granular iterator over file pointers.
 * @throws ReviewedStingException if intervals is null.
 */
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
if(intervals == null)
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
return shardBalancer;
}
}

View File

@ -0,0 +1,120 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.List;
/**
 * Tracks a reader's progress through the BGZF blocks of a BAM file span.
 * Wraps the chunk list of a GATKBAMFileSpan and serves up, in file order,
 * the address of the next block the asynchronous block loader should read.
 */
class SAMReaderPosition {
    /** The reader whose access plan this position tracks. */
    private final SAMReaderID reader;

    /** The input stream into which loaded blocks are fed. */
    private final BlockInputStream inputStream;

    /** The chunks of the file span to traverse, in file order. */
    private final List<GATKChunk> positions;

    /** Iterator over the chunks; peeked to locate the next block start. */
    private PeekableIterator<GATKChunk> positionIterator;

    /**
     * Stores the next block address to read, or -1 if no such block is available.
     */
    private long nextBlockAddress;

    SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
        this.reader = reader;
        this.inputStream = inputStream;
        this.positions = fileSpan.getGATKChunks();
        initialize();
    }

    public SAMReaderID getReader() {
        return reader;
    }

    public BlockInputStream getInputStream() {
        return inputStream;
    }

    /**
     * Retrieves the next block address to be read.
     * @return Next block address to be read, or -1 if no block is available.
     */
    public long getBlockAddress() {
        return nextBlockAddress;
    }

    /**
     * Resets the SAM reader position to its original state.
     */
    public void reset() {
        initialize();
    }

    /**
     * (Re)starts iteration at the first chunk of the span; nextBlockAddress
     * becomes -1 if the span contains no chunks.
     */
    private void initialize() {
        this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
        if(positionIterator.hasNext())
            nextBlockAddress = positionIterator.peek().getBlockStart();
        else
            nextBlockAddress = -1;
    }

    /**
     * Advances the current position to the next block to read, given the current position in the file.
     * @param filePosition The current position within the file.
     */
    void advancePosition(final long filePosition) {
        nextBlockAddress = filePosition;

        // Check the current file position against the iterator; if the iterator is before the current file position,
        // draw the iterator forward.  Remember when performing the check that coordinates are half-open!
        try {
            while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
                positionIterator.next();

                // If the next chunk starts past the current file position, jump directly
                // to that chunk's block start instead of continuing from filePosition.
                if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
                    nextBlockAddress = positionIterator.peek().getBlockStart();
                    break;
                }
            }
        }
        catch(Exception ex) {
            // Fix: the original rethrow discarded the cause and carried an empty message,
            // making failures here undiagnosable.  Preserve both context and cause.
            throw new ReviewedStingException("Unable to advance position past " + filePosition + " for reader " + reader, ex);
        }
    }

    /**
     * Tests whether the given file position lies beyond the end of the given chunk.
     * A position exactly at the block end with a zero intra-block end offset counts
     * as past the chunk, since coordinates are half-open.
     */
    private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) {
        return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
    }
}

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
/**
 * Balances maximally granular file pointers into shards of reasonable size.
 * Concrete subclasses supply the iteration strategy; state arrives via initialize().
 */
public abstract class ShardBalancer implements Iterable<Shard> {
// Data source over the reads being sharded; set by initialize().
protected SAMDataSource readsDataSource;
// Maximally-granular file pointers to be grouped into shards; set by initialize().
protected PeekableIterator<FilePointer> filePointers;
// Parser for creating/merging genome locations while balancing; set by initialize().
protected GenomeLocParser parser;
/**
 * Supplies the state this balancer needs; call before iterating.
 * @param readsDataSource data source over the reads being sharded.
 * @param filePointers maximally-granular file pointers to group into shards.
 * @param parser parser for creating/merging genome locations.
 */
public void initialize(final SAMDataSource readsDataSource, final Iterator<FilePointer> filePointers, final GenomeLocParser parser) {
this.readsDataSource = readsDataSource;
this.filePointers = new PeekableIterator<FilePointer>(filePointers);
this.parser = parser;
}
}

View File

@ -1,31 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import java.util.Iterator;
/**
*
* User: aaron
* Date: Apr 10, 2009
* Time: 4:55:37 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Apr 10, 2009
 * <p/>
 * Interface ShardStrategy
 * <p/>
 * The base interface for the sharding strategy.  Previously this was an abstract base
 * class, but it is now an interface to accommodate read-based sharding.
 */
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
}

View File

@ -1,117 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 7:09:22 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date Apr 6, 2009
* <p/>
* Class ShardStrategyFactory
* <p/>
* The Shard Strategy Factory, use this class to create and transfer shard strategies
* between different approaches.
*/
public class ShardStrategyFactory {
/**
 * Available strategies for dividing the input data into shards.
 */
public enum SHATTER_STRATEGY {
MONOLITHIC, // Put all of the available data into one shard.
LOCUS_EXPERIMENTAL, // Divide the data by locus; see LocusShardStrategy.
READS_EXPERIMENTAL // Divide the data by reads; see ReadShardStrategy.
}
/**
 * Get a new shatter strategy.  Convenience overload with no read-count limit.
 *
 * @param readsDataSource File pointer to BAM.
 * @param referenceDataSource File pointer to reference.
 * @param strat what's our strategy - SHATTER_STRATEGY type
 * @param dic the seq dictionary.  NOTE(review): ignored by every current strategy.
 * @param startingSize the starting size.  NOTE(review): ignored by every current strategy.
 * @param genomeLocParser parser handed to the created strategy for building genome locations.
 * @return a shard strategy capable of dividing input data into shards.
 */
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L);
}
/**
 * Get a new shatter strategy.
 *
 * @param readsDataSource File pointer to BAM.
 * @param referenceDataSource File pointer to reference.
 * @param strat what's our strategy - SHATTER_STRATEGY type
 * @param dic the seq dictionary.  NOTE(review): ignored by every current strategy.
 * @param startingSize the starting size.  NOTE(review): ignored by every current strategy.
 * @param genomeLocParser parser handed to the created strategy for building genome locations.
 * @param limitByCount maximum number of reads.  NOTE(review): currently ignored.
 * @return a shard strategy capable of dividing input data into shards.
 * @throws ReviewedStingException if the requested strategy isn't implemented.
 */
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
switch (strat) {
case LOCUS_EXPERIMENTAL:
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null);
case READS_EXPERIMENTAL:
return new ReadShardStrategy(genomeLocParser,readsDataSource,null);
default:
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
}
}
/**
 * Get a new shatter strategy over an interval list.  Convenience overload with no data-count limit.
 *
 * @param readsDataSource File pointer to BAM.
 * @param referenceDataSource File pointer to reference.
 * @param strat what's our strategy - SHATTER_STRATEGY type
 * @param dic the seq dictionary.  NOTE(review): ignored by every current strategy.
 * @param startingSize the starting size.  NOTE(review): ignored by every current strategy.
 * @param genomeLocParser parser handed to the created strategy for building genome locations.
 * @param lst intervals over which to shard.
 * @return a shard strategy capable of dividing input data into shards.
 */
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l);
}
/**
 * Get a new shatter strategy over an interval list.
 *
 * @param readsDataSource The reads used to shatter this file.
 * @param referenceDataSource The reference used to shatter this file.
 * @param strat what's our strategy - SHATTER_STRATEGY type
 * @param dic the seq dictionary.  NOTE(review): ignored by every current strategy.
 * @param startingSize the starting size.  NOTE(review): ignored by every current strategy.
 * @param genomeLocParser parser handed to the created strategy for building genome locations.
 * @param lst intervals over which to shard.
 * @param limitDataCount maximum amount of data.  NOTE(review): currently ignored.
 * @return A strategy for shattering this data.
 * @throws ReviewedStingException if the requested strategy isn't implemented.
 */
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
switch (strat) {
case LOCUS_EXPERIMENTAL:
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst);
case READS_EXPERIMENTAL:
return new ReadShardStrategy(genomeLocParser, readsDataSource,lst);
default:
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented");
}
}
}

View File

@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
// initialize reads
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
// intervals
GenomeLocSortedSet intervalSortedSet = null;
@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();

View File

@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex;
import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
* Loads reference data from fasta file
* Looks for fai and dict files, and tries to create them if they don't exist
*/
public class ReferenceDataSource {
private IndexedFastaSequenceFile index;
private IndexedFastaSequenceFile reference;
/** our log, which we want to capture anything from this class */
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
@ -173,7 +185,7 @@ public class ReferenceDataSource {
logger.info("Treating existing index file as complete.");
}
index = new CachingIndexedFastaSequenceFile(fastaFile);
reference = new CachingIndexedFastaSequenceFile(fastaFile);
} catch (IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
@ -192,6 +204,52 @@ public class ReferenceDataSource {
* @return IndexedFastaSequenceFile that was created from file
*/
public IndexedFastaSequenceFile getReference() {
return this.index;
return this.reference;
}
/**
 * Creates shards spanning the entire reference, walking each contig in fixed-size windows.
 * @param readsDataSource the reads datasource to embed in the locus shard.
 * @param parser used to generate/regenerate intervals.  TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
 * @param maxShardSize The maximum shard size, in bases; must be at least 1.
 * @return A schedule for performing a traversal over the entire reference.
 * @throws IllegalArgumentException if maxShardSize is not positive.
 */
public Iterable<Shard> createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
    // Fix: a non-positive maxShardSize previously made the loop below spin forever
    // (shardStart += maxShardSize never advances); fail fast instead.
    if(maxShardSize < 1)
        throw new IllegalArgumentException("maxShardSize must be at least 1, but was " + maxShardSize);
    List<Shard> shards = new ArrayList<Shard>();
    for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
        // Walk the contig in maxShardSize windows; the final window is clipped to the contig length.
        for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
            final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
            shards.add(new LocusShard(parser,
                    readsDataSource,
                    Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
                    null));
        }
    }
    return shards;
}
/**
 * Creates shards over the given interval list, splitting any interval longer than maxShardSize.
 * @param readsDataSource the reads datasource to embed in the locus shard.  TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
 * @param intervals the list of intervals to use when processing the reference.
 * @param maxShardSize The maximum shard size, in bases; must be at least 1.
 * @return A schedule for performing a traversal over the given intervals.
 * @throws IllegalArgumentException if maxShardSize is not positive.
 */
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
    // Fix: a non-positive maxShardSize previously made the splitting loop below spin
    // forever (the remaining interval never shrinks); fail fast instead.
    if(maxShardSize < 1)
        throw new IllegalArgumentException("maxShardSize must be at least 1, but was " + maxShardSize);
    List<Shard> shards = new ArrayList<Shard>();
    for(GenomeLoc interval: intervals) {
        // Carve maxShardSize-base pieces off the front until the remainder fits in one shard.
        while(interval.size() > maxShardSize) {
            shards.add(new LocusShard(intervals.getGenomeLocParser(),
                    readsDataSource,
                    Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
                    null));
            interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
        }
        shards.add(new LocusShard(intervals.getGenomeLocParser(),
                readsDataSource,
                Collections.singletonList(interval),
                null));
    }
    return shards;
}
}

View File

@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
}
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
// Fast fail for walkers not supporting TreeReducible interface.
if (!( walker instanceof TreeReducible ))
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");

View File

@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param walker Computation to perform over dataset.
* @param shardStrategy A strategy for sharding the data.
*/
public Object execute(Walker walker, ShardStrategy shardStrategy) {
public Object execute(Walker walker, Iterable<Shard> shardStrategy) {
walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker);

View File

@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @param reads the information associated with the reads
* @param reference the reference file
* @param rods the rods to include in the traversal
* @param nThreadsToUse Number of threads to utilize.
* @param threadAllocation Number of threads to utilize.
*
* @return The best-fit microscheduler.
*/
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
if (walker instanceof TreeReducible && nThreadsToUse > 1) {
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
if(walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
if(walker instanceof ReadWalker)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse));
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse);
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
} else {
if(nThreadsToUse > 1)
if(threadAllocation.getNumCPUThreads() > 1)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
}
@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
*
* @return the return type of the walker
*/
public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
public abstract Object execute(Walker walker, Iterable<Shard> shardStrategy);
/**
* Retrieves the object responsible for tracking and managing output.

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.resourcemanagement;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
 * Models how threads are distributed between various components of the GATK.
 */
public class ThreadAllocation {
    // Count of threads driving the traversal itself.
    private final int numCPUThreads;
    // Count of threads dedicated purely to IO; 0 means IO runs inline on the CPU thread(s).
    private final int numIOThreads;

    public int getNumCPUThreads() {
        return numCPUThreads;
    }

    public int getNumIOThreads() {
        return numIOThreads;
    }

    /**
     * Construct the default thread allocation: one CPU thread, no dedicated IO threads.
     */
    public ThreadAllocation() {
        this(1, null, null);
    }

    /**
     * Set up the thread allocation.  Default allocation is 1 CPU thread, 0 IO threads.
     * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
     *
     * @param totalThreads  Complete number of threads to allocate.
     * @param numCPUThreads Total number of threads allocated to the traversal, or null to derive it.
     * @param numIOThreads  Total number of threads allocated exclusively to IO, or null to derive it.
     */
    public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
        final int cpu;
        final int io;
        if (numCPUThreads == null && numIOThreads == null) {
            // No explicit split requested: every thread goes to the traversal.
            cpu = totalThreads;
            io = 0;
        }
        else if (numIOThreads == null) {
            // Only the CPU count was given; whatever remains becomes dedicated IO threads.
            if (numCPUThreads > totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
            cpu = numCPUThreads;
            io = totalThreads - numCPUThreads;
        }
        else if (numCPUThreads == null) {
            // Only the IO count was given; the remainder goes to the CPU, but never below 1 CPU thread.
            if (numIOThreads > totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
            cpu = Math.max(1, totalThreads - numIOThreads);
            io = numIOThreads;
        }
        else {
            // Both counts were given explicitly; they must account for the total exactly.
            if (numCPUThreads + numIOThreads != totalThreads)
                throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
            cpu = numCPUThreads;
            io = numIOThreads;
        }
        this.numCPUThreads = cpu;
        this.numIOThreads = io;
    }
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest {
SAMRecordIterator iterator = new SAMRecordIterator();
GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5);
Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap());
Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.<SAMReaderID>emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap());
WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs());
WindowMaker.WindowMakerIterator window = windowMaker.next();
LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null);

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -42,7 +43,7 @@ import java.util.Collections;
public class MockLocusShard extends LocusShard {
public MockLocusShard(final GenomeLocParser genomeLocParser,final List<GenomeLoc> intervals) {
super( genomeLocParser,
new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),
new SAMDataSource(Collections.<SAMReaderID>emptyList(),new ThreadAllocation(),null,genomeLocParser),
intervals,
null);
}

View File

@ -1,223 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import static org.testng.Assert.fail;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
*
* User: aaron
* Date: Apr 8, 2009
* Time: 8:14:23 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMBAMDataSourceUnitTest
* <p/>
* The test of the SAMBAM simple data source.
*/
// Integration-style test of SAMDataSource sharding over BAM files on a real
// reference; every test does filesystem IO against validationDataLocation.
public class SAMBAMDataSourceUnitTest extends BaseTest {
// BAM readers rebuilt before, and cleared after, every test method.
private List<SAMReaderID> readers;
// Reference sequence backing the genome-loc parser.
private IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
/**
* This function does the setup of our parser, before each method call.
* <p/>
* Called before every test case method.
*/
@BeforeMethod
public void doForEachTest() throws FileNotFoundException {
readers = new ArrayList<SAMReaderID>();
// sequence
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
}
/**
* Tears down the test fixture after each call.
* <p/>
* Called after every test case method.
*/
@AfterMethod
public void undoForEachTest() {
seq = null;
readers.clear();
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testLinearBreakIterateAll() {
logger.warn("Executing testLinearBreakIterateAll");
// setup the data
readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
// the sharding strat.
SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);
int count = 0;
try {
for (Shard sh : strat) {
int readCount = 0;
count++;
GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
logger.debug("count = " + count);
StingSAMIterator datum = data.seek(sh);
// for the first couple of shards make sure we can see the reads
if (count < 5) {
for (SAMRecord r : datum) {
}
// NOTE(review): readCount++ is OUTSIDE the read loop above, so it counts
// shards visited (max 1 per shard), not reads -- likely a bug; confirm intent.
readCount++;
}
datum.close();
// if we're over 100 shards, break out
if (count > 100) {
break;
}
}
}
catch (UserException.CouldNotReadInputFile e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
// NOTE(review): the message reads backwards -- the test fails *because* the
// exception occurred, so it presumably should say "should NOT get".
fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
}
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testMergingTwoBAMFiles() {
logger.warn("Executing testMergingTwoBAMFiles");
// setup the test files
readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
// the sharding strat.
SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);
// Per-shard read counts for the single-reader pass (first run) and the
// duplicated-reader pass (second run); the comparison at the bottom is commented out.
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
// count up the first hundred shards
int shardsToCount = 100;
int count = 0;
try {
for (Shard sh : strat) {
int readCount = 0;
count++;
if (count > shardsToCount) {
break;
}
StingSAMIterator datum = data.seek(sh);
for (SAMRecord r : datum) {
readCount++;
}
readcountPerShard.add(readCount);
logger.debug("read count = " + readCount);
datum.close();
}
}
catch (UserException.CouldNotReadInputFile e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
// NOTE(review): copy-pasted message -- names testLinearBreakIterateAll inside
// testMergingTwoBAMFiles, and reads backwards (see note in the other test).
fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
}
// setup the data and the counter before our second run
readers.clear();
// Same file registered twice: the merged source should see double the reads per shard.
readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
count = 0;
// the sharding strat.
data = new SAMDataSource(readers,genomeLocParser);
strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser);
logger.debug("Pile two:");
try {
for (Shard sh : strat) {
int readCount = 0;
count++;
// can we leave?
if (count > shardsToCount) {
break;
}
StingSAMIterator datum = data.seek(sh);
for (SAMRecord r : datum) {
readCount++;
}
readcountPerShard2.add(readCount);
logger.debug("read count = " + readCount);
datum.close();
}
}
catch (UserException.CouldNotReadInputFile e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
}
/*int pos = 0;
for (; pos < 100; pos++) {
if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos));
}
} */
}
}

View File

@ -0,0 +1,147 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import static org.testng.Assert.fail;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMDataSourceUnitTest
* <p/>
* The test of the SAMBAM simple data source.
*/
/**
 * Exercises SAMDataSource sharding over a single BAM: builds a locus-balanced
 * shard iterator, walks up to 100 shards, and drains the reads of the first few
 * shards to prove they are reachable.  Performs filesystem IO against
 * validationDataLocation on every run.
 */
public class SAMDataSourceUnitTest extends BaseTest {
    // BAM readers rebuilt before, and cleared after, every test method.
    private List<SAMReaderID> readers;
    // Reference sequence backing the genome-loc parser.
    private IndexedFastaSequenceFile seq;
    private GenomeLocParser genomeLocParser;

    /**
     * This function does the setup of our parser, before each method call.
     * <p/>
     * Called before every test case method.
     */
    @BeforeMethod
    public void doForEachTest() throws FileNotFoundException {
        readers = new ArrayList<SAMReaderID>();

        // sequence
        seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
        genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
    }

    /**
     * Tears down the test fixture after each call.
     * <p/>
     * Called after every test case method.
     */
    @AfterMethod
    public void undoForEachTest() {
        seq = null;
        readers.clear();
    }

    /** Test out that we can shard the file and iterate over every read */
    @Test
    public void testLinearBreakIterateAll() {
        logger.warn("Executing testLinearBreakIterateAll");

        // setup the data
        readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));

        // the sharding strat.
        SAMDataSource data = new SAMDataSource(readers,
                new ThreadAllocation(),
                null,
                genomeLocParser,
                false,
                SAMFileReader.ValidationStringency.SILENT,
                null,
                null,
                new ValidationExclusion(),
                new ArrayList<ReadFilter>(),
                false,
                false);
        Iterable<Shard> strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer());
        int count = 0;
        try {
            for (Shard sh : strat) {
                count++;
                GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
                logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
                logger.debug("count = " + count);
                StingSAMIterator datum = data.seek(sh);
                // for the first couple of shards make sure we can see the reads
                if (count < 5) {
                    // BUGFIX: readCount++ previously sat outside this loop, so reads
                    // were never actually counted -- increment per read instead.
                    int readCount = 0;
                    for (SAMRecord r : datum) {
                        readCount++;
                    }
                    logger.debug("read count = " + readCount);
                }
                datum.close();
                // if we're over 100 shards, break out
                if (count > 100) {
                    break;
                }
            }
        }
        catch (UserException.CouldNotReadInputFile e) {
            e.printStackTrace();
            // BUGFIX: the old message claimed we *should* get this exception, but the
            // test fails precisely because one occurred.
            fail("testLinearBreakIterateAll: we should NOT get a UserException.CouldNotReadInputFile exception");
        }
    }
}

View File

@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest {
private List<SAMReaderID> bamList;
private Walker countReadWalker;
private File output;
private long readSize = 100000;
private TraverseReads traversalEngine = null;
private IndexedFastaSequenceFile ref = null;
@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */
@Test
public void testUnmappedReadCount() {
SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser);
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
ref.getSequenceDictionary(),
readSize,
genomeLocParser);
SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser);
Iterable<Shard> shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
countReadWalker.initialize();
Object accumulator = countReadWalker.reduceInit();
while (shardStrategy.hasNext()) {
for(Shard shard: shardStrategy) {
traversalEngine.startTimersIfNecessary();
Shard shard = shardStrategy.next();
if (shard == null) {
fail("Shard == null");