Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-02-10 11:08:33 -05:00
commit 5e18020a5f
33 changed files with 965 additions and 2096 deletions

View File

@ -1,762 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools;
import net.sf.samtools.util.*;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Internal class for reading and querying BAM files.
*/
class BAMFileReader extends SAMFileReader.ReaderImplementation {
// True if reading from a File rather than an InputStream
private boolean mIsSeekable = false;
// For converting bytes into other primitive types
private BinaryCodec mStream = null;
// Underlying compressed data stream.
private final BAMInputStream mInputStream;
private SAMFileHeader mFileHeader = null;
// Populated if the file is seekable and an index exists
private File mIndexFile;
private BAMIndex mIndex = null;
private long mFirstRecordPointer = 0;
private CloseableIterator<SAMRecord> mCurrentIterator = null;
// If true, all SAMRecords are fully decoded as they are read.
private final boolean eagerDecode;
// For error-checking.
private ValidationStringency mValidationStringency;
// For creating BAMRecords
private SAMRecordFactory samRecordFactory;
/**
* Use the caching index reader implementation rather than the disk-hit-per-file model.
*/
private boolean mEnableIndexCaching = false;
/**
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
*/
private boolean mEnableIndexMemoryMapping = true;
/**
* Add information about the origin (reader and position) to SAM records.
*/
private SAMFileReader mFileReader = null;
/**
* Prepare to read BAM from a stream (not seekable)
* @param stream source of bytes.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalidate reads or header lines.
*/
BAMFileReader(final InputStream stream,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
mIndexFile = indexFile;
mIsSeekable = false;
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
readHeader(null);
}
/**
* Prepare to read BAM from a file (seekable)
* @param file source of bytes.
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
* @param validationStringency Controls how to handle invalidate reads or header lines.
*/
BAMFileReader(final File file,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
" is older than BAM " + file.getAbsolutePath());
}
}
BAMFileReader(final SeekableStream strm,
final File indexFile,
final boolean eagerDecode,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
indexFile,
eagerDecode,
strm.getSource(),
validationStringency,
factory);
}
private BAMFileReader(final BAMInputStream inputStream,
final File indexFile,
final boolean eagerDecode,
final String source,
final ValidationStringency validationStringency,
final SAMRecordFactory factory)
throws IOException {
mIndexFile = indexFile;
mIsSeekable = true;
mInputStream = inputStream;
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = factory;
readHeader(source);
mFirstRecordPointer = inputStream.getFilePointer();
}
/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
*/
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
this.mFileReader = enabled ? reader : null;
}
/**
* If true, uses the caching version of the index reader.
* @param enabled true to write source information into each SAMRecord.
*/
public void enableIndexCaching(final boolean enabled) {
if(mIndex != null)
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
this.mEnableIndexCaching = enabled;
}
/**
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
* @param enabled True to use memory mapping, false to use regular I/O.
*/
public void enableIndexMemoryMapping(final boolean enabled) {
if (mIndex != null) {
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
}
this.mEnableIndexMemoryMapping = enabled;
}
@Override void enableCrcChecking(final boolean enabled) {
this.mInputStream.setCheckCrcs(enabled);
}
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
/**
* @return true if ths is a BAM file, and has an index
*/
public boolean hasIndex() {
return (mIndexFile != null);
}
/**
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
* @return An index of the given type.
*/
public BAMIndex getIndex() {
if(mIndexFile == null)
throw new SAMException("No index is available for this BAM file.");
if(mIndex == null)
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
return mIndex;
}
void close() {
if (mStream != null) {
mStream.close();
}
if (mIndex != null) {
mIndex.close();
}
mStream = null;
mFileHeader = null;
mIndex = null;
}
SAMFileHeader getFileHeader() {
return mFileHeader;
}
/**
* Set error-checking level for subsequent SAMRecord reads.
*/
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
this.mValidationStringency = validationStringency;
}
SAMFileReader.ValidationStringency getValidationStringency() {
return this.mValidationStringency;
}
/**
* Prepare to iterate through the SAMRecords in file order.
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
* that iterator must be closed before getIterator() can be called again.
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
* getIterator() begins its iteration where the last one left off. That is the best that can be
* done in that situation.
*/
CloseableIterator<SAMRecord> getIterator() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (mIsSeekable) {
try {
mInputStream.seek(mFirstRecordPointer);
} catch (IOException exc) {
throw new RuntimeException(exc.getMessage(), exc);
}
}
mCurrentIterator = new BAMFileIterator();
return mCurrentIterator;
}
@Override
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!(chunks instanceof BAMFileSpan)) {
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
}
// Create an iterator over the given chunk boundaries.
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
return mCurrentIterator;
}
/**
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
* the rightmost bound is guaranteed to be after the last read in the file.
* @return An unbounded pointer to the first record in the BAM file.
*/
@Override
SAMFileSpan getFilePointerSpanningReads() {
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
}
/**
* Prepare to iterate through the SAMRecords that match the given interval.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
* A value of zero implies the start of the reference sequence.
* @param end A value of zero implies the end of the reference sequence.
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
* specified by start and end. If false, the SAMRecords need only overlap the interval.
* @return Iterator for the matching SAMRecords
*/
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
return mCurrentIterator;
}
/**
* Prepare to iterate through the SAMRecords with the given alignment start.
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
* before calling any of the methods that return an iterator.
*
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
* matches the specified interval.
*
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
*
* @param sequence Reference sequence sought.
* @param start Alignment start sought.
* @return Iterator for the matching SAMRecords.
*/
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
return mCurrentIterator;
}
public CloseableIterator<SAMRecord> queryUnmapped() {
if (mStream == null) {
throw new IllegalStateException("File reader is closed");
}
if (mCurrentIterator != null) {
throw new IllegalStateException("Iteration in progress");
}
if (!mIsSeekable) {
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
}
try {
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
if (startOfLastLinearBin != -1) {
mInputStream.seek(startOfLastLinearBin);
} else {
// No mapped reads in file, just start at the first read in file.
mInputStream.seek(mFirstRecordPointer);
}
mCurrentIterator = new BAMFileIndexUnmappedIterator();
return mCurrentIterator;
} catch (IOException e) {
throw new RuntimeException("IOException seeking to unmapped reads", e);
}
}
/**
* Reads the header from the file or stream
* @param source Note that this is used only for reporting errors.
*/
private void readHeader(final String source)
throws IOException {
final byte[] buffer = new byte[4];
mStream.readBytes(buffer);
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
throw new IOException("Invalid BAM file header");
}
final int headerTextLength = mStream.readInt();
final String textHeader = mStream.readString(headerTextLength);
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
headerCodec.setValidationStringency(mValidationStringency);
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
source);
final int sequenceCount = mStream.readInt();
if (mFileHeader.getSequenceDictionary().size() > 0) {
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
throw new SAMFormatException("Number of sequences in text header (" +
mFileHeader.getSequenceDictionary().size() +
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
}
for (int i = 0; i < sequenceCount; i++) {
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
source);
}
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
source);
}
}
} else {
// If only binary sequences are present, copy them into mFileHeader
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
for (int i = 0; i < sequenceCount; i++) {
sequences.add(readSequenceRecord(source));
}
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
}
}
/**
* Reads a single binary sequence record from the file or stream
* @param source Note that this is used only for reporting errors.
*/
private SAMSequenceRecord readSequenceRecord(final String source) {
final int nameLength = mStream.readInt();
if (nameLength <= 1) {
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
}
final String sequenceName = mStream.readString(nameLength - 1);
// Skip the null terminator
mStream.readByte();
final int sequenceLength = mStream.readInt();
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
}
/**
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
* Starting point of iteration is wherever current file position is when the iterator is constructed.
*/
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
private SAMRecord mNextRecord = null;
private final BAMRecordCodec bamRecordCodec;
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
BAMFileIterator() {
this(true);
}
/**
* @param advance Trick to enable subclass to do more setup before advancing
*/
BAMFileIterator(final boolean advance) {
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
if (advance) {
advance();
}
}
public void close() {
if (mCurrentIterator != null && this != mCurrentIterator) {
throw new IllegalStateException("Attempt to close non-current iterator");
}
mCurrentIterator = null;
}
public boolean hasNext() {
return (mNextRecord != null);
}
public SAMRecord next() {
final SAMRecord result = mNextRecord;
advance();
return result;
}
public void remove() {
throw new UnsupportedOperationException("Not supported: remove");
}
void advance() {
try {
mNextRecord = getNextRecord();
if (mNextRecord != null) {
++this.samRecordIndex;
// Because some decoding is done lazily, the record needs to remember the validation stringency.
mNextRecord.setValidationStringency(mValidationStringency);
if (mValidationStringency != ValidationStringency.SILENT) {
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
SAMUtils.processValidationErrors(validationErrors,
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
}
}
if (eagerDecode && mNextRecord != null) {
mNextRecord.eagerDecode();
}
} catch (IOException exc) {
throw new RuntimeException(exc.getMessage(), exc);
}
}
/**
* Read the next record from the input stream.
*/
SAMRecord getNextRecord() throws IOException {
final long startCoordinate = mInputStream.getFilePointer();
final SAMRecord next = bamRecordCodec.decode();
final long stopCoordinate = mInputStream.getFilePointer();
if(mFileReader != null && next != null)
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
return next;
}
/**
* @return The record that will be return by the next call to next()
*/
protected SAMRecord peek() {
return mNextRecord;
}
}
/**
* Prepare to iterate through SAMRecords matching the target interval.
* @param sequence Desired reference sequence.
* @param start 1-based start of target interval, inclusive.
* @param end 1-based end of target interval, inclusive.
* @param queryType contained, overlapping, or starting-at query.
*/
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
final int start,
final int end,
final QueryType queryType) {
long[] filePointers = null;
// Hit the index to determine the chunk boundaries for the required data.
final SAMFileHeader fileHeader = getFileHeader();
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMIndex fileIndex = getIndex();
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
}
// Create an iterator over the above chunk boundaries.
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
// Add some preprocessing filters for edge-case reads that don't fit into this
// query type.
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
}
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
/**
* Look for BAM index file according to standard naming convention.
*
* @param dataFile BAM file name.
* @return Index file name, or null if not found.
*/
private static File findIndexFile(final File dataFile) {
// If input is foo.bam, look for foo.bai
final String bamExtension = ".bam";
File indexFile;
final String fileName = dataFile.getName();
if (fileName.endsWith(bamExtension)) {
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
indexFile = new File(dataFile.getParent(), bai);
if (indexFile.exists()) {
return indexFile;
}
}
// If foo.bai doesn't exist look for foo.bam.bai
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
if (indexFile.exists()) {
return indexFile;
} else {
return null;
}
}
private class BAMFileIndexIterator extends BAMFileIterator {
private long[] mFilePointers = null;
private int mFilePointerIndex = 0;
private long mFilePointerLimit = -1;
/**
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
* @param filePointers the block / offset combination, stored in chunk format.
*/
BAMFileIndexIterator(final long[] filePointers) {
super(false); // delay advance() until after construction
mFilePointers = filePointers;
advance();
}
SAMRecord getNextRecord()
throws IOException {
// Advance to next file block if necessary
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
if (mFilePointers == null ||
mFilePointerIndex >= mFilePointers.length) {
return null;
}
final long startOffset = mFilePointers[mFilePointerIndex++];
final long endOffset = mFilePointers[mFilePointerIndex++];
mInputStream.seek(startOffset);
mFilePointerLimit = endOffset;
}
// Pull next record from stream
return super.getNextRecord();
}
}
/**
* A decorating iterator that filters out records that are outside the bounds of the
* given query parameters.
*/
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private final CloseableIterator<SAMRecord> wrappedIterator;
/**
* The next record to be returned. Will be null if no such record exists.
*/
private SAMRecord mNextRecord;
private final int mReferenceIndex;
private final int mRegionStart;
private final int mRegionEnd;
private final QueryType mQueryType;
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
this.wrappedIterator = iterator;
final SAMFileHeader fileHeader = getFileHeader();
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
mRegionStart = start;
if (queryType == QueryType.STARTING_AT) {
mRegionEnd = mRegionStart;
} else {
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
}
mQueryType = queryType;
mNextRecord = advance();
}
/**
* Returns true if a next element exists; false otherwise.
*/
public boolean hasNext() {
return mNextRecord != null;
}
/**
* Gets the next record from the given iterator.
* @return The next SAM record in the iterator.
*/
public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
final SAMRecord currentRead = mNextRecord;
mNextRecord = advance();
return currentRead;
}
/**
* Closes down the existing iterator.
*/
public void close() {
if (this != mCurrentIterator) {
throw new IllegalStateException("Attempt to close non-current iterator");
}
mCurrentIterator = null;
}
/**
* @throws UnsupportedOperationException always.
*/
public void remove() {
throw new UnsupportedOperationException("Not supported: remove");
}
SAMRecord advance() {
while (true) {
// Pull next record from stream
if(!wrappedIterator.hasNext())
return null;
final SAMRecord record = wrappedIterator.next();
// If beyond the end of this reference sequence, end iteration
final int referenceIndex = record.getReferenceIndex();
if (referenceIndex != mReferenceIndex) {
if (referenceIndex < 0 ||
referenceIndex > mReferenceIndex) {
return null;
}
// If before this reference sequence, continue
continue;
}
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
// Quick exit to avoid expensive alignment end calculation
return record;
}
final int alignmentStart = record.getAlignmentStart();
// If read is unmapped but has a coordinate, return it if the coordinate is within
// the query region, regardless of whether the mapped mate will be returned.
final int alignmentEnd;
if (mQueryType == QueryType.STARTING_AT) {
alignmentEnd = -1;
} else {
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
record.getAlignmentEnd(): alignmentStart);
}
if (alignmentStart > mRegionEnd) {
// If scanned beyond target region, end iteration
return null;
}
// Filter for overlap with region
if (mQueryType == QueryType.CONTAINED) {
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
return record;
}
} else if (mQueryType == QueryType.OVERLAPPING) {
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
return record;
}
} else {
if (alignmentStart == mRegionStart) {
return record;
}
}
}
}
}
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
private BAMFileIndexUnmappedIterator() {
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
advance();
}
}
}
}

View File

@ -40,6 +40,10 @@ public class GATKChunk extends Chunk {
super(start,stop); super(start,stop);
} }
public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) {
super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd);
}
public GATKChunk(final Chunk chunk) { public GATKChunk(final Chunk chunk) {
super(chunk.getChunkStart(),chunk.getChunkEnd()); super(chunk.getChunkStart(),chunk.getChunkEnd());
} }

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools;
/**
* Utils that insist on being in the same package as Picard.
*/
public class PicardNamespaceUtils {
/**
* Private constructor only. Do not instantiate.
*/
private PicardNamespaceUtils() {}
public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) {
read.setFileSource(fileSource);
}
}

View File

@ -1,72 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.IOException;
/**
* An input stream formulated for use reading BAM files. Supports
*/
public interface BAMInputStream {
/**
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
* not an actual byte offset.
*
* @param pos virtual file pointer
*/
public void seek(final long pos) throws IOException;
/**
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
* the two.
*/
public long getFilePointer();
/**
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
* and check it against the value stored in the GZIP header. CRC checking is an expensive
* operation and should be used accordingly.
*/
public void setCheckCrcs(final boolean check);
public int read() throws java.io.IOException;
public int read(byte[] bytes) throws java.io.IOException;
public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
public long skip(long l) throws java.io.IOException;
public int available() throws java.io.IOException;
public void close() throws java.io.IOException;
public void mark(int i);
public void reset() throws java.io.IOException;
public boolean markSupported();
}

View File

@ -1,483 +0,0 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.samtools.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import net.sf.samtools.FileTruncatedException;
/*
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
*
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
*/
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
private InputStream mStream = null;
private SeekableStream mFile = null;
private byte[] mFileBuffer = null;
private byte[] mCurrentBlock = null;
private int mCurrentOffset = 0;
private long mBlockAddress = 0;
private int mLastBlockLength = 0;
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
/**
* Note that seek() is not supported if this ctor is used.
*/
public BlockCompressedInputStream(final InputStream stream) {
mStream = IOUtil.toBufferedStream(stream);
mFile = null;
}
/**
* Use this ctor if you wish to call seek()
*/
public BlockCompressedInputStream(final File file)
throws IOException {
mFile = new SeekableFileStream(file);
mStream = null;
}
public BlockCompressedInputStream(final URL url) {
mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
mStream = null;
}
/**
* For providing some arbitrary data source. No additional buffering is
* provided, so if the underlying source is not buffered, wrap it in a
* SeekableBufferedStream before passing to this ctor.
*/
public BlockCompressedInputStream(final SeekableStream strm) {
mFile = strm;
mStream = null;
}
/**
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
* and check it against the value stored in the GZIP header. CRC checking is an expensive
* operation and should be used accordingly.
*/
public void setCheckCrcs(final boolean check) {
this.blockGunzipper.setCheckCrcs(check);
}
/**
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
* may block in order to fill an internal buffer if it has been exhausted.
*/
public int available()
throws IOException {
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
readBlock();
}
if (mCurrentBlock == null) {
return 0;
}
return mCurrentBlock.length - mCurrentOffset;
}
/**
* Closes the underlying InputStream or RandomAccessFile
*/
public void close()
throws IOException {
if (mFile != null) {
mFile.close();
mFile = null;
} else if (mStream != null) {
mStream.close();
mStream = null;
}
// Encourage garbage collection
mFileBuffer = null;
mCurrentBlock = null;
}
/**
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
* @return the next byte of data, or -1 if the end of the stream is reached.
*/
public int read()
throws IOException {
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
}
/**
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
* or an exception is thrown.
*
* read(buf) has the same effect as read(buf, 0, buf.length).
*
* @param buffer the buffer into which the data is read.
* @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
* the stream has been reached.
*/
public int read(final byte[] buffer)
throws IOException {
return read(buffer, 0, buffer.length);
}
private volatile ByteArrayOutputStream buf = null;
private static final byte eol = '\n';
private static final byte eolCr = '\r';
/**
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
*
* @return A String containing the contents of the line, excluding the line terminating
* character, or null if the end of the stream has been reached
*
* @exception IOException If an I/O error occurs
* @
*/
public String readLine() throws IOException {
int available = available();
if (available == 0) {
return null;
}
if(null == buf){ // lazy initialisation
buf = new ByteArrayOutputStream(8192);
}
buf.reset();
boolean done = false;
boolean foundCr = false; // \r found flag
while (!done) {
int linetmpPos = mCurrentOffset;
int bCnt = 0;
while((available-- > 0)){
final byte c = mCurrentBlock[linetmpPos++];
if(c == eol){ // found \n
done = true;
break;
} else if(foundCr){ // previous char was \r
--linetmpPos; // current char is not \n so put it back
done = true;
break;
} else if(c == eolCr){ // found \r
foundCr = true;
continue; // no ++bCnt
}
++bCnt;
}
if(mCurrentOffset < linetmpPos){
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
mCurrentOffset = linetmpPos;
}
available = available();
if(available == 0){
// EOF
done = true;
}
}
return buf.toString();
}
/**
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
*
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
*
* @param buffer buffer into which data is read.
* @param offset the start offset in array b at which the data is written.
* @param length the maximum number of bytes to read.
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
* the stream has been reached.
*/
public int read(final byte[] buffer, int offset, int length)
throws IOException {
final int originalLength = length;
while (length > 0) {
final int available = available();
if (available == 0) {
// Signal EOF to caller
if (originalLength == length) {
return -1;
}
break;
}
final int copyLength = Math.min(length, available);
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
mCurrentOffset += copyLength;
offset += copyLength;
length -= copyLength;
}
return originalLength - length;
}
/**
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
* not an actual byte offset.
*
* @param pos virtual file pointer
*/
public void seek(final long pos)
throws IOException {
if (mFile == null) {
throw new IOException("Cannot seek on stream based file");
}
// Decode virtual file pointer
// Upper 48 bits is the byte offset into the compressed stream of a block.
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
final int available;
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
available = mCurrentBlock.length;
} else {
mFile.seek(compressedOffset);
mBlockAddress = compressedOffset;
mLastBlockLength = 0;
readBlock();
available = available();
}
if (uncompressedOffset > available ||
(uncompressedOffset == available && !eof())) {
throw new IOException("Invalid file pointer: " + pos);
}
mCurrentOffset = uncompressedOffset;
}
private boolean eof() throws IOException {
if (mFile.eof()) {
return true;
}
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
}
/**
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
* the two.
*/
public long getFilePointer() {
if (mCurrentOffset == mCurrentBlock.length) {
// If current offset is at the end of the current block, file pointer should point
// to the beginning of the next block.
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
}
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
}
public static long getFileBlock(final long bgzfOffset) {
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
}
/**
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
* @return true if the given file looks like a valid BGZF file.
*/
public static boolean isValidFile(final InputStream stream)
throws IOException {
if (!stream.markSupported()) {
throw new RuntimeException("Cannot test non-buffered stream");
}
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
stream.reset();
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
}
private static boolean isValidBlockHeader(final byte[] buffer) {
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
}
private void readBlock()
throws IOException {
if (mFileBuffer == null) {
mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
}
int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
if (count == 0) {
// Handle case where there is no empty gzip block at end.
mCurrentOffset = 0;
mBlockAddress += mLastBlockLength;
mCurrentBlock = new byte[0];
return;
}
if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
throw new IOException("Premature end of file");
}
final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
throw new IOException("Unexpected compressed block length: " + blockLength);
}
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
if (count != remaining) {
throw new FileTruncatedException("Premature end of file");
}
inflateBlock(mFileBuffer, blockLength);
mCurrentOffset = 0;
mBlockAddress += mLastBlockLength;
mLastBlockLength = blockLength;
}
private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
throws IOException {
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
byte[] buffer = mCurrentBlock;
mCurrentBlock = null;
if (buffer == null || buffer.length != uncompressedLength) {
try {
buffer = new byte[uncompressedLength];
} catch (NegativeArraySizeException e) {
throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
}
}
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
mCurrentBlock = buffer;
}
private int readBytes(final byte[] buffer, final int offset, final int length)
throws IOException {
if (mFile != null) {
return readBytes(mFile, buffer, offset, length);
} else if (mStream != null) {
return readBytes(mStream, buffer, offset, length);
} else {
return 0;
}
}
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
throws IOException {
int bytesRead = 0;
while (bytesRead < length) {
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
if (count <= 0) {
break;
}
bytesRead += count;
}
return bytesRead;
}
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
throws IOException {
int bytesRead = 0;
while (bytesRead < length) {
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
if (count <= 0) {
break;
}
bytesRead += count;
}
return bytesRead;
}
private int unpackInt16(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8));
}
private int unpackInt32(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8) |
((buffer[offset+2] & 0xFF) << 16) |
((buffer[offset+3] & 0xFF) << 24));
}
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
public static FileTermination checkTermination(final File file)
throws IOException {
final long fileSize = file.length();
if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
return FileTermination.DEFECTIVE;
}
final RandomAccessFile raFile = new RandomAccessFile(file, "r");
try {
raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
raFile.readFully(buf);
if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
return FileTermination.HAS_TERMINATOR_BLOCK;
}
final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
buf = new byte[bufsize];
raFile.seek(fileSize - bufsize);
raFile.read(buf);
for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
i >= 0; --i) {
if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
continue;
}
final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
if (buf.length - i == totalBlockSizeMinusOne + 1) {
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
} else {
return FileTermination.DEFECTIVE;
}
}
return FileTermination.DEFECTIVE;
} finally {
raFile.close();
}
}
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
for (int i = 0; i < length; ++i) {
if (preamble[i] != buf[i + startOffset]) {
return false;
}
}
return true;
}
}

View File

@ -25,9 +25,14 @@ import java.util.NoSuchElementException;
*/ */
/** /**
* A queue of locus context entries. * The two goals of the LocusView are as follows:
* 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch
* between iterating over all bases in a region, only covered bases in a region covered by
* reads, only bases in a region covered by RODs, or any other sort of trigger track
* implementation one can think of.
* 2) To manage the copious number of iterators that have to be jointly pulled through the
* genome to make a locus traversal function.
*/ */
public abstract class LocusView extends LocusIterator implements View { public abstract class LocusView extends LocusIterator implements View {
/** /**
* The locus bounding this view. * The locus bounding this view.

View File

@ -27,8 +27,10 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk; import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.LinkedList;
import java.util.List; import java.util.List;
/** /**
@ -38,7 +40,7 @@ import java.util.List;
* Time: 10:47 PM * Time: 10:47 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
class SAMReaderPosition { class BAMAccessPlan {
private final SAMReaderID reader; private final SAMReaderID reader;
private final BlockInputStream inputStream; private final BlockInputStream inputStream;
@ -51,7 +53,7 @@ class SAMReaderPosition {
private long nextBlockAddress; private long nextBlockAddress;
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
this.reader = reader; this.reader = reader;
this.inputStream = inputStream; this.inputStream = inputStream;
@ -84,11 +86,45 @@ class SAMReaderPosition {
} }
/** /**
* Retrieves the last offset of interest in the block returned by getBlockAddress(). * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer.
* @return First block of interest in this segment. * @param blockAddress Block address for which to search.
* @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span.
* @return list of chunks containing that block.
*/ */
public int getLastOffsetInBlock() { public List<GATKChunk> getSpansOverlappingBlock(long blockAddress, long filePosition) {
return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536; List<GATKChunk> spansOverlapping = new LinkedList<GATKChunk>();
// While the position iterator overlaps the given block, pull out spans to report.
while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) {
// Create a span over as much of the block as is covered by this chunk.
int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0;
// Calculate the end of this span. If the span extends past this block, cap it using the current file position.
long blockEnd;
int blockOffsetEnd;
if(blockAddress < positionIterator.peek().getBlockEnd()) {
blockEnd = filePosition;
blockOffsetEnd = 0;
}
else {
blockEnd = positionIterator.peek().getBlockEnd();
blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd();
}
GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd);
if(newChunk.getChunkStart() <= newChunk.getChunkEnd())
spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd));
// If the value currently stored in the position iterator ends past the current block, we must be done. Abort.
if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress)
break;
// If the position iterator ends before the block ends, pull the position iterator forward.
if(positionIterator.peek().getBlockEnd() <= blockAddress)
positionIterator.next();
}
return spansOverlapping;
} }
public void reset() { public void reset() {
@ -111,20 +147,16 @@ class SAMReaderPosition {
* @param filePosition The current position within the file. * @param filePosition The current position within the file.
*/ */
void advancePosition(final long filePosition) { void advancePosition(final long filePosition) {
nextBlockAddress = filePosition >> 16; nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition);
// Check the current file position against the iterator; if the iterator is before the current file position, // Check the current file position against the iterator; if the iterator is before the current file position,
// draw the iterator forward. Remember when performing the check that coordinates are half-open! // draw the iterator forward. Remember when performing the check that coordinates are half-open!
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek()))
positionIterator.next(); positionIterator.next();
// If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) { if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart())
nextBlockAddress = positionIterator.peek().getBlockStart(); nextBlockAddress = positionIterator.peek().getBlockStart();
//System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress);
break;
}
}
// If we've shot off the end of the block pointer, notify consumers that iteration is complete. // If we've shot off the end of the block pointer, notify consumers that iteration is complete.
if(!positionIterator.hasNext()) if(!positionIterator.hasNext())

View File

@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher {
private final ExecutorService threadPool; private final ExecutorService threadPool;
private final Queue<SAMReaderPosition> inputQueue; private final Queue<BAMAccessPlan> inputQueue;
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
threadPool = Executors.newFixedThreadPool(numThreads); threadPool = Executors.newFixedThreadPool(numThreads);
fileHandleCache = new FileHandleCache(numFileHandles); fileHandleCache = new FileHandleCache(numFileHandles);
inputQueue = new LinkedList<SAMReaderPosition>(); inputQueue = new LinkedList<BAMAccessPlan>();
threadPool.execute(new BlockLoader(this,fileHandleCache,true)); threadPool.execute(new BlockLoader(this,fileHandleCache,true));
} }
@ -58,7 +58,7 @@ public class BGZFBlockLoadingDispatcher {
* Initiates a request for a new block load. * Initiates a request for a new block load.
* @param readerPosition Position at which to load. * @param readerPosition Position at which to load.
*/ */
void queueBlockLoad(final SAMReaderPosition readerPosition) { void queueBlockLoad(final BAMAccessPlan readerPosition) {
synchronized(inputQueue) { synchronized(inputQueue) {
inputQueue.add(readerPosition); inputQueue.add(readerPosition);
inputQueue.notify(); inputQueue.notify();
@ -69,7 +69,7 @@ public class BGZFBlockLoadingDispatcher {
* Claims the next work request from the queue. * Claims the next work request from the queue.
* @return The next work request, or null if none is available. * @return The next work request, or null if none is available.
*/ */
SAMReaderPosition claimNextWorkRequest() { BAMAccessPlan claimNextWorkRequest() {
synchronized(inputQueue) { synchronized(inputQueue) {
while(inputQueue.isEmpty()) { while(inputQueue.isEmpty()) {
try { try {

View File

@ -26,24 +26,21 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk; import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.BAMInputStream;
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import net.sf.samtools.util.BlockCompressedInputStream; import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.RuntimeEOFException;
import net.sf.samtools.util.SeekableStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.ByteOrder; import java.nio.ByteOrder;
import java.util.Arrays; import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List;
/** /**
* Presents decompressed blocks to the SAMFileReader. * Presents decompressed blocks to the SAMFileReader.
*/ */
public class BlockInputStream extends SeekableStream implements BAMInputStream { public class BlockInputStream extends InputStream {
/** /**
* Mechanism for triggering block loads. * Mechanism for triggering block loads.
*/ */
@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
private Throwable error; private Throwable error;
/** /**
* Current position. * Current accessPlan.
*/ */
private SAMReaderPosition position; private BAMAccessPlan accessPlan;
/** /**
* A stream of compressed data blocks. * A stream of compressed data blocks.
@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
*/ */
private final BlockCompressedInputStream validatingInputStream; private final BlockCompressedInputStream validatingInputStream;
/**
* Has the buffer been filled since last request?
*/
private boolean bufferFilled = false;
/** /**
* Create a new block presenting input stream with a dedicated buffer. * Create a new block presenting input stream with a dedicated buffer.
* @param dispatcher the block loading messenger. * @param dispatcher the block loading messenger.
@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
this.dispatcher = dispatcher; this.dispatcher = dispatcher;
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
// The block offsets / block positions guarantee that the ending offset/position in the data structure maps to // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to
// the point in the file just following the last read. These two arrays should never be empty; initializing // the point in the file just following the last read. These two arrays should never be empty; initializing
@ -151,7 +143,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
synchronized(lock) { synchronized(lock) {
// Find the current block within the input stream. // Find the current block within the input stream.
int blockIndex; int blockIndex;
for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++) for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++)
; ;
filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex));
} }
@ -164,51 +156,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
return filePointer; return filePointer;
} }
public void seek(long target) {
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
synchronized(lock) {
clearBuffers();
// Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified.
position.advancePosition(target);
// If the position advances past the end of the target, that must mean that we seeked to a point at the end
// of one of the chunk list's subregions. Make a note of our current position and punt on loading any data.
if(target < position.getBlockAddress() << 16) {
blockOffsets.clear();
blockOffsets.add(0);
blockPositions.clear();
blockPositions.add(target);
}
else {
waitForBufferFill();
// A buffer fill will load the relevant data from the shard, but the buffer position still needs to be
// advanced as appropriate.
Iterator<Integer> blockOffsetIterator = blockOffsets.descendingIterator();
Iterator<Long> blockPositionIterator = blockPositions.descendingIterator();
while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) {
final int blockOffset = blockOffsetIterator.next();
final long blockPosition = blockPositionIterator.next();
if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) {
buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF));
break;
}
}
}
if(validatingInputStream != null) {
try {
validatingInputStream.seek(target);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
}
}
}
}
private void clearBuffers() { private void clearBuffers() {
this.position.reset(); this.accessPlan.reset();
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading. // Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
// Indicate no data to be read. // Indicate no data to be read.
@ -225,28 +174,40 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
public boolean eof() { public boolean eof() {
synchronized(lock) { synchronized(lock) {
// TODO: Handle multiple empty BGZF blocks at end of the file. // TODO: Handle multiple empty BGZF blocks at end of the file.
return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length); return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length);
} }
} }
public void setCheckCrcs(final boolean check) {
// TODO: Implement
}
/** /**
* Submits a new access plan for the given dataset. * Submits a new access plan for the given dataset and seeks to the given point.
* @param position The next seek point for BAM data in this reader. * @param accessPlan The next seek point for BAM data in this reader.
*/ */
public void submitAccessPlan(final SAMReaderPosition position) { public void submitAccessPlan(final BAMAccessPlan accessPlan) {
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
synchronized(lock) { this.accessPlan = accessPlan;
// Assume that the access plan is going to tell us to start where we are and move forward. accessPlan.reset();
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) clearBuffers();
position.advancePosition(this.position.getBlockAddress() << 16);
// Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc).
// TODO: Don't pass these empty chunks in.
accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0));
if(accessPlan.getBlockAddress() >= 0) {
waitForBufferFill();
} }
this.position = position;
if(validatingInputStream != null) {
try {
validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0));
} }
catch(IOException ex) {
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
}
}
}
private void compactBuffer() { private void compactBuffer() {
// Compact buffer to maximize storage space. // Compact buffer to maximize storage space.
@ -286,27 +247,14 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
* Push contents of incomingBuffer into the end of this buffer. * Push contents of incomingBuffer into the end of this buffer.
* MUST be called from a thread that is NOT the reader thread. * MUST be called from a thread that is NOT the reader thread.
* @param incomingBuffer The data being pushed into this input stream. * @param incomingBuffer The data being pushed into this input stream.
* @param position target position for the data. * @param accessPlan target access plan for the data.
* @param filePosition the current position of the file pointer * @param filePosition the current position of the file pointer
*/ */
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) {
synchronized(lock) { synchronized(lock) {
try { try {
compactBuffer();
// Open up the buffer for more reading.
buffer.limit(buffer.capacity());
// Advance the position to take the most recent read into account.
final long lastBlockAddress = position.getBlockAddress();
final int blockOffsetStart = position.getFirstOffsetInBlock();
final int blockOffsetEnd = position.getLastOffsetInBlock();
// Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block.
final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16;
byte[] validBytes = null;
if(validatingInputStream != null) { if(validatingInputStream != null) {
validBytes = new byte[incomingBuffer.remaining()]; byte[] validBytes = new byte[incomingBuffer.remaining()];
byte[] currentBytes = new byte[incomingBuffer.remaining()]; byte[] currentBytes = new byte[incomingBuffer.remaining()];
int pos = incomingBuffer.position(); int pos = incomingBuffer.position();
@ -317,7 +265,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
incomingBuffer.position(pos); incomingBuffer.position(pos);
long currentFilePointer = validatingInputStream.getFilePointer(); long currentFilePointer = validatingInputStream.getFilePointer();
validatingInputStream.seek(lastBlockAddress << 16); validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0));
validatingInputStream.read(validBytes); validatingInputStream.read(validBytes);
validatingInputStream.seek(currentFilePointer); validatingInputStream.seek(currentFilePointer);
@ -325,33 +273,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
} }
this.position = position; compactBuffer();
position.advancePosition(filePosition << 16); // Open up the buffer for more reading.
buffer.limit(buffer.capacity());
if(buffer.remaining() < incomingBuffer.remaining()) { // Get the spans overlapping this particular block...
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); List<GATKChunk> spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition);
// ...and advance the block
this.accessPlan = accessPlan;
accessPlan.advancePosition(makeFilePointer(filePosition, 0));
if(buffer.remaining() < incomingBuffer.remaining())
lock.wait(); lock.wait();
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
}
// Remove the last position in the list and add in the last read position, in case the two are different. final int bytesInIncomingBuffer = incomingBuffer.limit();
for(GATKChunk spanOverlapping: spansOverlapping) {
// Clear out the endcap tracking state and add in the starting position for this transfer.
blockOffsets.removeLast(); blockOffsets.removeLast();
blockOffsets.add(buffer.position()); blockOffsets.add(buffer.position());
blockPositions.removeLast(); blockPositions.removeLast();
blockPositions.add(lastBlockAddress << 16 | blockOffsetStart); blockPositions.add(spanOverlapping.getChunkStart());
// Stream the buffer into the data stream. // Stream the buffer into the data stream.
incomingBuffer.position(blockOffsetStart); incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd());
incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd)); incomingBuffer.position(spanOverlapping.getBlockOffsetStart());
buffer.put(incomingBuffer); buffer.put(incomingBuffer);
// Then, add the last position read to the very end of the list, just past the end of the last buffer. // Add the endcap for this transfer.
blockOffsets.add(buffer.position()); blockOffsets.add(buffer.position());
blockPositions.add(endOfRead); blockPositions.add(spanOverlapping.getChunkEnd());
}
// Set up the buffer for reading. // Set up the buffer for reading.
buffer.flip(); buffer.flip();
bufferFilled = true;
lock.notify(); lock.notify();
} }
@ -447,12 +403,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
if(remaining < length) if(remaining < length)
return length - remaining; return length - remaining;
// Otherwise, if at eof(), return -1. // Otherwise, return -1.
else if(eof())
return -1; return -1;
// Otherwise, we must've hit a bug in the system.
throw new ReviewedStingException("BUG: read returned no data, but eof() reports false.");
} }
public void close() { public void close() {
@ -472,20 +424,26 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
private void waitForBufferFill() { private void waitForBufferFill() {
synchronized(lock) { synchronized(lock) {
bufferFilled = false;
if(buffer.remaining() == 0 && !eof()) { if(buffer.remaining() == 0 && !eof()) {
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
dispatcher.queueBlockLoad(position); dispatcher.queueBlockLoad(accessPlan);
try { try {
lock.wait(); lock.wait();
} }
catch(InterruptedException ex) { catch(InterruptedException ex) {
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
} }
}
}
}
if(bufferFilled && buffer.remaining() == 0) /**
throw new RuntimeEOFException("No more data left in InputStream"); * Create an encoded BAM file pointer given the address of a BGZF block and an offset.
} * @param blockAddress Physical address on disk of a BGZF block.
} * @param blockOffset Offset into the uncompressed data stored in the BGZF block.
* @return 64-bit pointer encoded according to the BAM spec.
*/
public static long makeFilePointer(final long blockAddress, final int blockOffset) {
return blockAddress << 16 | blockOffset;
} }
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2011, The Broad Institute * Copyright (c) 2012, The Broad Institute
* *
* Permission is hereby granted, free of charge, to any person * Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation * obtaining a copy of this software and associated documentation
@ -70,29 +70,29 @@ class BlockLoader implements Runnable {
public void run() { public void run() {
for(;;) { for(;;) {
SAMReaderPosition readerPosition = null; BAMAccessPlan accessPlan = null;
try { try {
readerPosition = dispatcher.claimNextWorkRequest(); accessPlan = dispatcher.claimNextWorkRequest();
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader());
long blockAddress = readerPosition.getBlockAddress(); //long blockAddress = readerPosition.getBlockAddress();
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress());
long nextBlockAddress = position(inputStream); long nextBlockAddress = position(inputStream);
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream);
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock; ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
int bytesCopied = block.remaining(); int bytesCopied = block.remaining();
BlockInputStream bamInputStream = readerPosition.getInputStream(); BlockInputStream bamInputStream = accessPlan.getInputStream();
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress);
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
} }
catch(Throwable error) { catch(Throwable error) {
if(readerPosition != null && readerPosition.getInputStream() != null) if(accessPlan != null && accessPlan.getInputStream() != null)
readerPosition.getInputStream().reportException(error); accessPlan.getInputStream().reportException(error);
} }
} }

View File

@ -0,0 +1,203 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.List;
import java.util.NoSuchElementException;
/**
* High efficiency filtering iterator designed to filter out reads only included
* in the query results due to the granularity of the BAM index.
*
* Built into the BAM index is a notion of 16kbase granularity -- an index query for
* two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will
* return exactly the same regions within the BAM file. This iterator is optimized
* to subtract out reads which do not at all overlap the interval list passed to the
* constructor.
*
* Example:
* interval list: chr20:6-10
* Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15.
* Reads that would be discarded by the filter: chr20:1-5, chr20:11-15.
*/
class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private CloseableIterator<SAMRecord> iterator;
/**
* The next read, queued up and ready to go.
*/
private SAMRecord nextRead;
/**
* Rather than using the straight genomic bounds, use filter out only mapped reads.
*/
private boolean keepOnlyUnmappedReads;
/**
* Custom representation of interval bounds.
* Makes it simpler to track current position.
*/
private int[] intervalContigIndices;
private int[] intervalStarts;
private int[] intervalEnds;
/**
* Position within the interval list.
*/
private int currentBound = 0;
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator;
// Look at the interval list to detect whether we should worry about unmapped reads.
// If we find a mix of mapped/unmapped intervals, throw an exception.
boolean foundMappedIntervals = false;
for(GenomeLoc location: intervals) {
if(! GenomeLoc.isUnmapped(location))
foundMappedIntervals = true;
keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
}
if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalContigIndices = new int[intervals.size()];
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalContigIndices[i] = interval.getContigIndex();
intervalStarts[i] = interval.getStart();
intervalEnds[i] = interval.getStop();
i++;
}
}
advance();
}
public boolean hasNext() {
return nextRead != null;
}
public SAMRecord next() {
if(nextRead == null)
throw new NoSuchElementException("No more reads left in this iterator.");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
}
public void close() {
iterator.close();
}
private void advance() {
nextRead = null;
if(!iterator.hasNext())
return;
SAMRecord candidateRead = iterator.next();
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(!keepOnlyUnmappedReads) {
// Mapped read filter; check against GenomeLoc-derived bounds.
if(readEndsOnOrAfterStartingBound(candidateRead)) {
// This read ends after the current interval begins.
// Promising, but this read must be checked against the ending bound.
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
}
}
else {
// Found an unmapped read. We're done.
if(candidateRead.getReadUnmappedFlag()) {
nextRead = candidateRead;
break;
}
}
// No more reads available. Stop the search.
if(!iterator.hasNext())
break;
// No reasonable read found; advance the iterator.
candidateRead = iterator.next();
}
}
/**
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
* end will be distorted, so rely only on the alignment start.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
return
// Read ends on a later contig, or...
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
// Read ends of this contig...
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
// either after this location, or...
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
// read is unmapped but positioned and alignment start is on or after this start point.
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
}
/**
* Check whether the read lies before the end of the current bound.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
return
// Read starts on a prior contig, or...
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
// Read starts on this contig and the alignment start is registered before this end point.
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
}
}

View File

@ -36,7 +36,7 @@ import java.util.Map;
*/ */
public class ReadShard extends Shard { public class ReadShard extends Shard {
/** /**
* What is the maximum number of reads which should go into a read shard. * What is the maximum number of reads per BAM file which should go into a read shard.
*/ */
public static int MAX_READS = 10000; public static int MAX_READS = 10000;

View File

@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.SimpleTimer;
@ -567,9 +566,14 @@ public class SAMDataSource {
if(threadAllocation.getNumIOThreads() > 0) { if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id); BlockInputStream inputStream = readers.getInputStream(id);
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
codec.setInputStream(inputStream);
iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
} }
else {
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
}
if(shard.getGenomeLocs().size() > 0) if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator); iteratorMap.put(readers.getReader(id), iterator);
@ -577,8 +581,6 @@ public class SAMDataSource {
MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
return applyDecoratingIterators(shard.getReadMetrics(), return applyDecoratingIterators(shard.getReadMetrics(),
enableVerification, enableVerification,
readProperties.useOriginalBaseQualities(), readProperties.useOriginalBaseQualities(),
@ -592,6 +594,49 @@ public class SAMDataSource {
readProperties.defaultBaseQualities()); readProperties.defaultBaseQualities());
} }
private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
private final BlockInputStream inputStream;
private final SAMFileReader reader;
private final BAMRecordCodec codec;
private SAMRecord nextRead;
private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
this.inputStream = inputStream;
this.reader = reader;
this.codec = codec;
advance();
}
public boolean hasNext() {
return nextRead != null;
}
public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}
public void close() {
// NO-OP.
}
public void remove() {
throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
}
private void advance() {
final long startCoordinate = inputStream.getFilePointer();
nextRead = codec.decode();
final long stopCoordinate = inputStream.getFilePointer();
if(reader != null && nextRead != null)
PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate))));
}
}
/** /**
* Filter reads based on user-specified criteria. * Filter reads based on user-specified criteria.
* *
@ -871,11 +916,8 @@ public class SAMDataSource {
public ReaderInitializer call() { public ReaderInitializer call() {
final File indexFile = findIndexFile(readerID.samFile); final File indexFile = findIndexFile(readerID.samFile);
try { try {
if (threadAllocation.getNumIOThreads() > 0) { if (threadAllocation.getNumIOThreads() > 0)
blockInputStream = new BlockInputStream(dispatcher,readerID,false); blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = new SAMFileReader(blockInputStream,indexFile,false);
}
else
reader = new SAMFileReader(readerID.samFile,indexFile,false); reader = new SAMFileReader(readerID.samFile,indexFile,false);
} catch ( RuntimeIOException e ) { } catch ( RuntimeIOException e ) {
if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException )
@ -933,167 +975,6 @@ public class SAMDataSource {
*/ */
private class ReadGroupMapping extends HashMap<String,String> {} private class ReadGroupMapping extends HashMap<String,String> {}
/**
* Filters out reads that do not overlap the current GenomeLoc.
* Note the custom implementation: BAM index querying returns all reads that could
* possibly overlap the given region (and quite a few extras). In order not to drag
* down performance, this implementation is highly customized to its task.
*/
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
/**
* The wrapped iterator.
*/
private CloseableIterator<SAMRecord> iterator;
/**
* The next read, queued up and ready to go.
*/
private SAMRecord nextRead;
/**
* Rather than using the straight genomic bounds, use filter out only mapped reads.
*/
private boolean keepOnlyUnmappedReads;
/**
* Custom representation of interval bounds.
* Makes it simpler to track current position.
*/
private int[] intervalContigIndices;
private int[] intervalStarts;
private int[] intervalEnds;
/**
* Position within the interval list.
*/
private int currentBound = 0;
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator;
// Look at the interval list to detect whether we should worry about unmapped reads.
// If we find a mix of mapped/unmapped intervals, throw an exception.
boolean foundMappedIntervals = false;
for(GenomeLoc location: intervals) {
if(! GenomeLoc.isUnmapped(location))
foundMappedIntervals = true;
keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
}
if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalContigIndices = new int[intervals.size()];
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalContigIndices[i] = interval.getContigIndex();
intervalStarts[i] = interval.getStart();
intervalEnds[i] = interval.getStop();
i++;
}
}
advance();
}
public boolean hasNext() {
return nextRead != null;
}
public SAMRecord next() {
if(nextRead == null)
throw new NoSuchElementException("No more reads left in this iterator.");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
}
public void close() {
iterator.close();
}
private void advance() {
nextRead = null;
if(!iterator.hasNext())
return;
SAMRecord candidateRead = iterator.next();
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(!keepOnlyUnmappedReads) {
// Mapped read filter; check against GenomeLoc-derived bounds.
if(readEndsOnOrAfterStartingBound(candidateRead)) {
// This read ends after the current interval begins.
// Promising, but this read must be checked against the ending bound.
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
}
}
else {
// Found an unmapped read. We're done.
if(candidateRead.getReadUnmappedFlag()) {
nextRead = candidateRead;
break;
}
}
// No more reads available. Stop the search.
if(!iterator.hasNext())
break;
// No reasonable read found; advance the iterator.
candidateRead = iterator.next();
}
}
/**
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
* end will be distorted, so rely only on the alignment start.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
return
// Read ends on a later contig, or...
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
// Read ends of this contig...
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
// either after this location, or...
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
// read is unmapped but positioned and alignment start is on or after this start point.
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
}
/**
* Check whether the read lies before the end of the current bound.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
return
// Read starts on a prior contig, or...
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
// Read starts on this contig and the alignment start is registered before this end point.
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
}
}
/** /**
* Locates the index file alongside the given BAM, if present. * Locates the index file alongside the given BAM, if present.
* TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.

View File

@ -17,9 +17,21 @@ import java.util.List;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
/** /**
* Buffer shards of data which may or may not contain multiple loci into * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci
* iterators of all data which cover an interval. Its existence is an homage * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp
* to Mark's stillborn WindowMaker, RIP 2009. * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of
* loci to only those covered by the given interval list.
*
* Example:
* Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10
* Incoming intervals: chr20:3-7
*
* Locus iterator by state will produce the following stream of data:
* chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E},
* chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E}
*
* WindowMakerIterator will then filter the incoming stream, emitting the following stream:
* chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E}
* *
* @author mhanna * @author mhanna
* @version 0.1 * @version 0.1

View File

@ -470,7 +470,7 @@ public class LocusIteratorByState extends LocusIterator {
if (op == CigarOperator.D) { if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()))); pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
size++; size++;
nDeletions++; nDeletions++;
if (read.getMappingQuality() == 0) if (read.getMappingQuality() == 0)
@ -479,7 +479,7 @@ public class LocusIteratorByState extends LocusIterator {
} }
else { else {
if (!filterBaseInRead(read, location.getStart())) { if (!filterBaseInRead(read, location.getStart())) {
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()))); pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
size++; size++;
if (read.getMappingQuality() == 0) if (read.getMappingQuality() == 0)
nMQ0Reads++; nMQ0Reads++;

View File

@ -102,7 +102,9 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
} }
/** /**
* Gets the best view of loci for this walker given the available data. * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
* that comes along.
* @param walker walker to interrogate. * @param walker walker to interrogate.
* @param dataProvider Data which which to drive the locus view. * @param dataProvider Data which which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.

View File

@ -39,7 +39,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
@ -200,7 +203,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
public class BAQedPileupElement extends PileupElement { public class BAQedPileupElement extends PileupElement {
public BAQedPileupElement( final PileupElement PE ) { public BAQedPileupElement( final PileupElement PE ) {
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
} }
@Override @Override

View File

@ -129,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \ * -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv * -recalFile my_reads.recal_data.csv
* </pre> * </pre>
*
*/ */
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file @By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta // Filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS) @PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> { public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
///////////////////////////// /////////////////////////////
// Shared Arguments // Shared Arguments
///////////////////////////// /////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); @ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* Adds the values of other to this, returning this * Adds the values of other to this, returning this
*
* @param other * @param other
* @return this object * @return this object
*/ */
@ -247,8 +250,9 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/ */
public void initialize() { public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } if (RAC.FORCE_PLATFORM != null) {
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates // Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins(); final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -276,7 +280,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
if (requiredClasses.size() == 2) { // readGroup and reported quality score if (requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
requestedCovariates.add(new QualityScoreCovariate()); requestedCovariates.add(new QualityScoreCovariate());
} else { }
else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order."); throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
} }
// Next add the standard covariates if -standard was specified by the user // Next add the standard covariates if -standard was specified by the user
@ -333,7 +338,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// map // map
@ -343,6 +347,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* For each read at this locus get the various covariate values and increment that location in the map based on * For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location * whether or not the base matches the reference at this particular location
*
* @param tracker The reference metadata tracker * @param tracker The reference metadata tracker
* @param ref The reference context * @param ref The reference context
* @param context The alignment context * @param context The alignment context
@ -362,8 +367,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
continue; continue;
} }
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
{
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true); gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
RecalDataManager.parseSAMRecord(gatkRead, RAC); RecalDataManager.parseSAMRecord(gatkRead, RAC);
@ -374,8 +378,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
RecalDataManager.parseColorSpace(gatkRead); RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
} }
// Skip this position if base quality is zero // Skip this position if base quality is zero
@ -394,10 +397,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// This base finally passed all the checks for a good base, so add it to the big data hashmap // This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead(counter, gatkRead, offset, refBase); updateDataFromRead(counter, gatkRead, offset, refBase);
} else { // calculate SOLID reference insertion rate }
else { // calculate SOLID reference insertion rate
if (refBase == bases[offset]) { if (refBase == bases[offset]) {
counter.solidInsertedReferenceBases++; counter.solidInsertedReferenceBases++;
} else { }
else {
counter.otherColorSpaceInconsistency++; counter.otherColorSpaceInconsistency++;
} }
} }
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
} }
counter.countedSites++; counter.countedSites++;
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus }
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
counter.skippedSites++; counter.skippedSites++;
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
} }
@ -442,6 +448,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* adding one to the number of observations and potentially one to the number of mismatches * adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation. * because pulling things out of the SAMRecord is an expensive operation.
*
* @param counter Data structure which holds the counted bases * @param counter Data structure which holds the counted bases
* @param gatkRead The SAMRecord holding all the data for this read * @param gatkRead The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus * @param offset The offset in the read for this locus
@ -470,7 +477,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// reduce // reduce
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker. * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
*
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker * @return returns A PrintStream created from the -recalFile filename argument specified to the walker
*/ */
public CountedData reduceInit() { public CountedData reduceInit() {
@ -487,6 +494,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* The Reduce method doesn't do anything for this walker. * The Reduce method doesn't do anything for this walker.
*
* @param mapped Result of the map. This value is immediately ignored. * @param mapped Result of the map. This value is immediately ignored.
* @param sum The summing CountedData used to output the CSV data * @param sum The summing CountedData used to output the CSV data
* @return returns The sum used to output the CSV data * @return returns The sum used to output the CSV data
@ -508,8 +516,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases; final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) { if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
} }
} }
@ -524,6 +531,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* Write out the full data hashmap to disk in CSV format * Write out the full data hashmap to disk in CSV format
*
* @param sum The CountedData to write out to RECAL_FILE * @param sum The CountedData to write out to RECAL_FILE
*/ */
public void onTraversalDone(CountedData sum) { public void onTraversalDone(CountedData sum) {
@ -537,6 +545,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
*
* @param recalTableStream The PrintStream to write out to * @param recalTableStream The PrintStream to write out to
*/ */
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) { private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
@ -558,7 +567,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
if (DONT_SORT_OUTPUT) { if (DONT_SORT_OUTPUT) {
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} else { }
else {
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} }
@ -585,7 +595,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
// Output the RecalDatum entry // Output the RecalDatum entry
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map }
else { // Another layer in the nested hash map
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val); printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
} }
} }
@ -603,7 +614,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
// Output the RecalDatum entry // Output the RecalDatum entry
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map }
else { // Another layer in the nested hash map
printMappings(recalTableStream, curPos + 1, key, (Map) val); printMappings(recalTableStream, curPos + 1, key, (Map) val);
} }
} }

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -256,32 +257,6 @@ public class RecalDataManager {
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if (readGroup == null) {
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName());
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
else {
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
}
}
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
final String oldPlatform = readGroup.getPlatform();
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
readGroup.setPlatform(oldPlatform);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM); readGroup.setPlatform(RAC.FORCE_PLATFORM);
} }
@ -310,7 +285,7 @@ public class RecalDataManager {
public static void parseColorSpace(final GATKSAMRecord read) { public static void parseColorSpace(final GATKSAMRecord read) {
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { if (ReadUtils.isSOLiDRead(read)) {
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) { if (attr != null) {
@ -408,7 +383,7 @@ public class RecalDataManager {
} }
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { if (ReadUtils.isSOLiDRead(read)) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) { if (attr != null) {
byte[] colorSpace; byte[] colorSpace;
@ -637,21 +612,17 @@ public class RecalDataManager {
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (int i = 0; i < numRequestedCovariates; i++) {
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType); requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
for (int j = 0; j < readLength; j++) { for (int j = 0; j < readLength; j++)
//copy values into a 2D array that allows all covar types to be extracted at once for covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
}
} }
return covariateValues_offset_x_covar; return covariateValues_offset_x_covar;
} }
/** /**
* Perform a ceratin transversion (A <-> C or G <-> T) on the base. * Perform a certain transversion (A <-> C or G <-> T) on the base.
* *
* @param base the base [AaCcGgTt] * @param base the base [AaCcGgTt]
* @return the transversion of the base, or the input base if it's not one of the understood ones * @return the transversion of the base, or the input base if it's not one of the understood ones

View File

@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
// Shared Command Line Arguments // Shared Command Line Arguments
////////////////////////////////// //////////////////////////////////
@Hidden @Hidden
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
public String DEFAULT_READ_GROUP = null;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null; public String DEFAULT_PLATFORM = null;
@Hidden @Hidden
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
public String FORCE_READ_GROUP = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null; public String FORCE_PLATFORM = null;
@Hidden @Hidden
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
public int WINDOW_SIZE = 5; public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Hidden
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
@Hidden
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
public boolean EXCEPTION_IF_NO_TILE = false;
/** /**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies. * reads which have had the reference inserted because of color space inconsistencies.
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
public int CONTEXT_SIZE = 8; public int CONTEXT_SIZE = 8;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
} }

View File

@ -86,12 +86,12 @@ import java.util.regex.Pattern;
* -o my_reads.recal.bam \ * -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv * -recalFile my_reads.recal_data.csv
* </pre> * </pre>
*
*/ */
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@WalkerName("TableRecalibration") @WalkerName("TableRecalibration")
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta @Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires -I input.bam, it also requires -R reference.fasta
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> { public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
///////////////////////////// /////////////////////////////
// Shared Arguments // Shared Arguments
///////////////////////////// /////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); @ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
@ -165,7 +166,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations") @Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false; private boolean skipUQUpdate = false;
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
///////////////////////////// /////////////////////////////
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/ */
public void initialize() { public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } if (RAC.FORCE_PLATFORM != null) {
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates // Get a list of all available covariates
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins(); final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -213,14 +214,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
lineNumber++; lineNumber++;
if (EOF_MARKER.equals(line)) { if (EOF_MARKER.equals(line)) {
sawEOF = true; sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { }
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#') ; // Skip over the comment lines, (which start with '#')
} }
// Read in the covariates that were used from the input file // Read in the covariates that were used from the input file
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
if (foundAllCovariates) { if (foundAllCovariates) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE); throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
} else { // Found the covariate list in input file, loop through all of them and instantiate them }
else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(","); String[] vals = line.split(",");
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false; boolean foundClass = false;
@ -243,7 +246,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
} }
} else { // Found a line of data }
else { // Found a line of data
if (!foundAllCovariates) { if (!foundAllCovariates) {
foundAllCovariates = true; foundAllCovariates = true;
@ -302,7 +306,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
try { try {
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
programRecord.setProgramVersion(version); programRecord.setProgramVersion(version);
} catch (MissingResourceException e) {} } catch (MissingResourceException e) {
}
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
@ -331,6 +336,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
*
* @param line A line of CSV data read from the recalibration table data file * @param line A line of CSV data read from the recalibration table data file
*/ */
private void addCSVData(final File file, final String line) { private void addCSVData(final File file, final String line) {
@ -388,7 +394,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
numReadsWithMalformedColorSpace++; numReadsWithMalformedColorSpace++;
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { }
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
read.setReadFailsVendorQualityCheckFlag(true); read.setReadFailsVendorQualityCheckFlag(true);
return read; return read;
} }
@ -398,8 +405,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
//compute all covariate values for this read //compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar = final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
// For each base in the read // For each base in the read
for (int offset = 0; offset < read.getReadLength(); offset++) { for (int offset = 0; offset < read.getReadLength(); offset++) {
@ -407,8 +413,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null) if (qualityScore == null) {
{
qualityScore = performSequentialQualityCalculation(fullCovariateKey); qualityScore = performSequentialQualityCalculation(fullCovariateKey);
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
} }
@ -446,6 +451,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* - The final shift equation is: * - The final shift equation is:
* *
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param key The list of Comparables that were calculated from the covariates * @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte * @return A recalibrated quality score as a byte
*/ */
@ -508,6 +514,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
*
* @param originalQuals The list of original base quality scores * @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores * @param recalQuals A list of the new recalibrated quality scores
*/ */
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Start the reduce with a handle to the output bam file * Start the reduce with a handle to the output bam file
*
* @return A FileWriter pointing to a new bam file * @return A FileWriter pointing to a new bam file
*/ */
public SAMFileWriter reduceInit() { public SAMFileWriter reduceInit() {
@ -535,6 +543,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Output each read to disk * Output each read to disk
*
* @param read The read to output * @param read The read to output
* @param output The FileWriter to write the read to * @param output The FileWriter to write the read to
* @return The FileWriter * @return The FileWriter
@ -548,6 +557,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Do nothing * Do nothing
*
* @param output The SAMFileWriter that outputs the bam file * @param output The SAMFileWriter that outputs the bam file
*/ */
public void onTraversalDone(SAMFileWriter output) { public void onTraversalDone(SAMFileWriter output) {
@ -557,7 +567,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!"); "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { }
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
/** /**
* BaseUtils contains some basic utilities for manipulating nucleotides. * BaseUtils contains some basic utilities for manipulating nucleotides.
*/ */
@ -29,21 +28,25 @@ public class BaseUtils {
byte b; byte b;
int index; int index;
private Base(char base, int index) { private Base(char base, int index) {
this.b = (byte) base; this.b = (byte) base;
this.index = index; this.index = index;
} }
public byte getBase() { return b; } public byte getBase() { return b; }
public char getBaseAsChar() { return (char) b; } public char getBaseAsChar() { return (char) b; }
public int getIndex() { return index; } public int getIndex() { return index; }
public boolean sameBase(byte o) { return b == o; } public boolean sameBase(byte o) { return b == o; }
public boolean sameBase(char o) { return b == (byte) o; } public boolean sameBase(char o) { return b == (byte) o; }
public boolean sameBase(int i) { return index == i; } public boolean sameBase(int i) { return index == i; }
} }
// todo -- fix me (enums?) // todo -- fix me (enums?)
public static final byte DELETION_INDEX = 4; public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N') public static final byte NO_CALL_INDEX = 5; // (this is 'N')
@ -53,7 +56,6 @@ public class BaseUtils {
public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A');
public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T');
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
// a pyrimidine to another pyrimidine nucleotide (C <-> T). // a pyrimidine to another pyrimidine nucleotide (C <-> T).
// Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions. // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
@ -64,6 +66,7 @@ public class BaseUtils {
/** /**
* Returns the base substitution type of the 2 state SNP * Returns the base substitution type of the 2 state SNP
*
* @param base1 * @param base1
* @param base2 * @param base2
* @return * @return
@ -85,7 +88,9 @@ public class BaseUtils {
return !isTransition(base1, base2); return !isTransition(base1, base2);
} }
/** Private constructor. No instantiating this class! */ /**
* Private constructor. No instantiating this class!
*/
private BaseUtils() {} private BaseUtils() {}
static public boolean basesAreEqual(byte base1, byte base2) { static public boolean basesAreEqual(byte base1, byte base2) {
@ -96,7 +101,6 @@ public class BaseUtils {
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
} }
/** /**
* Converts a IUPAC nucleotide code to a pair of bases * Converts a IUPAC nucleotide code to a pair of bases
* *
@ -170,22 +174,26 @@ public class BaseUtils {
switch (base) { switch (base) {
case '*': // the wildcard character counts as an A case '*': // the wildcard character counts as an A
case 'A': case 'A':
case 'a': return 0; case 'a':
return 0;
case 'C': case 'C':
case 'c': return 1; case 'c':
return 1;
case 'G': case 'G':
case 'g': return 2; case 'g':
return 2;
case 'T': case 'T':
case 't': return 3; case 't':
return 3;
default: return -1; default:
return -1;
} }
} }
/** /**
* Converts a simple base to a base index * Converts a simple base to a base index
* *
@ -197,29 +205,37 @@ public class BaseUtils {
switch (base) { switch (base) {
case '*': // the wildcard character counts as an A case '*': // the wildcard character counts as an A
case 'A': case 'A':
case 'a': return 0; case 'a':
return 0;
case 'C': case 'C':
case 'c': return 1; case 'c':
return 1;
case 'G': case 'G':
case 'g': return 2; case 'g':
return 2;
case 'T': case 'T':
case 't': return 3; case 't':
return 3;
default: return -1; default:
return -1;
} }
} }
static public int extendedBaseToBaseIndex(byte base) { static public int extendedBaseToBaseIndex(byte base) {
switch (base) { switch (base) {
case 'd': case 'd':
case 'D': return DELETION_INDEX; case 'D':
return DELETION_INDEX;
case 'n': case 'n':
case 'N': return NO_CALL_INDEX; case 'N':
return NO_CALL_INDEX;
default: return simpleBaseToBaseIndex(base); default:
return simpleBaseToBaseIndex(base);
} }
} }
@ -232,11 +248,6 @@ public class BaseUtils {
return simpleBaseToBaseIndex(base) != -1; return simpleBaseToBaseIndex(base) != -1;
} }
@Deprecated
static public boolean isNBase(char base) {
return isNBase((byte)base);
}
static public boolean isNBase(byte base) { static public boolean isNBase(byte base) {
return base == 'N' || base == 'n'; return base == 'N' || base == 'n';
} }
@ -249,11 +260,16 @@ public class BaseUtils {
*/ */
static public byte baseIndexToSimpleBase(int baseIndex) { static public byte baseIndexToSimpleBase(int baseIndex) {
switch (baseIndex) { switch (baseIndex) {
case 0: return 'A'; case 0:
case 1: return 'C'; return 'A';
case 2: return 'G'; case 1:
case 3: return 'T'; return 'C';
default: return '.'; case 2:
return 'G';
case 3:
return 'T';
default:
return '.';
} }
} }
@ -270,11 +286,16 @@ public class BaseUtils {
*/ */
static public int crossTalkPartnerIndex(int baseIndex) { static public int crossTalkPartnerIndex(int baseIndex) {
switch (baseIndex) { switch (baseIndex) {
case 0: return 1; // A -> C case 0:
case 1: return 0; // C -> A return 1; // A -> C
case 2: return 3; // G -> T case 1:
case 3: return 2; // T -> G return 0; // C -> A
default: return -1; case 2:
return 3; // G -> T
case 3:
return 2; // T -> G
default:
return -1;
} }
} }
@ -297,11 +318,16 @@ public class BaseUtils {
*/ */
static public byte complementIndex(int baseIndex) { static public byte complementIndex(int baseIndex) {
switch (baseIndex) { switch (baseIndex) {
case 0: return 3; // a -> t case 0:
case 1: return 2; // c -> g return 3; // a -> t
case 2: return 1; // g -> c case 1:
case 3: return 0; // t -> a return 2; // c -> g
default: return -1; // wtf? case 2:
return 1; // g -> c
case 3:
return 0; // t -> a
default:
return -1; // wtf?
} }
} }
@ -314,14 +340,19 @@ public class BaseUtils {
static public byte simpleComplement(byte base) { static public byte simpleComplement(byte base) {
switch (base) { switch (base) {
case 'A': case 'A':
case 'a': return 'T'; case 'a':
return 'T';
case 'C': case 'C':
case 'c': return 'G'; case 'c':
return 'G';
case 'G': case 'G':
case 'g': return 'C'; case 'g':
return 'C';
case 'T': case 'T':
case 't': return 'A'; case 't':
default: return base; return 'A';
default:
return base;
} }
} }
@ -407,7 +438,6 @@ public class BaseUtils {
return new String(simpleReverseComplement(bases.getBytes())); return new String(simpleReverseComplement(bases.getBytes()));
} }
/** /**
* Complement a String of bases. Preserves ambiguous bases. * Complement a String of bases. Preserves ambiguous bases.
* *
@ -532,8 +562,8 @@ public class BaseUtils {
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
} }
/**
/** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period * The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
* of 2 (it has a period of 4 as well), and so does * of 2 (it has a period of 4 as well), and so does
@ -554,9 +584,7 @@ public class BaseUtils {
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
// if our current hypothesis holds, base[pos] must be the same as base[offset] // if our current hypothesis holds, base[pos] must be the same as base[offset]
if ( Character.toUpperCase( seq[pos] ) != if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
Character.toUpperCase( seq[offset] )
) {
// period we have been trying so far does not work. // period we have been trying so far does not work.
// two possibilities: // two possibilities:
@ -569,8 +597,10 @@ public class BaseUtils {
// we set period to pos (thus trying the hypothesis that bases from start up to the current one, // we set period to pos (thus trying the hypothesis that bases from start up to the current one,
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
// on the next loop re-entrance after pos is autoincremented) // on the next loop re-entrance after pos is autoincremented)
if ( offset == 0 ) period = pos+1; if (offset == 0)
else period = pos-- ; period = pos + 1;
else
period = pos--;
} }
} }

View File

@ -87,7 +87,7 @@ public enum NGSPlatform {
/** /**
* Returns the NGSPlatform corresponding to the PL tag in the read group * Returns the NGSPlatform corresponding to the PL tag in the read group
* @param plFromRG -- the PL field (or equivalent) in a ReadGroup object * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object
* @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match * @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match
*/ */
public static final NGSPlatform fromReadGroupPL(final String plFromRG) { public static final NGSPlatform fromReadGroupPL(final String plFromRG) {
if ( plFromRG == null ) return UNKNOWN; if ( plFromRG == null ) return UNKNOWN;
@ -105,4 +105,14 @@ public enum NGSPlatform {
return UNKNOWN; return UNKNOWN;
} }
/**
* checks whether or not the requested platform is listed in the set (and is not unknown)
*
* @param platform the read group string that describes the platform used
* @return true if the platform is known (i.e. it's in the list and is not UNKNOWN)
*/
public static final boolean isKnown (final String platform) {
return fromReadGroupPL(platform) != UNKNOWN;
}
} }

View File

@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
for (int i = 0; i < reads.size(); i++) { for (int i = 0; i < reads.size(); i++) {
GATKSAMRecord read = reads.get(i); GATKSAMRecord read = reads.get(i);
int offset = offsets.get(i); int offset = offsets.get(i);
pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
} }
return pileup; return pileup;
@ -196,7 +196,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>(); UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
for (GATKSAMRecord read : reads) { for (GATKSAMRecord read : reads) {
pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
} }
return pileup; return pileup;
@ -204,7 +204,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker); protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip);
// -------------------------------------------------------- // --------------------------------------------------------
// //

View File

@ -48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement {
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
super(read, offset, type == Type.DELETION, false, false); // extended events are slated for removal super(read, offset, type == Type.DELETION, false, false, false); // extended events are slated for removal
this.read = read; this.read = read;
this.offset = offset; this.offset = offset;
this.eventLength = eventLength; this.eventLength = eventLength;

View File

@ -24,6 +24,7 @@ public class PileupElement implements Comparable<PileupElement> {
protected final GATKSAMRecord read; protected final GATKSAMRecord read;
protected final int offset; protected final int offset;
protected final boolean isDeletion; protected final boolean isDeletion;
protected final boolean isBeforeDeletion;
protected final boolean isBeforeInsertion; protected final boolean isBeforeInsertion;
protected final boolean isNextToSoftClip; protected final boolean isNextToSoftClip;
@ -33,6 +34,7 @@ public class PileupElement implements Comparable<PileupElement> {
* @param read the read we are adding to the pileup * @param read the read we are adding to the pileup
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
* @param isDeletion whether or not this base is a deletion * @param isDeletion whether or not this base is a deletion
* @param isBeforeDeletion whether or not this base is before a deletion
* @param isBeforeInsertion whether or not this base is before an insertion * @param isBeforeInsertion whether or not this base is before an insertion
* @param isNextToSoftClip whether or not this base is next to a soft clipped base * @param isNextToSoftClip whether or not this base is next to a soft clipped base
*/ */
@ -40,13 +42,14 @@ public class PileupElement implements Comparable<PileupElement> {
"read != null", "read != null",
"offset >= -1", "offset >= -1",
"offset <= read.getReadLength()"}) "offset <= read.getReadLength()"})
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) {
if (offset < 0 && isDeletion) if (offset < 0 && isDeletion)
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
this.read = read; this.read = read;
this.offset = offset; this.offset = offset;
this.isDeletion = isDeletion; this.isDeletion = isDeletion;
this.isBeforeDeletion = isBeforeDeletion;
this.isBeforeInsertion = isBeforeInsertion; this.isBeforeInsertion = isBeforeInsertion;
this.isNextToSoftClip = isNextToSoftClip; this.isNextToSoftClip = isNextToSoftClip;
} }
@ -55,6 +58,10 @@ public class PileupElement implements Comparable<PileupElement> {
return isDeletion; return isDeletion;
} }
public boolean isBeforeDeletion() {
return isBeforeDeletion;
}
public boolean isBeforeInsertion() { public boolean isBeforeInsertion() {
return isBeforeInsertion; return isBeforeInsertion;
} }

View File

@ -96,7 +96,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup<
} }
@Override @Override
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
} }

View File

@ -71,7 +71,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
} }
@Override @Override
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
return new PileupElement(read, offset, isDeletion, isBeforeInsertion, isNextToSoftClip); return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip);
} }
} }

View File

@ -361,10 +361,10 @@ public class ArtificialSAMUtils {
final GATKSAMRecord left = pair.get(0); final GATKSAMRecord left = pair.get(0);
final GATKSAMRecord right = pair.get(1); final GATKSAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false)); pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false));
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false)); pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false));
} }
} }

View File

@ -30,7 +30,7 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
public class GATKReportUnitTest extends BaseTest { public class GATKReportUnitTest extends BaseTest {
@Test @Test(enabled = false)
public void testParse() throws Exception { public void testParse() throws Exception {
String reportPath = validationDataLocation + "exampleGATKReport.eval"; String reportPath = validationDataLocation + "exampleGATKReport.eval";
GATKReport report = new GATKReport(reportPath); GATKReport report = new GATKReport(reportPath);

View File

@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest {
@Test @Test
public void testReducedReadPileupElement() { public void testReducedReadPileupElement() {
PileupElement readp = new PileupElement(read, 0, false, false, false); PileupElement readp = new PileupElement(read, 0, false, false, false, false);
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false); PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false);
Assert.assertFalse(readp.getRead().isReducedRead()); Assert.assertFalse(readp.getRead().isReducedRead());

View File

@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk
import java.io.File import java.io.File
import collection.JavaConversions._ import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils}
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
import net.sf.samtools.SAMFileHeader import net.sf.samtools.SAMFileHeader
import java.util.Collections import java.util.Collections
@ -50,6 +50,8 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) {
IntervalUtils.parseIntervalArguments(parser, intervals) IntervalUtils.parseIntervalArguments(parser, intervals)
Collections.sort(parsedLocs) Collections.sort(parsedLocs)
Collections.unmodifiableList(parsedLocs) Collections.unmodifiableList(parsedLocs)
val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY)
Collections.unmodifiableList(mergedLocs)
} }
lazy val contigs = locs.map(_.getContig).distinct.toSeq lazy val contigs = locs.map(_.getContig).distinct.toSeq

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile
import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser}
import collection.JavaConversions._ import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
class GATKIntervalsUnitTest { class GATKIntervalsUnitTest {
private final lazy val hg18Reference = new File(BaseTest.hg18Reference) private final lazy val hg18Reference = new File(BaseTest.hg18Reference)
@ -60,7 +61,7 @@ class GATKIntervalsUnitTest {
// for(Item item: javaConvertedScalaList) // for(Item item: javaConvertedScalaList)
// This for loop is actually an O(N^2) operation as the iterator calls the // This for loop is actually an O(N^2) operation as the iterator calls the
// O(N) javaConvertedScalaList.size() for each iteration of the loop. // O(N) javaConvertedScalaList.size() for each iteration of the loop.
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(IntervalUtils.splitFixedIntervals(gi.locs, 189894).size(), 189894)
Assert.assertEquals(gi.contigs.size, 24) Assert.assertEquals(gi.contigs.size, 24)
} }
@ -77,4 +78,17 @@ class GATKIntervalsUnitTest {
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3")) Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3"))
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3"))
} }
@Test
def testSortAndMergeIntervals() {
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10"))
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12"))
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"))
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30"))
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30"))
}
private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) {
Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_)))
}
} }