Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
5e18020a5f
|
|
@ -1,762 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
package net.sf.samtools;
|
|
||||||
|
|
||||||
|
|
||||||
import net.sf.samtools.util.*;
|
|
||||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Internal class for reading and querying BAM files.
|
|
||||||
*/
|
|
||||||
class BAMFileReader extends SAMFileReader.ReaderImplementation {
|
|
||||||
// True if reading from a File rather than an InputStream
|
|
||||||
private boolean mIsSeekable = false;
|
|
||||||
|
|
||||||
// For converting bytes into other primitive types
|
|
||||||
private BinaryCodec mStream = null;
|
|
||||||
|
|
||||||
// Underlying compressed data stream.
|
|
||||||
private final BAMInputStream mInputStream;
|
|
||||||
private SAMFileHeader mFileHeader = null;
|
|
||||||
|
|
||||||
// Populated if the file is seekable and an index exists
|
|
||||||
private File mIndexFile;
|
|
||||||
private BAMIndex mIndex = null;
|
|
||||||
private long mFirstRecordPointer = 0;
|
|
||||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
|
||||||
|
|
||||||
// If true, all SAMRecords are fully decoded as they are read.
|
|
||||||
private final boolean eagerDecode;
|
|
||||||
|
|
||||||
// For error-checking.
|
|
||||||
private ValidationStringency mValidationStringency;
|
|
||||||
|
|
||||||
// For creating BAMRecords
|
|
||||||
private SAMRecordFactory samRecordFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Use the caching index reader implementation rather than the disk-hit-per-file model.
|
|
||||||
*/
|
|
||||||
private boolean mEnableIndexCaching = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
|
|
||||||
*/
|
|
||||||
private boolean mEnableIndexMemoryMapping = true;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add information about the origin (reader and position) to SAM records.
|
|
||||||
*/
|
|
||||||
private SAMFileReader mFileReader = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to read BAM from a stream (not seekable)
|
|
||||||
* @param stream source of bytes.
|
|
||||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
|
||||||
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
|
||||||
*/
|
|
||||||
BAMFileReader(final InputStream stream,
|
|
||||||
final File indexFile,
|
|
||||||
final boolean eagerDecode,
|
|
||||||
final ValidationStringency validationStringency,
|
|
||||||
final SAMRecordFactory factory)
|
|
||||||
throws IOException {
|
|
||||||
mIndexFile = indexFile;
|
|
||||||
mIsSeekable = false;
|
|
||||||
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
|
|
||||||
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
|
|
||||||
this.eagerDecode = eagerDecode;
|
|
||||||
this.mValidationStringency = validationStringency;
|
|
||||||
this.samRecordFactory = factory;
|
|
||||||
readHeader(null);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to read BAM from a file (seekable)
|
|
||||||
* @param file source of bytes.
|
|
||||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
|
||||||
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
|
||||||
*/
|
|
||||||
BAMFileReader(final File file,
|
|
||||||
final File indexFile,
|
|
||||||
final boolean eagerDecode,
|
|
||||||
final ValidationStringency validationStringency,
|
|
||||||
final SAMRecordFactory factory)
|
|
||||||
throws IOException {
|
|
||||||
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
|
|
||||||
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
|
|
||||||
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
|
|
||||||
" is older than BAM " + file.getAbsolutePath());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
BAMFileReader(final SeekableStream strm,
|
|
||||||
final File indexFile,
|
|
||||||
final boolean eagerDecode,
|
|
||||||
final ValidationStringency validationStringency,
|
|
||||||
final SAMRecordFactory factory)
|
|
||||||
throws IOException {
|
|
||||||
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
|
|
||||||
indexFile,
|
|
||||||
eagerDecode,
|
|
||||||
strm.getSource(),
|
|
||||||
validationStringency,
|
|
||||||
factory);
|
|
||||||
}
|
|
||||||
|
|
||||||
private BAMFileReader(final BAMInputStream inputStream,
|
|
||||||
final File indexFile,
|
|
||||||
final boolean eagerDecode,
|
|
||||||
final String source,
|
|
||||||
final ValidationStringency validationStringency,
|
|
||||||
final SAMRecordFactory factory)
|
|
||||||
throws IOException {
|
|
||||||
mIndexFile = indexFile;
|
|
||||||
mIsSeekable = true;
|
|
||||||
mInputStream = inputStream;
|
|
||||||
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
|
|
||||||
this.eagerDecode = eagerDecode;
|
|
||||||
this.mValidationStringency = validationStringency;
|
|
||||||
this.samRecordFactory = factory;
|
|
||||||
readHeader(source);
|
|
||||||
mFirstRecordPointer = inputStream.getFilePointer();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If true, writes the source of every read into the source SAMRecords.
|
|
||||||
* @param enabled true to write source information into each SAMRecord.
|
|
||||||
*/
|
|
||||||
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
|
|
||||||
this.mFileReader = enabled ? reader : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If true, uses the caching version of the index reader.
|
|
||||||
* @param enabled true to write source information into each SAMRecord.
|
|
||||||
*/
|
|
||||||
public void enableIndexCaching(final boolean enabled) {
|
|
||||||
if(mIndex != null)
|
|
||||||
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
|
|
||||||
this.mEnableIndexCaching = enabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
|
|
||||||
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
|
|
||||||
* @param enabled True to use memory mapping, false to use regular I/O.
|
|
||||||
*/
|
|
||||||
public void enableIndexMemoryMapping(final boolean enabled) {
|
|
||||||
if (mIndex != null) {
|
|
||||||
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
|
|
||||||
}
|
|
||||||
this.mEnableIndexMemoryMapping = enabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override void enableCrcChecking(final boolean enabled) {
|
|
||||||
this.mInputStream.setCheckCrcs(enabled);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return true if ths is a BAM file, and has an index
|
|
||||||
*/
|
|
||||||
public boolean hasIndex() {
|
|
||||||
return (mIndexFile != null);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
|
|
||||||
* @return An index of the given type.
|
|
||||||
*/
|
|
||||||
public BAMIndex getIndex() {
|
|
||||||
if(mIndexFile == null)
|
|
||||||
throw new SAMException("No index is available for this BAM file.");
|
|
||||||
if(mIndex == null)
|
|
||||||
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
|
|
||||||
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
|
|
||||||
return mIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void close() {
|
|
||||||
if (mStream != null) {
|
|
||||||
mStream.close();
|
|
||||||
}
|
|
||||||
if (mIndex != null) {
|
|
||||||
mIndex.close();
|
|
||||||
}
|
|
||||||
mStream = null;
|
|
||||||
mFileHeader = null;
|
|
||||||
mIndex = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
SAMFileHeader getFileHeader() {
|
|
||||||
return mFileHeader;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set error-checking level for subsequent SAMRecord reads.
|
|
||||||
*/
|
|
||||||
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
|
||||||
this.mValidationStringency = validationStringency;
|
|
||||||
}
|
|
||||||
|
|
||||||
SAMFileReader.ValidationStringency getValidationStringency() {
|
|
||||||
return this.mValidationStringency;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to iterate through the SAMRecords in file order.
|
|
||||||
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
|
||||||
* that iterator must be closed before getIterator() can be called again.
|
|
||||||
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
|
||||||
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
|
||||||
* done in that situation.
|
|
||||||
*/
|
|
||||||
CloseableIterator<SAMRecord> getIterator() {
|
|
||||||
if (mStream == null) {
|
|
||||||
throw new IllegalStateException("File reader is closed");
|
|
||||||
}
|
|
||||||
if (mCurrentIterator != null) {
|
|
||||||
throw new IllegalStateException("Iteration in progress");
|
|
||||||
}
|
|
||||||
if (mIsSeekable) {
|
|
||||||
try {
|
|
||||||
mInputStream.seek(mFirstRecordPointer);
|
|
||||||
} catch (IOException exc) {
|
|
||||||
throw new RuntimeException(exc.getMessage(), exc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mCurrentIterator = new BAMFileIterator();
|
|
||||||
return mCurrentIterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
|
|
||||||
if (mStream == null) {
|
|
||||||
throw new IllegalStateException("File reader is closed");
|
|
||||||
}
|
|
||||||
if (mCurrentIterator != null) {
|
|
||||||
throw new IllegalStateException("Iteration in progress");
|
|
||||||
}
|
|
||||||
if (!(chunks instanceof BAMFileSpan)) {
|
|
||||||
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create an iterator over the given chunk boundaries.
|
|
||||||
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
|
|
||||||
return mCurrentIterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
|
|
||||||
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
|
|
||||||
* the rightmost bound is guaranteed to be after the last read in the file.
|
|
||||||
* @return An unbounded pointer to the first record in the BAM file.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
SAMFileSpan getFilePointerSpanningReads() {
|
|
||||||
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to iterate through the SAMRecords that match the given interval.
|
|
||||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
|
||||||
* before calling any of the methods that return an iterator.
|
|
||||||
*
|
|
||||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
|
||||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
|
||||||
* matches the specified interval.
|
|
||||||
*
|
|
||||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
|
||||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
|
||||||
*
|
|
||||||
* @param sequence Reference sequence sought.
|
|
||||||
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
|
||||||
* A value of zero implies the start of the reference sequence.
|
|
||||||
* @param end A value of zero implies the end of the reference sequence.
|
|
||||||
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
|
||||||
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
|
||||||
* @return Iterator for the matching SAMRecords
|
|
||||||
*/
|
|
||||||
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
|
||||||
if (mStream == null) {
|
|
||||||
throw new IllegalStateException("File reader is closed");
|
|
||||||
}
|
|
||||||
if (mCurrentIterator != null) {
|
|
||||||
throw new IllegalStateException("Iteration in progress");
|
|
||||||
}
|
|
||||||
if (!mIsSeekable) {
|
|
||||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
|
||||||
}
|
|
||||||
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
|
||||||
return mCurrentIterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to iterate through the SAMRecords with the given alignment start.
|
|
||||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
|
||||||
* before calling any of the methods that return an iterator.
|
|
||||||
*
|
|
||||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
|
||||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
|
||||||
* matches the specified interval.
|
|
||||||
*
|
|
||||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
|
||||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
|
||||||
*
|
|
||||||
* @param sequence Reference sequence sought.
|
|
||||||
* @param start Alignment start sought.
|
|
||||||
* @return Iterator for the matching SAMRecords.
|
|
||||||
*/
|
|
||||||
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
|
||||||
if (mStream == null) {
|
|
||||||
throw new IllegalStateException("File reader is closed");
|
|
||||||
}
|
|
||||||
if (mCurrentIterator != null) {
|
|
||||||
throw new IllegalStateException("Iteration in progress");
|
|
||||||
}
|
|
||||||
if (!mIsSeekable) {
|
|
||||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
|
||||||
}
|
|
||||||
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
|
||||||
return mCurrentIterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
|
||||||
if (mStream == null) {
|
|
||||||
throw new IllegalStateException("File reader is closed");
|
|
||||||
}
|
|
||||||
if (mCurrentIterator != null) {
|
|
||||||
throw new IllegalStateException("Iteration in progress");
|
|
||||||
}
|
|
||||||
if (!mIsSeekable) {
|
|
||||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
|
|
||||||
if (startOfLastLinearBin != -1) {
|
|
||||||
mInputStream.seek(startOfLastLinearBin);
|
|
||||||
} else {
|
|
||||||
// No mapped reads in file, just start at the first read in file.
|
|
||||||
mInputStream.seek(mFirstRecordPointer);
|
|
||||||
}
|
|
||||||
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
|
||||||
return mCurrentIterator;
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads the header from the file or stream
|
|
||||||
* @param source Note that this is used only for reporting errors.
|
|
||||||
*/
|
|
||||||
private void readHeader(final String source)
|
|
||||||
throws IOException {
|
|
||||||
|
|
||||||
final byte[] buffer = new byte[4];
|
|
||||||
mStream.readBytes(buffer);
|
|
||||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
|
||||||
throw new IOException("Invalid BAM file header");
|
|
||||||
}
|
|
||||||
|
|
||||||
final int headerTextLength = mStream.readInt();
|
|
||||||
final String textHeader = mStream.readString(headerTextLength);
|
|
||||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
|
||||||
headerCodec.setValidationStringency(mValidationStringency);
|
|
||||||
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
|
||||||
source);
|
|
||||||
|
|
||||||
final int sequenceCount = mStream.readInt();
|
|
||||||
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
|
||||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
|
||||||
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
|
||||||
throw new SAMFormatException("Number of sequences in text header (" +
|
|
||||||
mFileHeader.getSequenceDictionary().size() +
|
|
||||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
|
||||||
}
|
|
||||||
for (int i = 0; i < sequenceCount; i++) {
|
|
||||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
|
||||||
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
|
||||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
|
||||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
|
||||||
source);
|
|
||||||
}
|
|
||||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
|
||||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
|
||||||
source);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// If only binary sequences are present, copy them into mFileHeader
|
|
||||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
|
||||||
for (int i = 0; i < sequenceCount; i++) {
|
|
||||||
sequences.add(readSequenceRecord(source));
|
|
||||||
}
|
|
||||||
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a single binary sequence record from the file or stream
|
|
||||||
* @param source Note that this is used only for reporting errors.
|
|
||||||
*/
|
|
||||||
private SAMSequenceRecord readSequenceRecord(final String source) {
|
|
||||||
final int nameLength = mStream.readInt();
|
|
||||||
if (nameLength <= 1) {
|
|
||||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
|
||||||
}
|
|
||||||
final String sequenceName = mStream.readString(nameLength - 1);
|
|
||||||
// Skip the null terminator
|
|
||||||
mStream.readByte();
|
|
||||||
final int sequenceLength = mStream.readInt();
|
|
||||||
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
|
||||||
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
|
||||||
*/
|
|
||||||
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
|
||||||
private SAMRecord mNextRecord = null;
|
|
||||||
private final BAMRecordCodec bamRecordCodec;
|
|
||||||
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
|
||||||
|
|
||||||
BAMFileIterator() {
|
|
||||||
this(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param advance Trick to enable subclass to do more setup before advancing
|
|
||||||
*/
|
|
||||||
BAMFileIterator(final boolean advance) {
|
|
||||||
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
|
|
||||||
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
|
|
||||||
|
|
||||||
if (advance) {
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
if (mCurrentIterator != null && this != mCurrentIterator) {
|
|
||||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
|
||||||
}
|
|
||||||
mCurrentIterator = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return (mNextRecord != null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public SAMRecord next() {
|
|
||||||
final SAMRecord result = mNextRecord;
|
|
||||||
advance();
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Not supported: remove");
|
|
||||||
}
|
|
||||||
|
|
||||||
void advance() {
|
|
||||||
try {
|
|
||||||
mNextRecord = getNextRecord();
|
|
||||||
|
|
||||||
if (mNextRecord != null) {
|
|
||||||
++this.samRecordIndex;
|
|
||||||
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
|
||||||
mNextRecord.setValidationStringency(mValidationStringency);
|
|
||||||
|
|
||||||
if (mValidationStringency != ValidationStringency.SILENT) {
|
|
||||||
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
|
||||||
SAMUtils.processValidationErrors(validationErrors,
|
|
||||||
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (eagerDecode && mNextRecord != null) {
|
|
||||||
mNextRecord.eagerDecode();
|
|
||||||
}
|
|
||||||
} catch (IOException exc) {
|
|
||||||
throw new RuntimeException(exc.getMessage(), exc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read the next record from the input stream.
|
|
||||||
*/
|
|
||||||
SAMRecord getNextRecord() throws IOException {
|
|
||||||
final long startCoordinate = mInputStream.getFilePointer();
|
|
||||||
final SAMRecord next = bamRecordCodec.decode();
|
|
||||||
final long stopCoordinate = mInputStream.getFilePointer();
|
|
||||||
|
|
||||||
if(mFileReader != null && next != null)
|
|
||||||
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
|
|
||||||
|
|
||||||
return next;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return The record that will be return by the next call to next()
|
|
||||||
*/
|
|
||||||
protected SAMRecord peek() {
|
|
||||||
return mNextRecord;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to iterate through SAMRecords matching the target interval.
|
|
||||||
* @param sequence Desired reference sequence.
|
|
||||||
* @param start 1-based start of target interval, inclusive.
|
|
||||||
* @param end 1-based end of target interval, inclusive.
|
|
||||||
* @param queryType contained, overlapping, or starting-at query.
|
|
||||||
*/
|
|
||||||
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
|
||||||
final int start,
|
|
||||||
final int end,
|
|
||||||
final QueryType queryType) {
|
|
||||||
long[] filePointers = null;
|
|
||||||
|
|
||||||
// Hit the index to determine the chunk boundaries for the required data.
|
|
||||||
final SAMFileHeader fileHeader = getFileHeader();
|
|
||||||
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
|
||||||
if (referenceIndex != -1) {
|
|
||||||
final BAMIndex fileIndex = getIndex();
|
|
||||||
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
|
|
||||||
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create an iterator over the above chunk boundaries.
|
|
||||||
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
|
||||||
|
|
||||||
// Add some preprocessing filters for edge-case reads that don't fit into this
|
|
||||||
// query type.
|
|
||||||
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
|
||||||
}
|
|
||||||
|
|
||||||
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Look for BAM index file according to standard naming convention.
|
|
||||||
*
|
|
||||||
* @param dataFile BAM file name.
|
|
||||||
* @return Index file name, or null if not found.
|
|
||||||
*/
|
|
||||||
private static File findIndexFile(final File dataFile) {
|
|
||||||
// If input is foo.bam, look for foo.bai
|
|
||||||
final String bamExtension = ".bam";
|
|
||||||
File indexFile;
|
|
||||||
final String fileName = dataFile.getName();
|
|
||||||
if (fileName.endsWith(bamExtension)) {
|
|
||||||
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
|
|
||||||
indexFile = new File(dataFile.getParent(), bai);
|
|
||||||
if (indexFile.exists()) {
|
|
||||||
return indexFile;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If foo.bai doesn't exist look for foo.bam.bai
|
|
||||||
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
|
||||||
if (indexFile.exists()) {
|
|
||||||
return indexFile;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private class BAMFileIndexIterator extends BAMFileIterator {
|
|
||||||
|
|
||||||
private long[] mFilePointers = null;
|
|
||||||
private int mFilePointerIndex = 0;
|
|
||||||
private long mFilePointerLimit = -1;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
|
|
||||||
* @param filePointers the block / offset combination, stored in chunk format.
|
|
||||||
*/
|
|
||||||
BAMFileIndexIterator(final long[] filePointers) {
|
|
||||||
super(false); // delay advance() until after construction
|
|
||||||
mFilePointers = filePointers;
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
SAMRecord getNextRecord()
|
|
||||||
throws IOException {
|
|
||||||
// Advance to next file block if necessary
|
|
||||||
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
|
|
||||||
if (mFilePointers == null ||
|
|
||||||
mFilePointerIndex >= mFilePointers.length) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
final long startOffset = mFilePointers[mFilePointerIndex++];
|
|
||||||
final long endOffset = mFilePointers[mFilePointerIndex++];
|
|
||||||
mInputStream.seek(startOffset);
|
|
||||||
mFilePointerLimit = endOffset;
|
|
||||||
}
|
|
||||||
// Pull next record from stream
|
|
||||||
return super.getNextRecord();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A decorating iterator that filters out records that are outside the bounds of the
|
|
||||||
* given query parameters.
|
|
||||||
*/
|
|
||||||
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
|
||||||
/**
|
|
||||||
* The wrapped iterator.
|
|
||||||
*/
|
|
||||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The next record to be returned. Will be null if no such record exists.
|
|
||||||
*/
|
|
||||||
private SAMRecord mNextRecord;
|
|
||||||
|
|
||||||
private final int mReferenceIndex;
|
|
||||||
private final int mRegionStart;
|
|
||||||
private final int mRegionEnd;
|
|
||||||
private final QueryType mQueryType;
|
|
||||||
|
|
||||||
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
|
||||||
this.wrappedIterator = iterator;
|
|
||||||
final SAMFileHeader fileHeader = getFileHeader();
|
|
||||||
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
|
||||||
mRegionStart = start;
|
|
||||||
if (queryType == QueryType.STARTING_AT) {
|
|
||||||
mRegionEnd = mRegionStart;
|
|
||||||
} else {
|
|
||||||
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
|
||||||
}
|
|
||||||
mQueryType = queryType;
|
|
||||||
mNextRecord = advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns true if a next element exists; false otherwise.
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
return mNextRecord != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the next record from the given iterator.
|
|
||||||
* @return The next SAM record in the iterator.
|
|
||||||
*/
|
|
||||||
public SAMRecord next() {
|
|
||||||
if(!hasNext())
|
|
||||||
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
|
||||||
final SAMRecord currentRead = mNextRecord;
|
|
||||||
mNextRecord = advance();
|
|
||||||
return currentRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Closes down the existing iterator.
|
|
||||||
*/
|
|
||||||
public void close() {
|
|
||||||
if (this != mCurrentIterator) {
|
|
||||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
|
||||||
}
|
|
||||||
mCurrentIterator = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @throws UnsupportedOperationException always.
|
|
||||||
*/
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Not supported: remove");
|
|
||||||
}
|
|
||||||
|
|
||||||
SAMRecord advance() {
|
|
||||||
while (true) {
|
|
||||||
// Pull next record from stream
|
|
||||||
if(!wrappedIterator.hasNext())
|
|
||||||
return null;
|
|
||||||
|
|
||||||
final SAMRecord record = wrappedIterator.next();
|
|
||||||
// If beyond the end of this reference sequence, end iteration
|
|
||||||
final int referenceIndex = record.getReferenceIndex();
|
|
||||||
if (referenceIndex != mReferenceIndex) {
|
|
||||||
if (referenceIndex < 0 ||
|
|
||||||
referenceIndex > mReferenceIndex) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// If before this reference sequence, continue
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
|
||||||
// Quick exit to avoid expensive alignment end calculation
|
|
||||||
return record;
|
|
||||||
}
|
|
||||||
final int alignmentStart = record.getAlignmentStart();
|
|
||||||
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
|
||||||
// the query region, regardless of whether the mapped mate will be returned.
|
|
||||||
final int alignmentEnd;
|
|
||||||
if (mQueryType == QueryType.STARTING_AT) {
|
|
||||||
alignmentEnd = -1;
|
|
||||||
} else {
|
|
||||||
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
|
||||||
record.getAlignmentEnd(): alignmentStart);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (alignmentStart > mRegionEnd) {
|
|
||||||
// If scanned beyond target region, end iteration
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// Filter for overlap with region
|
|
||||||
if (mQueryType == QueryType.CONTAINED) {
|
|
||||||
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
|
||||||
return record;
|
|
||||||
}
|
|
||||||
} else if (mQueryType == QueryType.OVERLAPPING) {
|
|
||||||
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
|
||||||
return record;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (alignmentStart == mRegionStart) {
|
|
||||||
return record;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
|
||||||
private BAMFileIndexUnmappedIterator() {
|
|
||||||
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -40,6 +40,10 @@ public class GATKChunk extends Chunk {
|
||||||
super(start,stop);
|
super(start,stop);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) {
|
||||||
|
super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd);
|
||||||
|
}
|
||||||
|
|
||||||
public GATKChunk(final Chunk chunk) {
|
public GATKChunk(final Chunk chunk) {
|
||||||
super(chunk.getChunkStart(),chunk.getChunkEnd());
|
super(chunk.getChunkStart(),chunk.getChunkEnd());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package net.sf.samtools;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utils that insist on being in the same package as Picard.
|
||||||
|
*/
|
||||||
|
public class PicardNamespaceUtils {
|
||||||
|
/**
|
||||||
|
* Private constructor only. Do not instantiate.
|
||||||
|
*/
|
||||||
|
private PicardNamespaceUtils() {}
|
||||||
|
|
||||||
|
public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) {
|
||||||
|
read.setFileSource(fileSource);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,72 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package net.sf.samtools.util;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An input stream formulated for use reading BAM files. Supports
|
|
||||||
*/
|
|
||||||
public interface BAMInputStream {
|
|
||||||
/**
|
|
||||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
|
||||||
* not an actual byte offset.
|
|
||||||
*
|
|
||||||
* @param pos virtual file pointer
|
|
||||||
*/
|
|
||||||
public void seek(final long pos) throws IOException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
|
||||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
|
||||||
* the two.
|
|
||||||
*/
|
|
||||||
public long getFilePointer();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
|
|
||||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
|
||||||
* operation and should be used accordingly.
|
|
||||||
*/
|
|
||||||
public void setCheckCrcs(final boolean check);
|
|
||||||
|
|
||||||
public int read() throws java.io.IOException;
|
|
||||||
|
|
||||||
public int read(byte[] bytes) throws java.io.IOException;
|
|
||||||
|
|
||||||
public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
|
|
||||||
|
|
||||||
public long skip(long l) throws java.io.IOException;
|
|
||||||
|
|
||||||
public int available() throws java.io.IOException;
|
|
||||||
|
|
||||||
public void close() throws java.io.IOException;
|
|
||||||
|
|
||||||
public void mark(int i);
|
|
||||||
|
|
||||||
public void reset() throws java.io.IOException;
|
|
||||||
|
|
||||||
public boolean markSupported();
|
|
||||||
}
|
|
||||||
|
|
@ -1,483 +0,0 @@
|
||||||
/*
|
|
||||||
* The MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2009 The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in
|
|
||||||
* all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
||||||
* THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
package net.sf.samtools.util;
|
|
||||||
|
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.RandomAccessFile;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.ByteOrder;
|
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
import net.sf.samtools.FileTruncatedException;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
|
|
||||||
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
|
|
||||||
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
|
|
||||||
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
|
|
||||||
*
|
|
||||||
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
|
|
||||||
*/
|
|
||||||
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
|
|
||||||
private InputStream mStream = null;
|
|
||||||
private SeekableStream mFile = null;
|
|
||||||
private byte[] mFileBuffer = null;
|
|
||||||
private byte[] mCurrentBlock = null;
|
|
||||||
private int mCurrentOffset = 0;
|
|
||||||
private long mBlockAddress = 0;
|
|
||||||
private int mLastBlockLength = 0;
|
|
||||||
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Note that seek() is not supported if this ctor is used.
|
|
||||||
*/
|
|
||||||
public BlockCompressedInputStream(final InputStream stream) {
|
|
||||||
mStream = IOUtil.toBufferedStream(stream);
|
|
||||||
mFile = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Use this ctor if you wish to call seek()
|
|
||||||
*/
|
|
||||||
public BlockCompressedInputStream(final File file)
|
|
||||||
throws IOException {
|
|
||||||
mFile = new SeekableFileStream(file);
|
|
||||||
mStream = null;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public BlockCompressedInputStream(final URL url) {
|
|
||||||
mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
|
|
||||||
mStream = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For providing some arbitrary data source. No additional buffering is
|
|
||||||
* provided, so if the underlying source is not buffered, wrap it in a
|
|
||||||
* SeekableBufferedStream before passing to this ctor.
|
|
||||||
*/
|
|
||||||
public BlockCompressedInputStream(final SeekableStream strm) {
|
|
||||||
mFile = strm;
|
|
||||||
mStream = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
|
|
||||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
|
||||||
* operation and should be used accordingly.
|
|
||||||
*/
|
|
||||||
public void setCheckCrcs(final boolean check) {
|
|
||||||
this.blockGunzipper.setCheckCrcs(check);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
|
|
||||||
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
|
|
||||||
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
|
|
||||||
* may block in order to fill an internal buffer if it has been exhausted.
|
|
||||||
*/
|
|
||||||
public int available()
|
|
||||||
throws IOException {
|
|
||||||
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
|
|
||||||
readBlock();
|
|
||||||
}
|
|
||||||
if (mCurrentBlock == null) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return mCurrentBlock.length - mCurrentOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Closes the underlying InputStream or RandomAccessFile
|
|
||||||
*/
|
|
||||||
public void close()
|
|
||||||
throws IOException {
|
|
||||||
if (mFile != null) {
|
|
||||||
mFile.close();
|
|
||||||
mFile = null;
|
|
||||||
} else if (mStream != null) {
|
|
||||||
mStream.close();
|
|
||||||
mStream = null;
|
|
||||||
}
|
|
||||||
// Encourage garbage collection
|
|
||||||
mFileBuffer = null;
|
|
||||||
mCurrentBlock = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
|
|
||||||
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
|
|
||||||
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
|
|
||||||
|
|
||||||
* @return the next byte of data, or -1 if the end of the stream is reached.
|
|
||||||
*/
|
|
||||||
public int read()
|
|
||||||
throws IOException {
|
|
||||||
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
|
|
||||||
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
|
|
||||||
* or an exception is thrown.
|
|
||||||
*
|
|
||||||
* read(buf) has the same effect as read(buf, 0, buf.length).
|
|
||||||
*
|
|
||||||
* @param buffer the buffer into which the data is read.
|
|
||||||
* @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
|
|
||||||
* the stream has been reached.
|
|
||||||
*/
|
|
||||||
public int read(final byte[] buffer)
|
|
||||||
throws IOException {
|
|
||||||
return read(buffer, 0, buffer.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
private volatile ByteArrayOutputStream buf = null;
|
|
||||||
private static final byte eol = '\n';
|
|
||||||
private static final byte eolCr = '\r';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
|
|
||||||
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
|
|
||||||
*
|
|
||||||
* @return A String containing the contents of the line, excluding the line terminating
|
|
||||||
* character, or null if the end of the stream has been reached
|
|
||||||
*
|
|
||||||
* @exception IOException If an I/O error occurs
|
|
||||||
* @
|
|
||||||
*/
|
|
||||||
public String readLine() throws IOException {
|
|
||||||
int available = available();
|
|
||||||
if (available == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if(null == buf){ // lazy initialisation
|
|
||||||
buf = new ByteArrayOutputStream(8192);
|
|
||||||
}
|
|
||||||
buf.reset();
|
|
||||||
boolean done = false;
|
|
||||||
boolean foundCr = false; // \r found flag
|
|
||||||
while (!done) {
|
|
||||||
int linetmpPos = mCurrentOffset;
|
|
||||||
int bCnt = 0;
|
|
||||||
while((available-- > 0)){
|
|
||||||
final byte c = mCurrentBlock[linetmpPos++];
|
|
||||||
if(c == eol){ // found \n
|
|
||||||
done = true;
|
|
||||||
break;
|
|
||||||
} else if(foundCr){ // previous char was \r
|
|
||||||
--linetmpPos; // current char is not \n so put it back
|
|
||||||
done = true;
|
|
||||||
break;
|
|
||||||
} else if(c == eolCr){ // found \r
|
|
||||||
foundCr = true;
|
|
||||||
continue; // no ++bCnt
|
|
||||||
}
|
|
||||||
++bCnt;
|
|
||||||
}
|
|
||||||
if(mCurrentOffset < linetmpPos){
|
|
||||||
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
|
|
||||||
mCurrentOffset = linetmpPos;
|
|
||||||
}
|
|
||||||
available = available();
|
|
||||||
if(available == 0){
|
|
||||||
// EOF
|
|
||||||
done = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return buf.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
|
|
||||||
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
|
|
||||||
*
|
|
||||||
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
|
|
||||||
*
|
|
||||||
* @param buffer buffer into which data is read.
|
|
||||||
* @param offset the start offset in array b at which the data is written.
|
|
||||||
* @param length the maximum number of bytes to read.
|
|
||||||
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
|
||||||
* the stream has been reached.
|
|
||||||
*/
|
|
||||||
public int read(final byte[] buffer, int offset, int length)
|
|
||||||
throws IOException {
|
|
||||||
final int originalLength = length;
|
|
||||||
while (length > 0) {
|
|
||||||
final int available = available();
|
|
||||||
if (available == 0) {
|
|
||||||
// Signal EOF to caller
|
|
||||||
if (originalLength == length) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
final int copyLength = Math.min(length, available);
|
|
||||||
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
|
|
||||||
mCurrentOffset += copyLength;
|
|
||||||
offset += copyLength;
|
|
||||||
length -= copyLength;
|
|
||||||
}
|
|
||||||
return originalLength - length;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
|
||||||
* not an actual byte offset.
|
|
||||||
*
|
|
||||||
* @param pos virtual file pointer
|
|
||||||
*/
|
|
||||||
public void seek(final long pos)
|
|
||||||
throws IOException {
|
|
||||||
if (mFile == null) {
|
|
||||||
throw new IOException("Cannot seek on stream based file");
|
|
||||||
}
|
|
||||||
// Decode virtual file pointer
|
|
||||||
// Upper 48 bits is the byte offset into the compressed stream of a block.
|
|
||||||
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
|
|
||||||
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
|
|
||||||
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
|
|
||||||
final int available;
|
|
||||||
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
|
|
||||||
available = mCurrentBlock.length;
|
|
||||||
} else {
|
|
||||||
mFile.seek(compressedOffset);
|
|
||||||
mBlockAddress = compressedOffset;
|
|
||||||
mLastBlockLength = 0;
|
|
||||||
readBlock();
|
|
||||||
available = available();
|
|
||||||
}
|
|
||||||
if (uncompressedOffset > available ||
|
|
||||||
(uncompressedOffset == available && !eof())) {
|
|
||||||
throw new IOException("Invalid file pointer: " + pos);
|
|
||||||
}
|
|
||||||
mCurrentOffset = uncompressedOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean eof() throws IOException {
|
|
||||||
if (mFile.eof()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
|
|
||||||
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
|
||||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
|
||||||
* the two.
|
|
||||||
*/
|
|
||||||
public long getFilePointer() {
|
|
||||||
if (mCurrentOffset == mCurrentBlock.length) {
|
|
||||||
// If current offset is at the end of the current block, file pointer should point
|
|
||||||
// to the beginning of the next block.
|
|
||||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
|
|
||||||
}
|
|
||||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static long getFileBlock(final long bgzfOffset) {
|
|
||||||
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
|
|
||||||
* @return true if the given file looks like a valid BGZF file.
|
|
||||||
*/
|
|
||||||
public static boolean isValidFile(final InputStream stream)
|
|
||||||
throws IOException {
|
|
||||||
if (!stream.markSupported()) {
|
|
||||||
throw new RuntimeException("Cannot test non-buffered stream");
|
|
||||||
}
|
|
||||||
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
|
||||||
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
|
||||||
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
|
||||||
stream.reset();
|
|
||||||
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isValidBlockHeader(final byte[] buffer) {
|
|
||||||
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
|
|
||||||
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
|
|
||||||
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
|
|
||||||
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
|
|
||||||
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
|
|
||||||
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readBlock()
|
|
||||||
throws IOException {
|
|
||||||
|
|
||||||
if (mFileBuffer == null) {
|
|
||||||
mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
|
|
||||||
}
|
|
||||||
int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
|
||||||
if (count == 0) {
|
|
||||||
// Handle case where there is no empty gzip block at end.
|
|
||||||
mCurrentOffset = 0;
|
|
||||||
mBlockAddress += mLastBlockLength;
|
|
||||||
mCurrentBlock = new byte[0];
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
|
|
||||||
throw new IOException("Premature end of file");
|
|
||||||
}
|
|
||||||
final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
|
|
||||||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
|
|
||||||
throw new IOException("Unexpected compressed block length: " + blockLength);
|
|
||||||
}
|
|
||||||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
|
|
||||||
count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
|
|
||||||
if (count != remaining) {
|
|
||||||
throw new FileTruncatedException("Premature end of file");
|
|
||||||
}
|
|
||||||
inflateBlock(mFileBuffer, blockLength);
|
|
||||||
mCurrentOffset = 0;
|
|
||||||
mBlockAddress += mLastBlockLength;
|
|
||||||
mLastBlockLength = blockLength;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
|
|
||||||
throws IOException {
|
|
||||||
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
|
|
||||||
byte[] buffer = mCurrentBlock;
|
|
||||||
mCurrentBlock = null;
|
|
||||||
if (buffer == null || buffer.length != uncompressedLength) {
|
|
||||||
try {
|
|
||||||
buffer = new byte[uncompressedLength];
|
|
||||||
} catch (NegativeArraySizeException e) {
|
|
||||||
throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
|
|
||||||
mCurrentBlock = buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int readBytes(final byte[] buffer, final int offset, final int length)
|
|
||||||
throws IOException {
|
|
||||||
if (mFile != null) {
|
|
||||||
return readBytes(mFile, buffer, offset, length);
|
|
||||||
} else if (mStream != null) {
|
|
||||||
return readBytes(mStream, buffer, offset, length);
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
|
|
||||||
throws IOException {
|
|
||||||
int bytesRead = 0;
|
|
||||||
while (bytesRead < length) {
|
|
||||||
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
|
|
||||||
if (count <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
bytesRead += count;
|
|
||||||
}
|
|
||||||
return bytesRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
|
|
||||||
throws IOException {
|
|
||||||
int bytesRead = 0;
|
|
||||||
while (bytesRead < length) {
|
|
||||||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
|
|
||||||
if (count <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
bytesRead += count;
|
|
||||||
}
|
|
||||||
return bytesRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int unpackInt16(final byte[] buffer, final int offset) {
|
|
||||||
return ((buffer[offset] & 0xFF) |
|
|
||||||
((buffer[offset+1] & 0xFF) << 8));
|
|
||||||
}
|
|
||||||
|
|
||||||
private int unpackInt32(final byte[] buffer, final int offset) {
|
|
||||||
return ((buffer[offset] & 0xFF) |
|
|
||||||
((buffer[offset+1] & 0xFF) << 8) |
|
|
||||||
((buffer[offset+2] & 0xFF) << 16) |
|
|
||||||
((buffer[offset+3] & 0xFF) << 24));
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
|
|
||||||
|
|
||||||
public static FileTermination checkTermination(final File file)
|
|
||||||
throws IOException {
|
|
||||||
final long fileSize = file.length();
|
|
||||||
if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
|
|
||||||
return FileTermination.DEFECTIVE;
|
|
||||||
}
|
|
||||||
final RandomAccessFile raFile = new RandomAccessFile(file, "r");
|
|
||||||
try {
|
|
||||||
raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
|
||||||
byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
|
|
||||||
raFile.readFully(buf);
|
|
||||||
if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
|
|
||||||
return FileTermination.HAS_TERMINATOR_BLOCK;
|
|
||||||
}
|
|
||||||
final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
|
|
||||||
buf = new byte[bufsize];
|
|
||||||
raFile.seek(fileSize - bufsize);
|
|
||||||
raFile.read(buf);
|
|
||||||
for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
|
|
||||||
i >= 0; --i) {
|
|
||||||
if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
|
|
||||||
buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
|
|
||||||
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
|
|
||||||
if (buf.length - i == totalBlockSizeMinusOne + 1) {
|
|
||||||
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
|
|
||||||
} else {
|
|
||||||
return FileTermination.DEFECTIVE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return FileTermination.DEFECTIVE;
|
|
||||||
} finally {
|
|
||||||
raFile.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
|
|
||||||
for (int i = 0; i < length; ++i) {
|
|
||||||
if (preamble[i] != buf[i + startOffset]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,9 +25,14 @@ import java.util.NoSuchElementException;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A queue of locus context entries.
|
* The two goals of the LocusView are as follows:
|
||||||
|
* 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch
|
||||||
|
* between iterating over all bases in a region, only covered bases in a region covered by
|
||||||
|
* reads, only bases in a region covered by RODs, or any other sort of trigger track
|
||||||
|
* implementation one can think of.
|
||||||
|
* 2) To manage the copious number of iterators that have to be jointly pulled through the
|
||||||
|
* genome to make a locus traversal function.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public abstract class LocusView extends LocusIterator implements View {
|
public abstract class LocusView extends LocusIterator implements View {
|
||||||
/**
|
/**
|
||||||
* The locus bounding this view.
|
* The locus bounding this view.
|
||||||
|
|
|
||||||
|
|
@ -27,8 +27,10 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
import net.sf.picard.util.PeekableIterator;
|
import net.sf.picard.util.PeekableIterator;
|
||||||
import net.sf.samtools.GATKBAMFileSpan;
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
import net.sf.samtools.GATKChunk;
|
import net.sf.samtools.GATKChunk;
|
||||||
|
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -38,7 +40,7 @@ import java.util.List;
|
||||||
* Time: 10:47 PM
|
* Time: 10:47 PM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
class SAMReaderPosition {
|
class BAMAccessPlan {
|
||||||
private final SAMReaderID reader;
|
private final SAMReaderID reader;
|
||||||
private final BlockInputStream inputStream;
|
private final BlockInputStream inputStream;
|
||||||
|
|
||||||
|
|
@ -51,7 +53,7 @@ class SAMReaderPosition {
|
||||||
private long nextBlockAddress;
|
private long nextBlockAddress;
|
||||||
|
|
||||||
|
|
||||||
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||||
this.reader = reader;
|
this.reader = reader;
|
||||||
this.inputStream = inputStream;
|
this.inputStream = inputStream;
|
||||||
|
|
||||||
|
|
@ -84,11 +86,45 @@ class SAMReaderPosition {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the last offset of interest in the block returned by getBlockAddress().
|
* Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer.
|
||||||
* @return First block of interest in this segment.
|
* @param blockAddress Block address for which to search.
|
||||||
|
* @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span.
|
||||||
|
* @return list of chunks containing that block.
|
||||||
*/
|
*/
|
||||||
public int getLastOffsetInBlock() {
|
public List<GATKChunk> getSpansOverlappingBlock(long blockAddress, long filePosition) {
|
||||||
return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536;
|
List<GATKChunk> spansOverlapping = new LinkedList<GATKChunk>();
|
||||||
|
// While the position iterator overlaps the given block, pull out spans to report.
|
||||||
|
while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) {
|
||||||
|
// Create a span over as much of the block as is covered by this chunk.
|
||||||
|
int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0;
|
||||||
|
|
||||||
|
// Calculate the end of this span. If the span extends past this block, cap it using the current file position.
|
||||||
|
long blockEnd;
|
||||||
|
int blockOffsetEnd;
|
||||||
|
if(blockAddress < positionIterator.peek().getBlockEnd()) {
|
||||||
|
blockEnd = filePosition;
|
||||||
|
blockOffsetEnd = 0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
blockEnd = positionIterator.peek().getBlockEnd();
|
||||||
|
blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd();
|
||||||
|
}
|
||||||
|
|
||||||
|
GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd);
|
||||||
|
|
||||||
|
if(newChunk.getChunkStart() <= newChunk.getChunkEnd())
|
||||||
|
spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd));
|
||||||
|
|
||||||
|
// If the value currently stored in the position iterator ends past the current block, we must be done. Abort.
|
||||||
|
if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// If the position iterator ends before the block ends, pull the position iterator forward.
|
||||||
|
if(positionIterator.peek().getBlockEnd() <= blockAddress)
|
||||||
|
positionIterator.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
return spansOverlapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reset() {
|
public void reset() {
|
||||||
|
|
@ -111,20 +147,16 @@ class SAMReaderPosition {
|
||||||
* @param filePosition The current position within the file.
|
* @param filePosition The current position within the file.
|
||||||
*/
|
*/
|
||||||
void advancePosition(final long filePosition) {
|
void advancePosition(final long filePosition) {
|
||||||
nextBlockAddress = filePosition >> 16;
|
nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition);
|
||||||
|
|
||||||
// Check the current file position against the iterator; if the iterator is before the current file position,
|
// Check the current file position against the iterator; if the iterator is before the current file position,
|
||||||
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
||||||
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
|
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek()))
|
||||||
positionIterator.next();
|
positionIterator.next();
|
||||||
|
|
||||||
// If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
|
// If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
|
||||||
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) {
|
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart())
|
||||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||||
//System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we've shot off the end of the block pointer, notify consumers that iteration is complete.
|
// If we've shot off the end of the block pointer, notify consumers that iteration is complete.
|
||||||
if(!positionIterator.hasNext())
|
if(!positionIterator.hasNext())
|
||||||
|
|
@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher {
|
||||||
|
|
||||||
private final ExecutorService threadPool;
|
private final ExecutorService threadPool;
|
||||||
|
|
||||||
private final Queue<SAMReaderPosition> inputQueue;
|
private final Queue<BAMAccessPlan> inputQueue;
|
||||||
|
|
||||||
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
||||||
threadPool = Executors.newFixedThreadPool(numThreads);
|
threadPool = Executors.newFixedThreadPool(numThreads);
|
||||||
fileHandleCache = new FileHandleCache(numFileHandles);
|
fileHandleCache = new FileHandleCache(numFileHandles);
|
||||||
inputQueue = new LinkedList<SAMReaderPosition>();
|
inputQueue = new LinkedList<BAMAccessPlan>();
|
||||||
|
|
||||||
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
||||||
}
|
}
|
||||||
|
|
@ -58,7 +58,7 @@ public class BGZFBlockLoadingDispatcher {
|
||||||
* Initiates a request for a new block load.
|
* Initiates a request for a new block load.
|
||||||
* @param readerPosition Position at which to load.
|
* @param readerPosition Position at which to load.
|
||||||
*/
|
*/
|
||||||
void queueBlockLoad(final SAMReaderPosition readerPosition) {
|
void queueBlockLoad(final BAMAccessPlan readerPosition) {
|
||||||
synchronized(inputQueue) {
|
synchronized(inputQueue) {
|
||||||
inputQueue.add(readerPosition);
|
inputQueue.add(readerPosition);
|
||||||
inputQueue.notify();
|
inputQueue.notify();
|
||||||
|
|
@ -69,7 +69,7 @@ public class BGZFBlockLoadingDispatcher {
|
||||||
* Claims the next work request from the queue.
|
* Claims the next work request from the queue.
|
||||||
* @return The next work request, or null if none is available.
|
* @return The next work request, or null if none is available.
|
||||||
*/
|
*/
|
||||||
SAMReaderPosition claimNextWorkRequest() {
|
BAMAccessPlan claimNextWorkRequest() {
|
||||||
synchronized(inputQueue) {
|
synchronized(inputQueue) {
|
||||||
while(inputQueue.isEmpty()) {
|
while(inputQueue.isEmpty()) {
|
||||||
try {
|
try {
|
||||||
|
|
|
||||||
|
|
@ -26,24 +26,21 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
import net.sf.samtools.GATKBAMFileSpan;
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
import net.sf.samtools.GATKChunk;
|
import net.sf.samtools.GATKChunk;
|
||||||
import net.sf.samtools.util.BAMInputStream;
|
|
||||||
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
|
||||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||||
import net.sf.samtools.util.RuntimeEOFException;
|
|
||||||
import net.sf.samtools.util.SeekableStream;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.ByteOrder;
|
import java.nio.ByteOrder;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Presents decompressed blocks to the SAMFileReader.
|
* Presents decompressed blocks to the SAMFileReader.
|
||||||
*/
|
*/
|
||||||
public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
public class BlockInputStream extends InputStream {
|
||||||
/**
|
/**
|
||||||
* Mechanism for triggering block loads.
|
* Mechanism for triggering block loads.
|
||||||
*/
|
*/
|
||||||
|
|
@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
private Throwable error;
|
private Throwable error;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Current position.
|
* Current accessPlan.
|
||||||
*/
|
*/
|
||||||
private SAMReaderPosition position;
|
private BAMAccessPlan accessPlan;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A stream of compressed data blocks.
|
* A stream of compressed data blocks.
|
||||||
|
|
@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
*/
|
*/
|
||||||
private final BlockCompressedInputStream validatingInputStream;
|
private final BlockCompressedInputStream validatingInputStream;
|
||||||
|
|
||||||
/**
|
|
||||||
* Has the buffer been filled since last request?
|
|
||||||
*/
|
|
||||||
private boolean bufferFilled = false;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new block presenting input stream with a dedicated buffer.
|
* Create a new block presenting input stream with a dedicated buffer.
|
||||||
* @param dispatcher the block loading messenger.
|
* @param dispatcher the block loading messenger.
|
||||||
|
|
@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
|
|
||||||
this.dispatcher = dispatcher;
|
this.dispatcher = dispatcher;
|
||||||
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
|
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
|
||||||
this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
||||||
|
|
||||||
// The block offsets / block positions guarantee that the ending offset/position in the data structure maps to
|
// The block offsets / block positions guarantee that the ending offset/position in the data structure maps to
|
||||||
// the point in the file just following the last read. These two arrays should never be empty; initializing
|
// the point in the file just following the last read. These two arrays should never be empty; initializing
|
||||||
|
|
@ -151,7 +143,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
synchronized(lock) {
|
synchronized(lock) {
|
||||||
// Find the current block within the input stream.
|
// Find the current block within the input stream.
|
||||||
int blockIndex;
|
int blockIndex;
|
||||||
for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++)
|
for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++)
|
||||||
;
|
;
|
||||||
filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex));
|
filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex));
|
||||||
}
|
}
|
||||||
|
|
@ -164,51 +156,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
return filePointer;
|
return filePointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void seek(long target) {
|
|
||||||
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
|
|
||||||
synchronized(lock) {
|
|
||||||
clearBuffers();
|
|
||||||
|
|
||||||
// Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified.
|
|
||||||
position.advancePosition(target);
|
|
||||||
|
|
||||||
// If the position advances past the end of the target, that must mean that we seeked to a point at the end
|
|
||||||
// of one of the chunk list's subregions. Make a note of our current position and punt on loading any data.
|
|
||||||
if(target < position.getBlockAddress() << 16) {
|
|
||||||
blockOffsets.clear();
|
|
||||||
blockOffsets.add(0);
|
|
||||||
blockPositions.clear();
|
|
||||||
blockPositions.add(target);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
waitForBufferFill();
|
|
||||||
// A buffer fill will load the relevant data from the shard, but the buffer position still needs to be
|
|
||||||
// advanced as appropriate.
|
|
||||||
Iterator<Integer> blockOffsetIterator = blockOffsets.descendingIterator();
|
|
||||||
Iterator<Long> blockPositionIterator = blockPositions.descendingIterator();
|
|
||||||
while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) {
|
|
||||||
final int blockOffset = blockOffsetIterator.next();
|
|
||||||
final long blockPosition = blockPositionIterator.next();
|
|
||||||
if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) {
|
|
||||||
buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(validatingInputStream != null) {
|
|
||||||
try {
|
|
||||||
validatingInputStream.seek(target);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void clearBuffers() {
|
private void clearBuffers() {
|
||||||
this.position.reset();
|
this.accessPlan.reset();
|
||||||
|
|
||||||
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
||||||
// Indicate no data to be read.
|
// Indicate no data to be read.
|
||||||
|
|
@ -225,29 +174,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
public boolean eof() {
|
public boolean eof() {
|
||||||
synchronized(lock) {
|
synchronized(lock) {
|
||||||
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
||||||
return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length);
|
return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setCheckCrcs(final boolean check) {
|
|
||||||
// TODO: Implement
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Submits a new access plan for the given dataset.
|
* Submits a new access plan for the given dataset and seeks to the given point.
|
||||||
* @param position The next seek point for BAM data in this reader.
|
* @param accessPlan The next seek point for BAM data in this reader.
|
||||||
*/
|
*/
|
||||||
public void submitAccessPlan(final SAMReaderPosition position) {
|
public void submitAccessPlan(final BAMAccessPlan accessPlan) {
|
||||||
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
||||||
synchronized(lock) {
|
this.accessPlan = accessPlan;
|
||||||
// Assume that the access plan is going to tell us to start where we are and move forward.
|
accessPlan.reset();
|
||||||
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
|
|
||||||
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
|
clearBuffers();
|
||||||
position.advancePosition(this.position.getBlockAddress() << 16);
|
|
||||||
|
// Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc).
|
||||||
|
// TODO: Don't pass these empty chunks in.
|
||||||
|
accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0));
|
||||||
|
|
||||||
|
if(accessPlan.getBlockAddress() >= 0) {
|
||||||
|
waitForBufferFill();
|
||||||
}
|
}
|
||||||
this.position = position;
|
|
||||||
|
if(validatingInputStream != null) {
|
||||||
|
try {
|
||||||
|
validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0));
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void compactBuffer() {
|
private void compactBuffer() {
|
||||||
// Compact buffer to maximize storage space.
|
// Compact buffer to maximize storage space.
|
||||||
int bytesToRemove = 0;
|
int bytesToRemove = 0;
|
||||||
|
|
@ -286,27 +247,14 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
* Push contents of incomingBuffer into the end of this buffer.
|
* Push contents of incomingBuffer into the end of this buffer.
|
||||||
* MUST be called from a thread that is NOT the reader thread.
|
* MUST be called from a thread that is NOT the reader thread.
|
||||||
* @param incomingBuffer The data being pushed into this input stream.
|
* @param incomingBuffer The data being pushed into this input stream.
|
||||||
* @param position target position for the data.
|
* @param accessPlan target access plan for the data.
|
||||||
* @param filePosition the current position of the file pointer
|
* @param filePosition the current position of the file pointer
|
||||||
*/
|
*/
|
||||||
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
|
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) {
|
||||||
synchronized(lock) {
|
synchronized(lock) {
|
||||||
try {
|
try {
|
||||||
compactBuffer();
|
|
||||||
// Open up the buffer for more reading.
|
|
||||||
buffer.limit(buffer.capacity());
|
|
||||||
|
|
||||||
// Advance the position to take the most recent read into account.
|
|
||||||
final long lastBlockAddress = position.getBlockAddress();
|
|
||||||
final int blockOffsetStart = position.getFirstOffsetInBlock();
|
|
||||||
final int blockOffsetEnd = position.getLastOffsetInBlock();
|
|
||||||
|
|
||||||
// Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block.
|
|
||||||
final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16;
|
|
||||||
|
|
||||||
byte[] validBytes = null;
|
|
||||||
if(validatingInputStream != null) {
|
if(validatingInputStream != null) {
|
||||||
validBytes = new byte[incomingBuffer.remaining()];
|
byte[] validBytes = new byte[incomingBuffer.remaining()];
|
||||||
|
|
||||||
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
||||||
int pos = incomingBuffer.position();
|
int pos = incomingBuffer.position();
|
||||||
|
|
@ -317,7 +265,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
incomingBuffer.position(pos);
|
incomingBuffer.position(pos);
|
||||||
|
|
||||||
long currentFilePointer = validatingInputStream.getFilePointer();
|
long currentFilePointer = validatingInputStream.getFilePointer();
|
||||||
validatingInputStream.seek(lastBlockAddress << 16);
|
validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0));
|
||||||
validatingInputStream.read(validBytes);
|
validatingInputStream.read(validBytes);
|
||||||
validatingInputStream.seek(currentFilePointer);
|
validatingInputStream.seek(currentFilePointer);
|
||||||
|
|
||||||
|
|
@ -325,33 +273,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
||||||
}
|
}
|
||||||
|
|
||||||
this.position = position;
|
compactBuffer();
|
||||||
position.advancePosition(filePosition << 16);
|
// Open up the buffer for more reading.
|
||||||
|
buffer.limit(buffer.capacity());
|
||||||
|
|
||||||
if(buffer.remaining() < incomingBuffer.remaining()) {
|
// Get the spans overlapping this particular block...
|
||||||
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
|
List<GATKChunk> spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition);
|
||||||
|
|
||||||
|
// ...and advance the block
|
||||||
|
this.accessPlan = accessPlan;
|
||||||
|
accessPlan.advancePosition(makeFilePointer(filePosition, 0));
|
||||||
|
|
||||||
|
if(buffer.remaining() < incomingBuffer.remaining())
|
||||||
lock.wait();
|
lock.wait();
|
||||||
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
|
|
||||||
|
final int bytesInIncomingBuffer = incomingBuffer.limit();
|
||||||
|
|
||||||
|
for(GATKChunk spanOverlapping: spansOverlapping) {
|
||||||
|
// Clear out the endcap tracking state and add in the starting position for this transfer.
|
||||||
|
blockOffsets.removeLast();
|
||||||
|
blockOffsets.add(buffer.position());
|
||||||
|
blockPositions.removeLast();
|
||||||
|
blockPositions.add(spanOverlapping.getChunkStart());
|
||||||
|
|
||||||
|
// Stream the buffer into the data stream.
|
||||||
|
incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd());
|
||||||
|
incomingBuffer.position(spanOverlapping.getBlockOffsetStart());
|
||||||
|
buffer.put(incomingBuffer);
|
||||||
|
|
||||||
|
// Add the endcap for this transfer.
|
||||||
|
blockOffsets.add(buffer.position());
|
||||||
|
blockPositions.add(spanOverlapping.getChunkEnd());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove the last position in the list and add in the last read position, in case the two are different.
|
|
||||||
blockOffsets.removeLast();
|
|
||||||
blockOffsets.add(buffer.position());
|
|
||||||
blockPositions.removeLast();
|
|
||||||
blockPositions.add(lastBlockAddress << 16 | blockOffsetStart);
|
|
||||||
|
|
||||||
// Stream the buffer into the data stream.
|
|
||||||
incomingBuffer.position(blockOffsetStart);
|
|
||||||
incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd));
|
|
||||||
buffer.put(incomingBuffer);
|
|
||||||
|
|
||||||
// Then, add the last position read to the very end of the list, just past the end of the last buffer.
|
|
||||||
blockOffsets.add(buffer.position());
|
|
||||||
blockPositions.add(endOfRead);
|
|
||||||
|
|
||||||
// Set up the buffer for reading.
|
// Set up the buffer for reading.
|
||||||
buffer.flip();
|
buffer.flip();
|
||||||
bufferFilled = true;
|
|
||||||
|
|
||||||
lock.notify();
|
lock.notify();
|
||||||
}
|
}
|
||||||
|
|
@ -447,12 +403,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
if(remaining < length)
|
if(remaining < length)
|
||||||
return length - remaining;
|
return length - remaining;
|
||||||
|
|
||||||
// Otherwise, if at eof(), return -1.
|
// Otherwise, return -1.
|
||||||
else if(eof())
|
return -1;
|
||||||
return -1;
|
|
||||||
|
|
||||||
// Otherwise, we must've hit a bug in the system.
|
|
||||||
throw new ReviewedStingException("BUG: read returned no data, but eof() reports false.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() {
|
public void close() {
|
||||||
|
|
@ -472,20 +424,26 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
|
|
||||||
private void waitForBufferFill() {
|
private void waitForBufferFill() {
|
||||||
synchronized(lock) {
|
synchronized(lock) {
|
||||||
bufferFilled = false;
|
|
||||||
if(buffer.remaining() == 0 && !eof()) {
|
if(buffer.remaining() == 0 && !eof()) {
|
||||||
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
||||||
dispatcher.queueBlockLoad(position);
|
dispatcher.queueBlockLoad(accessPlan);
|
||||||
try {
|
try {
|
||||||
lock.wait();
|
lock.wait();
|
||||||
}
|
}
|
||||||
catch(InterruptedException ex) {
|
catch(InterruptedException ex) {
|
||||||
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(bufferFilled && buffer.remaining() == 0)
|
|
||||||
throw new RuntimeEOFException("No more data left in InputStream");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an encoded BAM file pointer given the address of a BGZF block and an offset.
|
||||||
|
* @param blockAddress Physical address on disk of a BGZF block.
|
||||||
|
* @param blockOffset Offset into the uncompressed data stored in the BGZF block.
|
||||||
|
* @return 64-bit pointer encoded according to the BAM spec.
|
||||||
|
*/
|
||||||
|
public static long makeFilePointer(final long blockAddress, final int blockOffset) {
|
||||||
|
return blockAddress << 16 | blockOffset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2011, The Broad Institute
|
* Copyright (c) 2012, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
|
|
@ -70,29 +70,29 @@ class BlockLoader implements Runnable {
|
||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
for(;;) {
|
for(;;) {
|
||||||
SAMReaderPosition readerPosition = null;
|
BAMAccessPlan accessPlan = null;
|
||||||
try {
|
try {
|
||||||
readerPosition = dispatcher.claimNextWorkRequest();
|
accessPlan = dispatcher.claimNextWorkRequest();
|
||||||
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
|
FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader());
|
||||||
|
|
||||||
long blockAddress = readerPosition.getBlockAddress();
|
//long blockAddress = readerPosition.getBlockAddress();
|
||||||
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
|
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
|
||||||
|
|
||||||
ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
|
ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress());
|
||||||
long nextBlockAddress = position(inputStream);
|
long nextBlockAddress = position(inputStream);
|
||||||
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
|
fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream);
|
||||||
|
|
||||||
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
|
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
|
||||||
int bytesCopied = block.remaining();
|
int bytesCopied = block.remaining();
|
||||||
|
|
||||||
BlockInputStream bamInputStream = readerPosition.getInputStream();
|
BlockInputStream bamInputStream = accessPlan.getInputStream();
|
||||||
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
|
bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress);
|
||||||
|
|
||||||
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
|
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
|
||||||
}
|
}
|
||||||
catch(Throwable error) {
|
catch(Throwable error) {
|
||||||
if(readerPosition != null && readerPosition.getInputStream() != null)
|
if(accessPlan != null && accessPlan.getInputStream() != null)
|
||||||
readerPosition.getInputStream().reportException(error);
|
accessPlan.getInputStream().reportException(error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,203 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* High efficiency filtering iterator designed to filter out reads only included
|
||||||
|
* in the query results due to the granularity of the BAM index.
|
||||||
|
*
|
||||||
|
* Built into the BAM index is a notion of 16kbase granularity -- an index query for
|
||||||
|
* two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will
|
||||||
|
* return exactly the same regions within the BAM file. This iterator is optimized
|
||||||
|
* to subtract out reads which do not at all overlap the interval list passed to the
|
||||||
|
* constructor.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* interval list: chr20:6-10
|
||||||
|
* Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15.
|
||||||
|
* Reads that would be discarded by the filter: chr20:1-5, chr20:11-15.
|
||||||
|
*/
|
||||||
|
class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||||
|
/**
|
||||||
|
* The wrapped iterator.
|
||||||
|
*/
|
||||||
|
private CloseableIterator<SAMRecord> iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The next read, queued up and ready to go.
|
||||||
|
*/
|
||||||
|
private SAMRecord nextRead;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rather than using the straight genomic bounds, use filter out only mapped reads.
|
||||||
|
*/
|
||||||
|
private boolean keepOnlyUnmappedReads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Custom representation of interval bounds.
|
||||||
|
* Makes it simpler to track current position.
|
||||||
|
*/
|
||||||
|
private int[] intervalContigIndices;
|
||||||
|
private int[] intervalStarts;
|
||||||
|
private int[] intervalEnds;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Position within the interval list.
|
||||||
|
*/
|
||||||
|
private int currentBound = 0;
|
||||||
|
|
||||||
|
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
|
||||||
|
this.iterator = iterator;
|
||||||
|
|
||||||
|
// Look at the interval list to detect whether we should worry about unmapped reads.
|
||||||
|
// If we find a mix of mapped/unmapped intervals, throw an exception.
|
||||||
|
boolean foundMappedIntervals = false;
|
||||||
|
for(GenomeLoc location: intervals) {
|
||||||
|
if(! GenomeLoc.isUnmapped(location))
|
||||||
|
foundMappedIntervals = true;
|
||||||
|
keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if(foundMappedIntervals) {
|
||||||
|
if(keepOnlyUnmappedReads)
|
||||||
|
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
|
||||||
|
this.intervalContigIndices = new int[intervals.size()];
|
||||||
|
this.intervalStarts = new int[intervals.size()];
|
||||||
|
this.intervalEnds = new int[intervals.size()];
|
||||||
|
int i = 0;
|
||||||
|
for(GenomeLoc interval: intervals) {
|
||||||
|
intervalContigIndices[i] = interval.getContigIndex();
|
||||||
|
intervalStarts[i] = interval.getStart();
|
||||||
|
intervalEnds[i] = interval.getStop();
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return nextRead != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord next() {
|
||||||
|
if(nextRead == null)
|
||||||
|
throw new NoSuchElementException("No more reads left in this iterator.");
|
||||||
|
SAMRecord currentRead = nextRead;
|
||||||
|
advance();
|
||||||
|
return currentRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
iterator.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void advance() {
|
||||||
|
nextRead = null;
|
||||||
|
|
||||||
|
if(!iterator.hasNext())
|
||||||
|
return;
|
||||||
|
|
||||||
|
SAMRecord candidateRead = iterator.next();
|
||||||
|
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
|
||||||
|
if(!keepOnlyUnmappedReads) {
|
||||||
|
// Mapped read filter; check against GenomeLoc-derived bounds.
|
||||||
|
if(readEndsOnOrAfterStartingBound(candidateRead)) {
|
||||||
|
// This read ends after the current interval begins.
|
||||||
|
// Promising, but this read must be checked against the ending bound.
|
||||||
|
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
|
||||||
|
// Yes, this read is within both bounds. This must be our next read.
|
||||||
|
nextRead = candidateRead;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Oops, we're past the end bound. Increment the current bound and try again.
|
||||||
|
currentBound++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Found an unmapped read. We're done.
|
||||||
|
if(candidateRead.getReadUnmappedFlag()) {
|
||||||
|
nextRead = candidateRead;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No more reads available. Stop the search.
|
||||||
|
if(!iterator.hasNext())
|
||||||
|
break;
|
||||||
|
|
||||||
|
// No reasonable read found; advance the iterator.
|
||||||
|
candidateRead = iterator.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
|
||||||
|
* end will be distorted, so rely only on the alignment start.
|
||||||
|
* @param read The read to position-check.
|
||||||
|
* @return True if the read starts after the current bounds. False otherwise.
|
||||||
|
*/
|
||||||
|
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
|
||||||
|
return
|
||||||
|
// Read ends on a later contig, or...
|
||||||
|
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
|
||||||
|
// Read ends of this contig...
|
||||||
|
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||||
|
// either after this location, or...
|
||||||
|
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||||
|
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||||
|
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check whether the read lies before the end of the current bound.
|
||||||
|
* @param read The read to position-check.
|
||||||
|
* @return True if the read starts after the current bounds. False otherwise.
|
||||||
|
*/
|
||||||
|
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
|
||||||
|
return
|
||||||
|
// Read starts on a prior contig, or...
|
||||||
|
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
|
||||||
|
// Read starts on this contig and the alignment start is registered before this end point.
|
||||||
|
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -36,7 +36,7 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class ReadShard extends Shard {
|
public class ReadShard extends Shard {
|
||||||
/**
|
/**
|
||||||
* What is the maximum number of reads which should go into a read shard.
|
* What is the maximum number of reads per BAM file which should go into a read shard.
|
||||||
*/
|
*/
|
||||||
public static int MAX_READS = 10000;
|
public static int MAX_READS = 10000;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||||
|
|
@ -567,9 +566,14 @@ public class SAMDataSource {
|
||||||
|
|
||||||
if(threadAllocation.getNumIOThreads() > 0) {
|
if(threadAllocation.getNumIOThreads() > 0) {
|
||||||
BlockInputStream inputStream = readers.getInputStream(id);
|
BlockInputStream inputStream = readers.getInputStream(id);
|
||||||
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
|
inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
|
||||||
|
BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
|
||||||
|
codec.setInputStream(inputStream);
|
||||||
|
iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||||
}
|
}
|
||||||
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
|
||||||
if(shard.getGenomeLocs().size() > 0)
|
if(shard.getGenomeLocs().size() > 0)
|
||||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||||
iteratorMap.put(readers.getReader(id), iterator);
|
iteratorMap.put(readers.getReader(id), iterator);
|
||||||
|
|
@ -577,8 +581,6 @@ public class SAMDataSource {
|
||||||
|
|
||||||
MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
|
MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return applyDecoratingIterators(shard.getReadMetrics(),
|
return applyDecoratingIterators(shard.getReadMetrics(),
|
||||||
enableVerification,
|
enableVerification,
|
||||||
readProperties.useOriginalBaseQualities(),
|
readProperties.useOriginalBaseQualities(),
|
||||||
|
|
@ -592,6 +594,49 @@ public class SAMDataSource {
|
||||||
readProperties.defaultBaseQualities());
|
readProperties.defaultBaseQualities());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
|
||||||
|
private final BlockInputStream inputStream;
|
||||||
|
private final SAMFileReader reader;
|
||||||
|
private final BAMRecordCodec codec;
|
||||||
|
private SAMRecord nextRead;
|
||||||
|
|
||||||
|
private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
|
||||||
|
this.inputStream = inputStream;
|
||||||
|
this.reader = reader;
|
||||||
|
this.codec = codec;
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return nextRead != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord next() {
|
||||||
|
if(!hasNext())
|
||||||
|
throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
|
||||||
|
SAMRecord currentRead = nextRead;
|
||||||
|
advance();
|
||||||
|
return currentRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
// NO-OP.
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void advance() {
|
||||||
|
final long startCoordinate = inputStream.getFilePointer();
|
||||||
|
nextRead = codec.decode();
|
||||||
|
final long stopCoordinate = inputStream.getFilePointer();
|
||||||
|
|
||||||
|
if(reader != null && nextRead != null)
|
||||||
|
PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter reads based on user-specified criteria.
|
* Filter reads based on user-specified criteria.
|
||||||
*
|
*
|
||||||
|
|
@ -871,12 +916,9 @@ public class SAMDataSource {
|
||||||
public ReaderInitializer call() {
|
public ReaderInitializer call() {
|
||||||
final File indexFile = findIndexFile(readerID.samFile);
|
final File indexFile = findIndexFile(readerID.samFile);
|
||||||
try {
|
try {
|
||||||
if (threadAllocation.getNumIOThreads() > 0) {
|
if (threadAllocation.getNumIOThreads() > 0)
|
||||||
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||||
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||||
}
|
|
||||||
else
|
|
||||||
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
|
||||||
} catch ( RuntimeIOException e ) {
|
} catch ( RuntimeIOException e ) {
|
||||||
if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException )
|
if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException )
|
||||||
throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
|
throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
|
||||||
|
|
@ -933,167 +975,6 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
private class ReadGroupMapping extends HashMap<String,String> {}
|
private class ReadGroupMapping extends HashMap<String,String> {}
|
||||||
|
|
||||||
/**
|
|
||||||
* Filters out reads that do not overlap the current GenomeLoc.
|
|
||||||
* Note the custom implementation: BAM index querying returns all reads that could
|
|
||||||
* possibly overlap the given region (and quite a few extras). In order not to drag
|
|
||||||
* down performance, this implementation is highly customized to its task.
|
|
||||||
*/
|
|
||||||
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
|
||||||
/**
|
|
||||||
* The wrapped iterator.
|
|
||||||
*/
|
|
||||||
private CloseableIterator<SAMRecord> iterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The next read, queued up and ready to go.
|
|
||||||
*/
|
|
||||||
private SAMRecord nextRead;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Rather than using the straight genomic bounds, use filter out only mapped reads.
|
|
||||||
*/
|
|
||||||
private boolean keepOnlyUnmappedReads;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Custom representation of interval bounds.
|
|
||||||
* Makes it simpler to track current position.
|
|
||||||
*/
|
|
||||||
private int[] intervalContigIndices;
|
|
||||||
private int[] intervalStarts;
|
|
||||||
private int[] intervalEnds;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Position within the interval list.
|
|
||||||
*/
|
|
||||||
private int currentBound = 0;
|
|
||||||
|
|
||||||
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
|
|
||||||
this.iterator = iterator;
|
|
||||||
|
|
||||||
// Look at the interval list to detect whether we should worry about unmapped reads.
|
|
||||||
// If we find a mix of mapped/unmapped intervals, throw an exception.
|
|
||||||
boolean foundMappedIntervals = false;
|
|
||||||
for(GenomeLoc location: intervals) {
|
|
||||||
if(! GenomeLoc.isUnmapped(location))
|
|
||||||
foundMappedIntervals = true;
|
|
||||||
keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if(foundMappedIntervals) {
|
|
||||||
if(keepOnlyUnmappedReads)
|
|
||||||
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
|
|
||||||
this.intervalContigIndices = new int[intervals.size()];
|
|
||||||
this.intervalStarts = new int[intervals.size()];
|
|
||||||
this.intervalEnds = new int[intervals.size()];
|
|
||||||
int i = 0;
|
|
||||||
for(GenomeLoc interval: intervals) {
|
|
||||||
intervalContigIndices[i] = interval.getContigIndex();
|
|
||||||
intervalStarts[i] = interval.getStart();
|
|
||||||
intervalEnds[i] = interval.getStop();
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return nextRead != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SAMRecord next() {
|
|
||||||
if(nextRead == null)
|
|
||||||
throw new NoSuchElementException("No more reads left in this iterator.");
|
|
||||||
SAMRecord currentRead = nextRead;
|
|
||||||
advance();
|
|
||||||
return currentRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
iterator.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void advance() {
|
|
||||||
nextRead = null;
|
|
||||||
|
|
||||||
if(!iterator.hasNext())
|
|
||||||
return;
|
|
||||||
|
|
||||||
SAMRecord candidateRead = iterator.next();
|
|
||||||
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
|
|
||||||
if(!keepOnlyUnmappedReads) {
|
|
||||||
// Mapped read filter; check against GenomeLoc-derived bounds.
|
|
||||||
if(readEndsOnOrAfterStartingBound(candidateRead)) {
|
|
||||||
// This read ends after the current interval begins.
|
|
||||||
// Promising, but this read must be checked against the ending bound.
|
|
||||||
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
|
|
||||||
// Yes, this read is within both bounds. This must be our next read.
|
|
||||||
nextRead = candidateRead;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Oops, we're past the end bound. Increment the current bound and try again.
|
|
||||||
currentBound++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Found an unmapped read. We're done.
|
|
||||||
if(candidateRead.getReadUnmappedFlag()) {
|
|
||||||
nextRead = candidateRead;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// No more reads available. Stop the search.
|
|
||||||
if(!iterator.hasNext())
|
|
||||||
break;
|
|
||||||
|
|
||||||
// No reasonable read found; advance the iterator.
|
|
||||||
candidateRead = iterator.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
|
|
||||||
* end will be distorted, so rely only on the alignment start.
|
|
||||||
* @param read The read to position-check.
|
|
||||||
* @return True if the read starts after the current bounds. False otherwise.
|
|
||||||
*/
|
|
||||||
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
|
|
||||||
return
|
|
||||||
// Read ends on a later contig, or...
|
|
||||||
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
|
|
||||||
// Read ends of this contig...
|
|
||||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
|
||||||
// either after this location, or...
|
|
||||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
|
||||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
|
||||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check whether the read lies before the end of the current bound.
|
|
||||||
* @param read The read to position-check.
|
|
||||||
* @return True if the read starts after the current bounds. False otherwise.
|
|
||||||
*/
|
|
||||||
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
|
|
||||||
return
|
|
||||||
// Read starts on a prior contig, or...
|
|
||||||
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
|
|
||||||
// Read starts on this contig and the alignment start is registered before this end point.
|
|
||||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Locates the index file alongside the given BAM, if present.
|
* Locates the index file alongside the given BAM, if present.
|
||||||
* TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
|
* TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
|
||||||
|
|
|
||||||
|
|
@ -17,9 +17,21 @@ import java.util.List;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Buffer shards of data which may or may not contain multiple loci into
|
* Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci
|
||||||
* iterators of all data which cover an interval. Its existence is an homage
|
* completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp
|
||||||
* to Mark's stillborn WindowMaker, RIP 2009.
|
* locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of
|
||||||
|
* loci to only those covered by the given interval list.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10
|
||||||
|
* Incoming intervals: chr20:3-7
|
||||||
|
*
|
||||||
|
* Locus iterator by state will produce the following stream of data:
|
||||||
|
* chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E},
|
||||||
|
* chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E}
|
||||||
|
*
|
||||||
|
* WindowMakerIterator will then filter the incoming stream, emitting the following stream:
|
||||||
|
* chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E}
|
||||||
*
|
*
|
||||||
* @author mhanna
|
* @author mhanna
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
|
|
|
||||||
|
|
@ -470,7 +470,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
|
|
||||||
if (op == CigarOperator.D) {
|
if (op == CigarOperator.D) {
|
||||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||||
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
|
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
|
||||||
size++;
|
size++;
|
||||||
nDeletions++;
|
nDeletions++;
|
||||||
if (read.getMappingQuality() == 0)
|
if (read.getMappingQuality() == 0)
|
||||||
|
|
@ -479,7 +479,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (!filterBaseInRead(read, location.getStart())) {
|
if (!filterBaseInRead(read, location.getStart())) {
|
||||||
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
|
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart())));
|
||||||
size++;
|
size++;
|
||||||
if (read.getMappingQuality() == 0)
|
if (read.getMappingQuality() == 0)
|
||||||
nMQ0Reads++;
|
nMQ0Reads++;
|
||||||
|
|
|
||||||
|
|
@ -102,7 +102,9 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the best view of loci for this walker given the available data.
|
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
|
||||||
|
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
|
||||||
|
* that comes along.
|
||||||
* @param walker walker to interrogate.
|
* @param walker walker to interrogate.
|
||||||
* @param dataProvider Data which which to drive the locus view.
|
* @param dataProvider Data which which to drive the locus view.
|
||||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||||
|
|
||||||
|
|
@ -200,7 +203,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
|
|
||||||
public class BAQedPileupElement extends PileupElement {
|
public class BAQedPileupElement extends PileupElement {
|
||||||
public BAQedPileupElement( final PileupElement PE ) {
|
public BAQedPileupElement( final PileupElement PE ) {
|
||||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
|
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -77,20 +77,20 @@ import java.util.Map;
|
||||||
* <h2>Output</h2>
|
* <h2>Output</h2>
|
||||||
* <p>
|
* <p>
|
||||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||||
*
|
*
|
||||||
* The first 20 lines of such a file is shown below.
|
* The first 20 lines of such a file is shown below.
|
||||||
* * The file begins with a series of comment lines describing:
|
* * The file begins with a series of comment lines describing:
|
||||||
* ** The number of counted loci
|
* ** The number of counted loci
|
||||||
* ** The number of counted bases
|
* ** The number of counted bases
|
||||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||||
*
|
*
|
||||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||||
*
|
*
|
||||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* # Counted Sites 19451059
|
* # Counted Sites 19451059
|
||||||
* # Counted Bases 56582018
|
* # Counted Bases 56582018
|
||||||
|
|
@ -129,13 +129,14 @@ import java.util.Map;
|
||||||
* -cov DinucCovariate \
|
* -cov DinucCovariate \
|
||||||
* -recalFile my_reads.recal_data.csv
|
* -recalFile my_reads.recal_data.csv
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
|
||||||
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
|
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
|
||||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
// Filter out all reads with zero or unavailable mapping quality
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||||
|
// This walker requires both -I input.bam and -R reference.fasta
|
||||||
@PartitionBy(PartitionType.LOCUS)
|
@PartitionBy(PartitionType.LOCUS)
|
||||||
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
||||||
|
|
||||||
|
|
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Shared Arguments
|
// Shared Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
@ArgumentCollection
|
||||||
|
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
|
|
@ -160,7 +162,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
|
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
|
||||||
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
|
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
|
||||||
*/
|
*/
|
||||||
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
|
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
|
||||||
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||||
*/
|
*/
|
||||||
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
|
@Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
|
||||||
@Gather(CountCovariatesGatherer.class)
|
@Gather(CountCovariatesGatherer.class)
|
||||||
public PrintStream RECAL_FILE;
|
public PrintStream RECAL_FILE;
|
||||||
|
|
||||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
|
||||||
private boolean LIST_ONLY = false;
|
private boolean LIST_ONLY = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* See the -list argument to view available covariates.
|
* See the -list argument to view available covariates.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
|
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
|
||||||
private String[] COVARIATES = null;
|
private String[] COVARIATES = null;
|
||||||
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
|
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
|
||||||
private boolean USE_STANDARD_COVARIATES = false;
|
private boolean USE_STANDARD_COVARIATES = false;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Debugging-only Arguments
|
// Debugging-only Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
@Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||||
private boolean DONT_SORT_OUTPUT = false;
|
private boolean DONT_SORT_OUTPUT = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||||
private boolean RUN_WITHOUT_DBSNP = false;
|
private boolean RUN_WITHOUT_DBSNP = false;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds the values of other to this, returning this
|
* Adds the values of other to this, returning this
|
||||||
|
*
|
||||||
* @param other
|
* @param other
|
||||||
* @return this object
|
* @return this object
|
||||||
*/
|
*/
|
||||||
|
|
@ -247,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
*/
|
*/
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
if (RAC.FORCE_PLATFORM != null) {
|
||||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||||
|
}
|
||||||
|
|
||||||
// Get a list of all available covariates
|
// Get a list of all available covariates
|
||||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
|
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
|
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
|
||||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
|
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
|
||||||
|
|
||||||
// Print and exit if that's what was requested
|
// Print and exit if that's what was requested
|
||||||
if ( LIST_ONLY ) {
|
if (LIST_ONLY) {
|
||||||
logger.info( "Available covariates:" );
|
logger.info("Available covariates:");
|
||||||
for( Class<?> covClass : covariateClasses ) {
|
for (Class<?> covClass : covariateClasses) {
|
||||||
logger.info( covClass.getSimpleName() );
|
logger.info(covClass.getSimpleName());
|
||||||
}
|
}
|
||||||
logger.info("");
|
logger.info("");
|
||||||
|
|
||||||
System.exit( 0 ); // Early exit here because user requested it
|
System.exit(0); // Early exit here because user requested it
|
||||||
}
|
}
|
||||||
|
|
||||||
// Warn the user if no dbSNP file or other variant mask was specified
|
// Warn the user if no dbSNP file or other variant mask was specified
|
||||||
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
|
if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
|
||||||
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
|
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the requested covariates by parsing the -cov argument
|
// Initialize the requested covariates by parsing the -cov argument
|
||||||
// First add the required covariates
|
// First add the required covariates
|
||||||
if( requiredClasses.size() == 2) { // readGroup and reported quality score
|
if (requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||||
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
|
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
|
||||||
requestedCovariates.add( new QualityScoreCovariate() );
|
requestedCovariates.add(new QualityScoreCovariate());
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
|
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
|
||||||
}
|
}
|
||||||
// Next add the standard covariates if -standard was specified by the user
|
// Next add the standard covariates if -standard was specified by the user
|
||||||
if( USE_STANDARD_COVARIATES ) {
|
if (USE_STANDARD_COVARIATES) {
|
||||||
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
|
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
|
||||||
// A list of Classes can't be sorted, but a list of Class names can be
|
// A list of Classes can't be sorted, but a list of Class names can be
|
||||||
final List<String> standardClassNames = new ArrayList<String>();
|
final List<String> standardClassNames = new ArrayList<String>();
|
||||||
for( Class<?> covClass : standardClasses ) {
|
for (Class<?> covClass : standardClasses) {
|
||||||
standardClassNames.add( covClass.getName() );
|
standardClassNames.add(covClass.getName());
|
||||||
}
|
}
|
||||||
Collections.sort(standardClassNames); // Sort the list of class names
|
Collections.sort(standardClassNames); // Sort the list of class names
|
||||||
for( String className : standardClassNames ) {
|
for (String className : standardClassNames) {
|
||||||
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
|
for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
|
||||||
if( covClass.getName().equals( className ) ) {
|
if (covClass.getName().equals(className)) {
|
||||||
try {
|
try {
|
||||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||||
requestedCovariates.add( covariate );
|
requestedCovariates.add(covariate);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new DynamicClassResolutionException(covClass, e);
|
throw new DynamicClassResolutionException(covClass, e);
|
||||||
}
|
}
|
||||||
|
|
@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
||||||
if( COVARIATES != null ) {
|
if (COVARIATES != null) {
|
||||||
for( String requestedCovariateString : COVARIATES ) {
|
for (String requestedCovariateString : COVARIATES) {
|
||||||
boolean foundClass = false;
|
boolean foundClass = false;
|
||||||
for( Class<?> covClass : covariateClasses ) {
|
for (Class<?> covClass : covariateClasses) {
|
||||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
|
||||||
foundClass = true;
|
foundClass = true;
|
||||||
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
|
if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
|
||||||
try {
|
try {
|
||||||
// Now that we've found a matching class, try to instantiate it
|
// Now that we've found a matching class, try to instantiate it
|
||||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||||
requestedCovariates.add( covariate );
|
requestedCovariates.add(covariate);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new DynamicClassResolutionException(covClass, e);
|
throw new DynamicClassResolutionException(covClass, e);
|
||||||
}
|
}
|
||||||
|
|
@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if( !foundClass ) {
|
if (!foundClass) {
|
||||||
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info( "The covariates being used here: " );
|
logger.info("The covariates being used here: ");
|
||||||
for( Covariate cov : requestedCovariates ) {
|
for (Covariate cov : requestedCovariates) {
|
||||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
logger.info("\t" + cov.getClass().getSimpleName());
|
||||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// map
|
// map
|
||||||
|
|
@ -342,62 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||||
* whether or not the base matches the reference at this particular location
|
* whether or not the base matches the reference at this particular location
|
||||||
|
*
|
||||||
* @param tracker The reference metadata tracker
|
* @param tracker The reference metadata tracker
|
||||||
* @param ref The reference context
|
* @param ref The reference context
|
||||||
* @param context The alignment context
|
* @param context The alignment context
|
||||||
* @return Returns 1, but this value isn't used in the reduce step
|
* @return Returns 1, but this value isn't used in the reduce step
|
||||||
*/
|
*/
|
||||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
// Only use data from non-dbsnp sites
|
// Only use data from non-dbsnp sites
|
||||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||||
CountedData counter = new CountedData();
|
CountedData counter = new CountedData();
|
||||||
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||||
// For each read at this locus
|
// For each read at this locus
|
||||||
for( final PileupElement p : context.getBasePileup() ) {
|
for (final PileupElement p : context.getBasePileup()) {
|
||||||
final GATKSAMRecord gatkRead = p.getRead();
|
final GATKSAMRecord gatkRead = p.getRead();
|
||||||
int offset = p.getOffset();
|
int offset = p.getOffset();
|
||||||
|
|
||||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
|
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
|
||||||
{
|
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
|
||||||
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
|
RecalDataManager.parseSAMRecord(gatkRead, RAC);
|
||||||
RecalDataManager.parseSAMRecord( gatkRead, RAC );
|
|
||||||
|
|
||||||
// Skip over reads with no calls in the color space if the user requested it
|
// Skip over reads with no calls in the color space if the user requested it
|
||||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
|
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
|
||||||
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
|
gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
RecalDataManager.parseColorSpace( gatkRead );
|
RecalDataManager.parseColorSpace(gatkRead);
|
||||||
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
|
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
|
||||||
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip this position if base quality is zero
|
// Skip this position if base quality is zero
|
||||||
if( gatkRead.getBaseQualities()[offset] > 0 ) {
|
if (gatkRead.getBaseQualities()[offset] > 0) {
|
||||||
|
|
||||||
byte[] bases = gatkRead.getReadBases();
|
byte[] bases = gatkRead.getReadBases();
|
||||||
byte refBase = ref.getBase();
|
byte refBase = ref.getBase();
|
||||||
|
|
||||||
// Skip if this base is an 'N' or etc.
|
// Skip if this base is an 'N' or etc.
|
||||||
if( BaseUtils.isRegularBase( bases[offset] ) ) {
|
if (BaseUtils.isRegularBase(bases[offset])) {
|
||||||
|
|
||||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||||
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
|
!RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
|
||||||
|
|
||||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||||
updateDataFromRead( counter, gatkRead, offset, refBase );
|
updateDataFromRead(counter, gatkRead, offset, refBase);
|
||||||
|
|
||||||
} else { // calculate SOLID reference insertion rate
|
}
|
||||||
if( refBase == bases[offset] ) {
|
else { // calculate SOLID reference insertion rate
|
||||||
|
if (refBase == bases[offset]) {
|
||||||
counter.solidInsertedReferenceBases++;
|
counter.solidInsertedReferenceBases++;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
counter.otherColorSpaceInconsistency++;
|
counter.otherColorSpaceInconsistency++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counter.countedSites++;
|
counter.countedSites++;
|
||||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
}
|
||||||
|
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||||
counter.skippedSites++;
|
counter.skippedSites++;
|
||||||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||||
}
|
}
|
||||||
|
|
@ -413,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Update the mismatch / total_base counts for a given class of loci.
|
* Update the mismatch / total_base counts for a given class of loci.
|
||||||
*
|
*
|
||||||
* @param counter The CountedData to be updated
|
* @param counter The CountedData to be updated
|
||||||
|
|
@ -421,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* @param refBase The reference base
|
* @param refBase The reference base
|
||||||
*/
|
*/
|
||||||
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
||||||
for( PileupElement p : context.getBasePileup() ) {
|
for (PileupElement p : context.getBasePileup()) {
|
||||||
final byte readBase = p.getBase();
|
final byte readBase = p.getBase();
|
||||||
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
||||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||||
|
|
||||||
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
|
if (readBaseIndex != -1 && refBaseIndex != -1) {
|
||||||
if( readBaseIndex != refBaseIndex ) {
|
if (readBaseIndex != refBaseIndex) {
|
||||||
counter.novelCountsMM++;
|
counter.novelCountsMM++;
|
||||||
}
|
}
|
||||||
counter.novelCountsBases++;
|
counter.novelCountsBases++;
|
||||||
|
|
@ -439,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* Major workhorse routine for this walker.
|
* Major workhorse routine for this walker.
|
||||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||||
* adding one to the number of observations and potentially one to the number of mismatches
|
* adding one to the number of observations and potentially one to the number of mismatches
|
||||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||||
* because pulling things out of the SAMRecord is an expensive operation.
|
* because pulling things out of the SAMRecord is an expensive operation.
|
||||||
* @param counter Data structure which holds the counted bases
|
*
|
||||||
|
* @param counter Data structure which holds the counted bases
|
||||||
* @param gatkRead The SAMRecord holding all the data for this read
|
* @param gatkRead The SAMRecord holding all the data for this read
|
||||||
* @param offset The offset in the read for this locus
|
* @param offset The offset in the read for this locus
|
||||||
* @param refBase The reference base at this locus
|
* @param refBase The reference base at this locus
|
||||||
*/
|
*/
|
||||||
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
|
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
|
||||||
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||||
|
|
@ -453,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||||
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
||||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
|
RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
|
||||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||||
// initialized with zeros, will be incremented at end of method
|
// initialized with zeros, will be incremented at end of method
|
||||||
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
|
datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Need the bases to determine whether or not we have a mismatch
|
// Need the bases to determine whether or not we have a mismatch
|
||||||
|
|
@ -464,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
final long curMismatches = datum.getNumMismatches();
|
final long curMismatches = datum.getNumMismatches();
|
||||||
|
|
||||||
// Add one to the number of observations and potentially one to the number of mismatches
|
// Add one to the number of observations and potentially one to the number of mismatches
|
||||||
datum.incrementBaseCounts( base, refBase );
|
datum.incrementBaseCounts(base, refBase);
|
||||||
counter.countedBases++;
|
counter.countedBases++;
|
||||||
counter.novelCountsBases++;
|
counter.novelCountsBases++;
|
||||||
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// reduce
|
// reduce
|
||||||
|
|
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||||
|
*
|
||||||
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
||||||
*/
|
*/
|
||||||
public CountedData reduceInit() {
|
public CountedData reduceInit() {
|
||||||
|
|
@ -487,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Reduce method doesn't do anything for this walker.
|
* The Reduce method doesn't do anything for this walker.
|
||||||
|
*
|
||||||
* @param mapped Result of the map. This value is immediately ignored.
|
* @param mapped Result of the map. This value is immediately ignored.
|
||||||
* @param sum The summing CountedData used to output the CSV data
|
* @param sum The summing CountedData used to output the CSV data
|
||||||
* @return returns The sum used to output the CSV data
|
* @return returns The sum used to output the CSV data
|
||||||
*/
|
*/
|
||||||
public CountedData reduce( CountedData mapped, CountedData sum ) {
|
public CountedData reduce(CountedData mapped, CountedData sum) {
|
||||||
// Do a dbSNP sanity check every so often
|
// Do a dbSNP sanity check every so often
|
||||||
return validatingDbsnpMismatchRate(sum.add(mapped));
|
return validatingDbsnpMismatchRate(sum.add(mapped));
|
||||||
}
|
}
|
||||||
|
|
@ -500,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* Validate the dbSNP reference mismatch rates.
|
* Validate the dbSNP reference mismatch rates.
|
||||||
*/
|
*/
|
||||||
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
||||||
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
|
||||||
counter.lociSinceLastDbsnpCheck = 0;
|
counter.lociSinceLastDbsnpCheck = 0;
|
||||||
|
|
||||||
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
|
if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
|
||||||
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
|
final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
|
||||||
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
|
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
|
||||||
|
|
||||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
|
||||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
|
||||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
|
||||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
|
public CountedData treeReduce(CountedData sum1, CountedData sum2) {
|
||||||
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write out the full data hashmap to disk in CSV format
|
* Write out the full data hashmap to disk in CSV format
|
||||||
|
*
|
||||||
* @param sum The CountedData to write out to RECAL_FILE
|
* @param sum The CountedData to write out to RECAL_FILE
|
||||||
*/
|
*/
|
||||||
public void onTraversalDone( CountedData sum ) {
|
public void onTraversalDone(CountedData sum) {
|
||||||
logger.info( "Writing raw recalibration data..." );
|
logger.info("Writing raw recalibration data...");
|
||||||
if( sum.countedBases == 0L ) {
|
if (sum.countedBases == 0L) {
|
||||||
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
|
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
|
||||||
}
|
}
|
||||||
outputToCSV( sum, RECAL_FILE );
|
outputToCSV(sum, RECAL_FILE);
|
||||||
logger.info( "...done!" );
|
logger.info("...done!");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||||
|
*
|
||||||
* @param recalTableStream The PrintStream to write out to
|
* @param recalTableStream The PrintStream to write out to
|
||||||
*/
|
*/
|
||||||
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
|
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
|
||||||
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
||||||
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
||||||
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
||||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
|
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
|
||||||
|
|
||||||
if( sum.solidInsertedReferenceBases != 0 ) {
|
if (sum.solidInsertedReferenceBases != 0) {
|
||||||
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
||||||
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output header saying which covariates were used and in what order
|
// Output header saying which covariates were used and in what order
|
||||||
for( Covariate cov : requestedCovariates ) {
|
for (Covariate cov : requestedCovariates) {
|
||||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
|
||||||
}
|
}
|
||||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||||
|
|
||||||
if( DONT_SORT_OUTPUT ) {
|
if (DONT_SORT_OUTPUT) {
|
||||||
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -566,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||||
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
||||||
for( Object comp : data.keySet() ) {
|
for (Object comp : data.keySet()) {
|
||||||
keyList.add((Comparable) comp);
|
keyList.add((Comparable) comp);
|
||||||
}
|
}
|
||||||
|
|
||||||
Collections.sort(keyList);
|
Collections.sort(keyList);
|
||||||
|
|
||||||
for( Comparable comp : keyList ) {
|
for (Comparable comp : keyList) {
|
||||||
key[curPos] = comp;
|
key[curPos] = comp;
|
||||||
final Object val = data.get(comp);
|
final Object val = data.get(comp);
|
||||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||||
// For each Covariate in the key
|
// For each Covariate in the key
|
||||||
for( Object compToPrint : key ) {
|
for (Object compToPrint : key) {
|
||||||
// Output the Covariate's value
|
// Output the Covariate's value
|
||||||
recalTableStream.print( compToPrint + "," );
|
recalTableStream.print(compToPrint + ",");
|
||||||
}
|
}
|
||||||
// Output the RecalDatum entry
|
// Output the RecalDatum entry
|
||||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||||
} else { // Another layer in the nested hash map
|
}
|
||||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
|
else { // Another layer in the nested hash map
|
||||||
|
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||||
for( Object comp : data.keySet() ) {
|
for (Object comp : data.keySet()) {
|
||||||
key[curPos] = comp;
|
key[curPos] = comp;
|
||||||
final Object val = data.get(comp);
|
final Object val = data.get(comp);
|
||||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||||
// For each Covariate in the key
|
// For each Covariate in the key
|
||||||
for( Object compToPrint : key ) {
|
for (Object compToPrint : key) {
|
||||||
// Output the Covariate's value
|
// Output the Covariate's value
|
||||||
recalTableStream.print( compToPrint + "," );
|
recalTableStream.print(compToPrint + ",");
|
||||||
}
|
}
|
||||||
// Output the RecalDatum entry
|
// Output the RecalDatum entry
|
||||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||||
} else { // Another layer in the nested hash map
|
}
|
||||||
printMappings( recalTableStream, curPos + 1, key, (Map) val );
|
else { // Another layer in the nested hash map
|
||||||
|
printMappings(recalTableStream, curPos + 1, key, (Map) val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -256,32 +257,6 @@ public class RecalDataManager {
|
||||||
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
|
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
|
||||||
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
|
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
|
||||||
|
|
||||||
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
|
|
||||||
if (readGroup == null) {
|
|
||||||
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
|
|
||||||
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
|
|
||||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
|
||||||
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
|
||||||
"First observed at read with name = " + read.getReadName());
|
|
||||||
warnUserNullReadGroup = true;
|
|
||||||
}
|
|
||||||
// There is no readGroup so defaulting to these values
|
|
||||||
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
|
|
||||||
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
|
|
||||||
((GATKSAMRecord) read).setReadGroup(readGroup);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
|
|
||||||
final String oldPlatform = readGroup.getPlatform();
|
|
||||||
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
|
|
||||||
readGroup.setPlatform(oldPlatform);
|
|
||||||
((GATKSAMRecord) read).setReadGroup(readGroup);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||||
readGroup.setPlatform(RAC.FORCE_PLATFORM);
|
readGroup.setPlatform(RAC.FORCE_PLATFORM);
|
||||||
}
|
}
|
||||||
|
|
@ -310,7 +285,7 @@ public class RecalDataManager {
|
||||||
public static void parseColorSpace(final GATKSAMRecord read) {
|
public static void parseColorSpace(final GATKSAMRecord read) {
|
||||||
|
|
||||||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||||
if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) {
|
if (ReadUtils.isSOLiDRead(read)) {
|
||||||
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
|
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
|
||||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||||
if (attr != null) {
|
if (attr != null) {
|
||||||
|
|
@ -408,7 +383,7 @@ public class RecalDataManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
|
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
|
||||||
if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) {
|
if (ReadUtils.isSOLiDRead(read)) {
|
||||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||||
if (attr != null) {
|
if (attr != null) {
|
||||||
byte[] colorSpace;
|
byte[] colorSpace;
|
||||||
|
|
@ -637,21 +612,17 @@ public class RecalDataManager {
|
||||||
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
|
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
|
||||||
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
|
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
|
||||||
|
|
||||||
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||||
for (int i = 0; i < numRequestedCovariates; i++) {
|
|
||||||
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
|
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
|
||||||
for (int j = 0; j < readLength; j++) {
|
for (int j = 0; j < readLength; j++)
|
||||||
//copy values into a 2D array that allows all covar types to be extracted at once for
|
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
|
||||||
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
|
|
||||||
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return covariateValues_offset_x_covar;
|
return covariateValues_offset_x_covar;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform a ceratin transversion (A <-> C or G <-> T) on the base.
|
* Perform a certain transversion (A <-> C or G <-> T) on the base.
|
||||||
*
|
*
|
||||||
* @param base the base [AaCcGgTt]
|
* @param base the base [AaCcGgTt]
|
||||||
* @return the transversion of the base, or the input base if it's not one of the understood ones
|
* @return the transversion of the base, or the input base if it's not one of the understood ones
|
||||||
|
|
|
||||||
|
|
@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
|
||||||
// Shared Command Line Arguments
|
// Shared Command Line Arguments
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
|
|
||||||
public String DEFAULT_READ_GROUP = null;
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||||
public String DEFAULT_PLATFORM = null;
|
public String DEFAULT_PLATFORM = null;
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
|
|
||||||
public String FORCE_READ_GROUP = null;
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||||
public String FORCE_PLATFORM = null;
|
public String FORCE_PLATFORM = null;
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
|
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
|
||||||
public int WINDOW_SIZE = 5;
|
public int WINDOW_SIZE = 5;
|
||||||
|
|
||||||
/**
|
|
||||||
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
|
|
||||||
*/
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
|
|
||||||
public int HOMOPOLYMER_NBACK = 7;
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
|
|
||||||
public boolean EXCEPTION_IF_NO_TILE = false;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||||
* reads which have had the reference inserted because of color space inconsistencies.
|
* reads which have had the reference inserted because of color space inconsistencies.
|
||||||
|
|
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
|
||||||
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
|
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
|
||||||
public int CONTEXT_SIZE = 8;
|
public int CONTEXT_SIZE = 8;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
|
||||||
|
public int HOMOPOLYMER_NBACK = 7;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -86,12 +86,12 @@ import java.util.regex.Pattern;
|
||||||
* -o my_reads.recal.bam \
|
* -o my_reads.recal.bam \
|
||||||
* -recalFile my_reads.recal_data.csv
|
* -recalFile my_reads.recal_data.csv
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||||
@WalkerName("TableRecalibration")
|
@WalkerName("TableRecalibration")
|
||||||
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
|
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||||
|
// This walker requires -I input.bam, it also requires -R reference.fasta
|
||||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
|
|
||||||
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
|
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
|
||||||
|
|
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Shared Arguments
|
// Shared Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
@ArgumentCollection
|
||||||
|
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
|
|
@ -110,12 +111,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||||
*/
|
*/
|
||||||
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
|
@Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
|
||||||
public File RECAL_FILE = null;
|
public File RECAL_FILE = null;
|
||||||
/**
|
/**
|
||||||
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
|
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
|
||||||
*/
|
*/
|
||||||
@Output(doc="The output recalibrated BAM file", required=true)
|
@Output(doc = "The output recalibrated BAM file", required = true)
|
||||||
private StingSAMFileWriter OUTPUT_BAM = null;
|
private StingSAMFileWriter OUTPUT_BAM = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -126,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
|
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
|
||||||
* are unmodified during recalibration, so they don't get inappropriately evaluated.
|
* are unmodified during recalibration, so they don't get inappropriately evaluated.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
@Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
|
||||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -135,37 +136,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
|
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
|
||||||
* --smoothing 15 for a large amount of smoothing.
|
* --smoothing 15 for a large amount of smoothing.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
@Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||||
private int SMOOTHING = 1;
|
private int SMOOTHING = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
||||||
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
|
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
|
@Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
|
||||||
private int MAX_QUALITY_SCORE = 50;
|
private int MAX_QUALITY_SCORE = 50;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
|
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
|
||||||
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
|
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
|
@Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
|
||||||
private boolean DO_NOT_WRITE_OQ = false;
|
private boolean DO_NOT_WRITE_OQ = false;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Debugging-only Arguments
|
// Debugging-only Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
@Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||||
private boolean NO_PG_TAG = false;
|
private boolean NO_PG_TAG = false;
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
|
@Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||||
private boolean REQUIRE_EOF = false;
|
private boolean REQUIRE_EOF = false;
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||||
private boolean skipUQUpdate = false;
|
private boolean skipUQUpdate = false;
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
*/
|
*/
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
if (RAC.FORCE_PLATFORM != null) {
|
||||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||||
|
}
|
||||||
|
|
||||||
// Get a list of all available covariates
|
// Get a list of all available covariates
|
||||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||||
|
|
@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
boolean foundAllCovariates = false;
|
boolean foundAllCovariates = false;
|
||||||
|
|
||||||
// Read in the data from the csv file and populate the data map and covariates list
|
// Read in the data from the csv file and populate the data map and covariates list
|
||||||
logger.info( "Reading in the data from input csv file..." );
|
logger.info("Reading in the data from input csv file...");
|
||||||
|
|
||||||
boolean sawEOF = false;
|
boolean sawEOF = false;
|
||||||
try {
|
try {
|
||||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
for (String line : new XReadLines(RECAL_FILE)) {
|
||||||
lineNumber++;
|
lineNumber++;
|
||||||
if ( EOF_MARKER.equals(line) ) {
|
if (EOF_MARKER.equals(line)) {
|
||||||
sawEOF = true;
|
sawEOF = true;
|
||||||
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
|
}
|
||||||
|
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||||
; // Skip over the comment lines, (which start with '#')
|
; // Skip over the comment lines, (which start with '#')
|
||||||
}
|
}
|
||||||
// Read in the covariates that were used from the input file
|
// Read in the covariates that were used from the input file
|
||||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
|
||||||
if( foundAllCovariates ) {
|
if (foundAllCovariates) {
|
||||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
|
||||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
}
|
||||||
|
else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||||
String[] vals = line.split(",");
|
String[] vals = line.split(",");
|
||||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||||
boolean foundClass = false;
|
boolean foundClass = false;
|
||||||
for( Class<?> covClass : classes ) {
|
for (Class<?> covClass : classes) {
|
||||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||||
foundClass = true;
|
foundClass = true;
|
||||||
try {
|
try {
|
||||||
Covariate covariate = (Covariate)covClass.newInstance();
|
Covariate covariate = (Covariate) covClass.newInstance();
|
||||||
requestedCovariates.add( covariate );
|
requestedCovariates.add(covariate);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new DynamicClassResolutionException(covClass, e);
|
throw new DynamicClassResolutionException(covClass, e);
|
||||||
}
|
}
|
||||||
|
|
@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if( !foundClass ) {
|
if (!foundClass) {
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else { // Found a line of data
|
}
|
||||||
if( !foundAllCovariates ) {
|
else { // Found a line of data
|
||||||
|
if (!foundAllCovariates) {
|
||||||
foundAllCovariates = true;
|
foundAllCovariates = true;
|
||||||
|
|
||||||
// At this point all the covariates should have been found and initialized
|
// At this point all the covariates should have been found and initialized
|
||||||
if( requestedCovariates.size() < 2 ) {
|
if (requestedCovariates.size() < 2) {
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||||
}
|
}
|
||||||
|
|
||||||
final boolean createCollapsedTables = true;
|
final boolean createCollapsedTables = true;
|
||||||
|
|
||||||
// Initialize any covariate member variables using the shared argument collection
|
// Initialize any covariate member variables using the shared argument collection
|
||||||
for( Covariate cov : requestedCovariates ) {
|
for (Covariate cov : requestedCovariates) {
|
||||||
cov.initialize( RAC );
|
cov.initialize(RAC);
|
||||||
}
|
}
|
||||||
// Initialize the data hashMaps
|
// Initialize the data hashMaps
|
||||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
|
||||||
|
|
||||||
}
|
}
|
||||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||||
} catch ( NumberFormatException e ) {
|
} catch (NumberFormatException e) {
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||||
}
|
}
|
||||||
logger.info( "...done!" );
|
logger.info("...done!");
|
||||||
|
|
||||||
if ( !sawEOF ) {
|
if (!sawEOF) {
|
||||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||||
if ( REQUIRE_EOF )
|
if (REQUIRE_EOF)
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||||
logger.warn(errorMessage);
|
logger.warn(errorMessage);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info( "The covariates being used here: " );
|
logger.info("The covariates being used here: ");
|
||||||
for( Covariate cov : requestedCovariates ) {
|
for (Covariate cov : requestedCovariates) {
|
||||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
logger.info("\t" + cov.getClass().getSimpleName());
|
||||||
}
|
}
|
||||||
|
|
||||||
if( dataManager == null ) {
|
if (dataManager == null) {
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
logger.info("Generating tables of empirical qualities for use in sequential calculation...");
|
||||||
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
|
dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
|
||||||
logger.info( "...done!" );
|
logger.info("...done!");
|
||||||
|
|
||||||
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
|
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
|
||||||
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
|
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
|
||||||
if( !NO_PG_TAG ) {
|
if (!NO_PG_TAG) {
|
||||||
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||||
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||||
try {
|
try {
|
||||||
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
||||||
programRecord.setProgramVersion(version);
|
programRecord.setProgramVersion(version);
|
||||||
} catch (MissingResourceException e) {}
|
} catch (MissingResourceException e) {
|
||||||
|
}
|
||||||
|
|
||||||
StringBuffer sb = new StringBuffer();
|
StringBuffer sb = new StringBuffer();
|
||||||
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
|
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
|
||||||
sb.append(" Covariates=[");
|
sb.append(" Covariates=[");
|
||||||
for( Covariate cov : requestedCovariates ) {
|
for (Covariate cov : requestedCovariates) {
|
||||||
sb.append(cov.getClass().getSimpleName());
|
sb.append(cov.getClass().getSimpleName());
|
||||||
sb.append(", ");
|
sb.append(", ");
|
||||||
}
|
}
|
||||||
sb.setCharAt(sb.length()-2, ']');
|
sb.setCharAt(sb.length() - 2, ']');
|
||||||
sb.setCharAt(sb.length()-1, ' ');
|
sb.setCharAt(sb.length() - 1, ' ');
|
||||||
programRecord.setCommandLine(sb.toString());
|
programRecord.setCommandLine(sb.toString());
|
||||||
|
|
||||||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size() + 1);
|
||||||
for ( SAMProgramRecord record : oldRecords ) {
|
for (SAMProgramRecord record : oldRecords) {
|
||||||
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
|
if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
|
||||||
newRecords.add(record);
|
newRecords.add(record);
|
||||||
}
|
}
|
||||||
newRecords.add(programRecord);
|
newRecords.add(programRecord);
|
||||||
header.setProgramRecords(newRecords);
|
header.setProgramRecords(newRecords);
|
||||||
|
|
||||||
// Write out the new header
|
// Write out the new header
|
||||||
OUTPUT_BAM.writeHeader( header );
|
OUTPUT_BAM.writeHeader(header);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||||
|
*
|
||||||
* @param line A line of CSV data read from the recalibration table data file
|
* @param line A line of CSV data read from the recalibration table data file
|
||||||
*/
|
*/
|
||||||
private void addCSVData(final File file, final String line) {
|
private void addCSVData(final File file, final String line) {
|
||||||
final String[] vals = line.split(",");
|
final String[] vals = line.split(",");
|
||||||
|
|
||||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||||
}
|
}
|
||||||
|
|
@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
final Object[] key = new Object[requestedCovariates.size()];
|
final Object[] key = new Object[requestedCovariates.size()];
|
||||||
Covariate cov;
|
Covariate cov;
|
||||||
int iii;
|
int iii;
|
||||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
for (iii = 0; iii < requestedCovariates.size(); iii++) {
|
||||||
cov = requestedCovariates.get( iii );
|
cov = requestedCovariates.get(iii);
|
||||||
key[iii] = cov.getValue( vals[iii] );
|
key[iii] = cov.getValue(vals[iii]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
|
||||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -366,64 +372,63 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
||||||
*
|
*
|
||||||
* @param refBases References bases over the length of the read
|
* @param refBases References bases over the length of the read
|
||||||
* @param read The read to be recalibrated
|
* @param read The read to be recalibrated
|
||||||
* @return The read with quality scores replaced
|
* @return The read with quality scores replaced
|
||||||
*/
|
*/
|
||||||
public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
|
||||||
if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
||||||
return read;
|
return read;
|
||||||
}
|
}
|
||||||
|
|
||||||
RecalDataManager.parseSAMRecord( read, RAC );
|
RecalDataManager.parseSAMRecord(read, RAC);
|
||||||
|
|
||||||
byte[] originalQuals = read.getBaseQualities();
|
byte[] originalQuals = read.getBaseQualities();
|
||||||
final byte[] recalQuals = originalQuals.clone();
|
final byte[] recalQuals = originalQuals.clone();
|
||||||
|
|
||||||
final String platform = read.getReadGroup().getPlatform();
|
final String platform = read.getReadGroup().getPlatform();
|
||||||
if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) {
|
if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
|
||||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) {
|
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
|
||||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
|
final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
|
||||||
if( badColor ) {
|
if (badColor) {
|
||||||
numReadsWithMalformedColorSpace++;
|
numReadsWithMalformedColorSpace++;
|
||||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||||
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
|
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
|
||||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
}
|
||||||
|
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||||
read.setReadFailsVendorQualityCheckFlag(true);
|
read.setReadFailsVendorQualityCheckFlag(true);
|
||||||
return read;
|
return read;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() );
|
originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
|
||||||
}
|
}
|
||||||
|
|
||||||
//compute all covariate values for this read
|
//compute all covariate values for this read
|
||||||
final Comparable[][] covariateValues_offset_x_covar =
|
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
|
||||||
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
|
|
||||||
|
|
||||||
// For each base in the read
|
// For each base in the read
|
||||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||||
|
|
||||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||||
|
|
||||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||||
if(qualityScore == null)
|
if (qualityScore == null) {
|
||||||
{
|
qualityScore = performSequentialQualityCalculation(fullCovariateKey);
|
||||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
|
||||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||||
}
|
}
|
||||||
|
|
||||||
recalQuals[offset] = qualityScore;
|
recalQuals[offset] = qualityScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||||
|
|
||||||
read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities
|
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
|
||||||
if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
|
if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
|
||||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
|
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
||||||
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
|
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -440,27 +445,28 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
*
|
*
|
||||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||||
*
|
*
|
||||||
* - calculate the global quality score shift across all data [DeltaQ]
|
* - calculate the global quality score shift across all data [DeltaQ]
|
||||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||||
* - The final shift equation is:
|
* - The final shift equation is:
|
||||||
|
*
|
||||||
|
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||||
*
|
*
|
||||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
|
||||||
* @param key The list of Comparables that were calculated from the covariates
|
* @param key The list of Comparables that were calculated from the covariates
|
||||||
* @return A recalibrated quality score as a byte
|
* @return A recalibrated quality score as a byte
|
||||||
*/
|
*/
|
||||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
private byte performSequentialQualityCalculation(final Object... key) {
|
||||||
|
|
||||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
|
||||||
final Object[] readGroupCollapsedKey = new Object[1];
|
final Object[] readGroupCollapsedKey = new Object[1];
|
||||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||||
final Object[] covariateCollapsedKey = new Object[3];
|
final Object[] covariateCollapsedKey = new Object[3];
|
||||||
|
|
||||||
// The global quality shift (over the read group only)
|
// The global quality shift (over the read group only)
|
||||||
readGroupCollapsedKey[0] = key[0];
|
readGroupCollapsedKey[0] = key[0];
|
||||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
|
||||||
double globalDeltaQ = 0.0;
|
double globalDeltaQ = 0.0;
|
||||||
if( globalRecalDatum != null ) {
|
if (globalRecalDatum != null) {
|
||||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||||
|
|
@ -469,9 +475,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
// The shift in quality between reported and empirical
|
// The shift in quality between reported and empirical
|
||||||
qualityScoreCollapsedKey[0] = key[0];
|
qualityScoreCollapsedKey[0] = key[0];
|
||||||
qualityScoreCollapsedKey[1] = key[1];
|
qualityScoreCollapsedKey[1] = key[1];
|
||||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
|
||||||
double deltaQReported = 0.0;
|
double deltaQReported = 0.0;
|
||||||
if( qReportedRecalDatum != null ) {
|
if (qReportedRecalDatum != null) {
|
||||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||||
}
|
}
|
||||||
|
|
@ -481,17 +487,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
double deltaQCovariateEmpirical;
|
double deltaQCovariateEmpirical;
|
||||||
covariateCollapsedKey[0] = key[0];
|
covariateCollapsedKey[0] = key[0];
|
||||||
covariateCollapsedKey[1] = key[1];
|
covariateCollapsedKey[1] = key[1];
|
||||||
for( int iii = 2; iii < key.length; iii++ ) {
|
for (int iii = 2; iii < key.length; iii++) {
|
||||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
|
||||||
if( covariateRecalDatum != null ) {
|
if (covariateRecalDatum != null) {
|
||||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
|
||||||
|
|
||||||
// Verbose printouts used to validate with old recalibrator
|
// Verbose printouts used to validate with old recalibrator
|
||||||
//if(key.contains(null)) {
|
//if(key.contains(null)) {
|
||||||
|
|
@ -508,12 +514,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||||
|
*
|
||||||
* @param originalQuals The list of original base quality scores
|
* @param originalQuals The list of original base quality scores
|
||||||
* @param recalQuals A list of the new recalibrated quality scores
|
* @param recalQuals A list of the new recalibrated quality scores
|
||||||
*/
|
*/
|
||||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||||
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
|
||||||
recalQuals[iii] = originalQuals[iii];
|
recalQuals[iii] = originalQuals[iii];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start the reduce with a handle to the output bam file
|
* Start the reduce with a handle to the output bam file
|
||||||
|
*
|
||||||
* @return A FileWriter pointing to a new bam file
|
* @return A FileWriter pointing to a new bam file
|
||||||
*/
|
*/
|
||||||
public SAMFileWriter reduceInit() {
|
public SAMFileWriter reduceInit() {
|
||||||
|
|
@ -535,12 +543,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Output each read to disk
|
* Output each read to disk
|
||||||
* @param read The read to output
|
*
|
||||||
|
* @param read The read to output
|
||||||
* @param output The FileWriter to write the read to
|
* @param output The FileWriter to write the read to
|
||||||
* @return The FileWriter
|
* @return The FileWriter
|
||||||
*/
|
*/
|
||||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||||
if( output != null ) {
|
if (output != null) {
|
||||||
output.addAlignment(read);
|
output.addAlignment(read);
|
||||||
}
|
}
|
||||||
return output;
|
return output;
|
||||||
|
|
@ -548,20 +557,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do nothing
|
* Do nothing
|
||||||
|
*
|
||||||
* @param output The SAMFileWriter that outputs the bam file
|
* @param output The SAMFileWriter that outputs the bam file
|
||||||
*/
|
*/
|
||||||
public void onTraversalDone(SAMFileWriter output) {
|
public void onTraversalDone(SAMFileWriter output) {
|
||||||
if( numReadsWithMalformedColorSpace != 0 ) {
|
if (numReadsWithMalformedColorSpace != 0) {
|
||||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
}
|
||||||
|
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||||
"These reads were completely removed from the output bam file.");
|
"These reads were completely removed from the output bam file.");
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,57 +2,59 @@ package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||||
*/
|
*/
|
||||||
public class BaseUtils {
|
public class BaseUtils {
|
||||||
public final static byte A = (byte)'A';
|
public final static byte A = (byte) 'A';
|
||||||
public final static byte C = (byte)'C';
|
public final static byte C = (byte) 'C';
|
||||||
public final static byte G = (byte)'G';
|
public final static byte G = (byte) 'G';
|
||||||
public final static byte T = (byte)'T';
|
public final static byte T = (byte) 'T';
|
||||||
|
|
||||||
public final static byte N = (byte)'N';
|
public final static byte N = (byte) 'N';
|
||||||
public final static byte D = (byte)'D';
|
public final static byte D = (byte) 'D';
|
||||||
|
|
||||||
//
|
//
|
||||||
// todo -- we need a generalized base abstraction using the Base enum.
|
// todo -- we need a generalized base abstraction using the Base enum.
|
||||||
//
|
//
|
||||||
public final static byte[] BASES = { 'A', 'C', 'G', 'T' };
|
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
||||||
public final static byte[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
|
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
||||||
|
|
||||||
public enum Base {
|
public enum Base {
|
||||||
A ( 'A', 0 ),
|
A('A', 0),
|
||||||
C ( 'C', 1 ),
|
C('C', 1),
|
||||||
G ( 'G', 2 ),
|
G('G', 2),
|
||||||
T ( 'T', 3 );
|
T('T', 3);
|
||||||
|
|
||||||
byte b;
|
byte b;
|
||||||
int index;
|
int index;
|
||||||
|
|
||||||
private Base(char base, int index) {
|
private Base(char base, int index) {
|
||||||
this.b = (byte)base;
|
this.b = (byte) base;
|
||||||
this.index = index;
|
this.index = index;
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte getBase() { return b; }
|
public byte getBase() { return b; }
|
||||||
public char getBaseAsChar() { return (char)b; }
|
|
||||||
|
public char getBaseAsChar() { return (char) b; }
|
||||||
|
|
||||||
public int getIndex() { return index; }
|
public int getIndex() { return index; }
|
||||||
|
|
||||||
public boolean sameBase(byte o) { return b == o; }
|
public boolean sameBase(byte o) { return b == o; }
|
||||||
public boolean sameBase(char o) { return b == (byte)o; }
|
|
||||||
public boolean sameBase(int i) { return index == i; }
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public boolean sameBase(char o) { return b == (byte) o; }
|
||||||
|
|
||||||
|
public boolean sameBase(int i) { return index == i; }
|
||||||
|
}
|
||||||
|
|
||||||
// todo -- fix me (enums?)
|
// todo -- fix me (enums?)
|
||||||
public static final byte DELETION_INDEX = 4;
|
public static final byte DELETION_INDEX = 4;
|
||||||
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
||||||
|
|
||||||
public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte)'G');
|
public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G');
|
||||||
public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte)'C');
|
public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C');
|
||||||
public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte)'A');
|
public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A');
|
||||||
public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte)'T');
|
public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T');
|
||||||
|
|
||||||
|
|
||||||
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
||||||
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
||||||
|
|
@ -64,28 +66,31 @@ public class BaseUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the base substitution type of the 2 state SNP
|
* Returns the base substitution type of the 2 state SNP
|
||||||
|
*
|
||||||
* @param base1
|
* @param base1
|
||||||
* @param base2
|
* @param base2
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) {
|
public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) {
|
||||||
BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
|
BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
|
||||||
//System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
|
//System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isTransition( byte base1, byte base2 ) {
|
public static boolean isTransition(byte base1, byte base2) {
|
||||||
int b1 = simpleBaseToBaseIndex(base1);
|
int b1 = simpleBaseToBaseIndex(base1);
|
||||||
int b2 = simpleBaseToBaseIndex(base2);
|
int b2 = simpleBaseToBaseIndex(base2);
|
||||||
return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 ||
|
return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 ||
|
||||||
b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
|
b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isTransversion( byte base1, byte base2 ) {
|
public static boolean isTransversion(byte base1, byte base2) {
|
||||||
return ! isTransition(base1, base2);
|
return !isTransition(base1, base2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Private constructor. No instantiating this class! */
|
/**
|
||||||
|
* Private constructor. No instantiating this class!
|
||||||
|
*/
|
||||||
private BaseUtils() {}
|
private BaseUtils() {}
|
||||||
|
|
||||||
static public boolean basesAreEqual(byte base1, byte base2) {
|
static public boolean basesAreEqual(byte base1, byte base2) {
|
||||||
|
|
@ -96,7 +101,6 @@ public class BaseUtils {
|
||||||
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a IUPAC nucleotide code to a pair of bases
|
* Converts a IUPAC nucleotide code to a pair of bases
|
||||||
*
|
*
|
||||||
|
|
@ -163,33 +167,37 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Converts a simple base to a base index
|
* Converts a simple base to a base index
|
||||||
*
|
*
|
||||||
* @param base [AaCcGgTt]
|
* @param base [AaCcGgTt]
|
||||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
||||||
*/
|
*/
|
||||||
static public int simpleBaseToBaseIndex(byte base) {
|
static public int simpleBaseToBaseIndex(byte base) {
|
||||||
switch (base) {
|
switch (base) {
|
||||||
case '*': // the wildcard character counts as an A
|
case '*': // the wildcard character counts as an A
|
||||||
case 'A':
|
case 'A':
|
||||||
case 'a': return 0;
|
case 'a':
|
||||||
|
return 0;
|
||||||
|
|
||||||
case 'C':
|
case 'C':
|
||||||
case 'c': return 1;
|
case 'c':
|
||||||
|
return 1;
|
||||||
|
|
||||||
case 'G':
|
case 'G':
|
||||||
case 'g': return 2;
|
case 'g':
|
||||||
|
return 2;
|
||||||
|
|
||||||
case 'T':
|
case 'T':
|
||||||
case 't': return 3;
|
case 't':
|
||||||
|
return 3;
|
||||||
|
|
||||||
default: return -1;
|
default:
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a simple base to a base index
|
* Converts a simple base to a base index
|
||||||
*
|
*
|
||||||
* @param base [AaCcGgTt]
|
* @param base [AaCcGgTt]
|
||||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -197,29 +205,37 @@ public class BaseUtils {
|
||||||
switch (base) {
|
switch (base) {
|
||||||
case '*': // the wildcard character counts as an A
|
case '*': // the wildcard character counts as an A
|
||||||
case 'A':
|
case 'A':
|
||||||
case 'a': return 0;
|
case 'a':
|
||||||
|
return 0;
|
||||||
|
|
||||||
case 'C':
|
case 'C':
|
||||||
case 'c': return 1;
|
case 'c':
|
||||||
|
return 1;
|
||||||
|
|
||||||
case 'G':
|
case 'G':
|
||||||
case 'g': return 2;
|
case 'g':
|
||||||
|
return 2;
|
||||||
|
|
||||||
case 'T':
|
case 'T':
|
||||||
case 't': return 3;
|
case 't':
|
||||||
|
return 3;
|
||||||
|
|
||||||
default: return -1;
|
default:
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static public int extendedBaseToBaseIndex(byte base) {
|
static public int extendedBaseToBaseIndex(byte base) {
|
||||||
switch (base) {
|
switch (base) {
|
||||||
case 'd':
|
case 'd':
|
||||||
case 'D': return DELETION_INDEX;
|
case 'D':
|
||||||
|
return DELETION_INDEX;
|
||||||
case 'n':
|
case 'n':
|
||||||
case 'N': return NO_CALL_INDEX;
|
case 'N':
|
||||||
|
return NO_CALL_INDEX;
|
||||||
|
|
||||||
default: return simpleBaseToBaseIndex(base);
|
default:
|
||||||
|
return simpleBaseToBaseIndex(base);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,11 +248,6 @@ public class BaseUtils {
|
||||||
return simpleBaseToBaseIndex(base) != -1;
|
return simpleBaseToBaseIndex(base) != -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
static public boolean isNBase(char base) {
|
|
||||||
return isNBase((byte)base);
|
|
||||||
}
|
|
||||||
|
|
||||||
static public boolean isNBase(byte base) {
|
static public boolean isNBase(byte base) {
|
||||||
return base == 'N' || base == 'n';
|
return base == 'N' || base == 'n';
|
||||||
}
|
}
|
||||||
|
|
@ -244,68 +255,83 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Converts a base index to a simple base
|
* Converts a base index to a simple base
|
||||||
*
|
*
|
||||||
* @param baseIndex 0, 1, 2, 3
|
* @param baseIndex 0, 1, 2, 3
|
||||||
* @return A, C, G, T, or '.' if the index can't be understood
|
* @return A, C, G, T, or '.' if the index can't be understood
|
||||||
*/
|
*/
|
||||||
static public byte baseIndexToSimpleBase(int baseIndex) {
|
static public byte baseIndexToSimpleBase(int baseIndex) {
|
||||||
switch (baseIndex) {
|
switch (baseIndex) {
|
||||||
case 0: return 'A';
|
case 0:
|
||||||
case 1: return 'C';
|
return 'A';
|
||||||
case 2: return 'G';
|
case 1:
|
||||||
case 3: return 'T';
|
return 'C';
|
||||||
default: return '.';
|
case 2:
|
||||||
|
return 'G';
|
||||||
|
case 3:
|
||||||
|
return 'T';
|
||||||
|
default:
|
||||||
|
return '.';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
static public char baseIndexToSimpleBaseAsChar(int baseIndex) {
|
static public char baseIndexToSimpleBaseAsChar(int baseIndex) {
|
||||||
return (char)baseIndexToSimpleBase(baseIndex);
|
return (char) baseIndexToSimpleBase(baseIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a base index to a base index representing its cross-talk partner
|
* Converts a base index to a base index representing its cross-talk partner
|
||||||
*
|
*
|
||||||
* @param baseIndex 0, 1, 2, 3
|
* @param baseIndex 0, 1, 2, 3
|
||||||
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
||||||
*/
|
*/
|
||||||
static public int crossTalkPartnerIndex(int baseIndex) {
|
static public int crossTalkPartnerIndex(int baseIndex) {
|
||||||
switch (baseIndex) {
|
switch (baseIndex) {
|
||||||
case 0: return 1; // A -> C
|
case 0:
|
||||||
case 1: return 0; // C -> A
|
return 1; // A -> C
|
||||||
case 2: return 3; // G -> T
|
case 1:
|
||||||
case 3: return 2; // T -> G
|
return 0; // C -> A
|
||||||
default: return -1;
|
case 2:
|
||||||
|
return 3; // G -> T
|
||||||
|
case 3:
|
||||||
|
return 2; // T -> G
|
||||||
|
default:
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a base to the base representing its cross-talk partner
|
* Converts a base to the base representing its cross-talk partner
|
||||||
*
|
*
|
||||||
* @param base [AaCcGgTt]
|
* @param base [AaCcGgTt]
|
||||||
* @return C, A, T, G, or '.' if the base can't be understood
|
* @return C, A, T, G, or '.' if the base can't be understood
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
static public char crossTalkPartnerBase(char base) {
|
static public char crossTalkPartnerBase(char base) {
|
||||||
return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the complement of a base index.
|
* Return the complement of a base index.
|
||||||
*
|
*
|
||||||
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
||||||
* @return the complementary base index
|
* @return the complementary base index
|
||||||
*/
|
*/
|
||||||
static public byte complementIndex(int baseIndex) {
|
static public byte complementIndex(int baseIndex) {
|
||||||
switch (baseIndex) {
|
switch (baseIndex) {
|
||||||
case 0: return 3; // a -> t
|
case 0:
|
||||||
case 1: return 2; // c -> g
|
return 3; // a -> t
|
||||||
case 2: return 1; // g -> c
|
case 1:
|
||||||
case 3: return 0; // t -> a
|
return 2; // c -> g
|
||||||
default: return -1; // wtf?
|
case 2:
|
||||||
|
return 1; // g -> c
|
||||||
|
case 3:
|
||||||
|
return 0; // t -> a
|
||||||
|
default:
|
||||||
|
return -1; // wtf?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
||||||
*
|
*
|
||||||
* @param base the base [AaCcGgTt]
|
* @param base the base [AaCcGgTt]
|
||||||
|
|
@ -314,20 +340,25 @@ public class BaseUtils {
|
||||||
static public byte simpleComplement(byte base) {
|
static public byte simpleComplement(byte base) {
|
||||||
switch (base) {
|
switch (base) {
|
||||||
case 'A':
|
case 'A':
|
||||||
case 'a': return 'T';
|
case 'a':
|
||||||
|
return 'T';
|
||||||
case 'C':
|
case 'C':
|
||||||
case 'c': return 'G';
|
case 'c':
|
||||||
|
return 'G';
|
||||||
case 'G':
|
case 'G':
|
||||||
case 'g': return 'C';
|
case 'g':
|
||||||
|
return 'C';
|
||||||
case 'T':
|
case 'T':
|
||||||
case 't': return 'A';
|
case 't':
|
||||||
default: return base;
|
return 'A';
|
||||||
|
default:
|
||||||
|
return base;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
static public char simpleComplement(char base) {
|
static public char simpleComplement(char base) {
|
||||||
return (char)simpleComplement((byte)base);
|
return (char) simpleComplement((byte) base);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -349,7 +380,7 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
||||||
*
|
*
|
||||||
* @param bases the byte array of bases
|
* @param bases the byte array of bases
|
||||||
* @return the complement of the base byte array
|
* @return the complement of the base byte array
|
||||||
*/
|
*/
|
||||||
static public byte[] simpleComplement(byte[] bases) {
|
static public byte[] simpleComplement(byte[] bases) {
|
||||||
|
|
@ -382,7 +413,7 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Complement a char array of bases
|
* Complement a char array of bases
|
||||||
*
|
*
|
||||||
* @param bases the char array of bases
|
* @param bases the char array of bases
|
||||||
* @return the complement of the base char array
|
* @return the complement of the base char array
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -399,7 +430,7 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Reverse complement a String of bases. Preserves ambiguous bases.
|
* Reverse complement a String of bases. Preserves ambiguous bases.
|
||||||
*
|
*
|
||||||
* @param bases the String of bases
|
* @param bases the String of bases
|
||||||
* @return the reverse complement of the String
|
* @return the reverse complement of the String
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -407,11 +438,10 @@ public class BaseUtils {
|
||||||
return new String(simpleReverseComplement(bases.getBytes()));
|
return new String(simpleReverseComplement(bases.getBytes()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Complement a String of bases. Preserves ambiguous bases.
|
* Complement a String of bases. Preserves ambiguous bases.
|
||||||
*
|
*
|
||||||
* @param bases the String of bases
|
* @param bases the String of bases
|
||||||
* @return the complement of the String
|
* @return the complement of the String
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
|
@ -451,7 +481,7 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
|
* Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
|
||||||
*
|
*
|
||||||
* @param baseCounts counts of a,c,g,t in order.
|
* @param baseCounts counts of a,c,g,t in order.
|
||||||
* @return the most common base
|
* @return the most common base
|
||||||
*/
|
*/
|
||||||
static public byte mostFrequentSimpleBase(int[] baseCounts) {
|
static public byte mostFrequentSimpleBase(int[] baseCounts) {
|
||||||
|
|
@ -461,13 +491,13 @@ public class BaseUtils {
|
||||||
/**
|
/**
|
||||||
* For the most frequent base in the sequence, return the percentage of the read it constitutes.
|
* For the most frequent base in the sequence, return the percentage of the read it constitutes.
|
||||||
*
|
*
|
||||||
* @param sequence the read sequence
|
* @param sequence the read sequence
|
||||||
* @return the percentage of the read that's made up of the most frequent base
|
* @return the percentage of the read that's made up of the most frequent base
|
||||||
*/
|
*/
|
||||||
static public double mostFrequentBaseFraction(byte[] sequence) {
|
static public double mostFrequentBaseFraction(byte[] sequence) {
|
||||||
int[] baseCounts = new int[4];
|
int[] baseCounts = new int[4];
|
||||||
|
|
||||||
for ( byte base : sequence ) {
|
for (byte base : sequence) {
|
||||||
int baseIndex = simpleBaseToBaseIndex(base);
|
int baseIndex = simpleBaseToBaseIndex(base);
|
||||||
|
|
||||||
if (baseIndex >= 0) {
|
if (baseIndex >= 0) {
|
||||||
|
|
@ -477,7 +507,7 @@ public class BaseUtils {
|
||||||
|
|
||||||
int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
|
int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
|
||||||
|
|
||||||
return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length);
|
return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------
|
||||||
|
|
@ -531,50 +561,50 @@ public class BaseUtils {
|
||||||
static public byte getRandomBase(char excludeBase) {
|
static public byte getRandomBase(char excludeBase) {
|
||||||
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
/** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
||||||
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
||||||
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
||||||
* of 2 (it has a period of 4 as well), and so does
|
* of 2 (it has a period of 4 as well), and so does
|
||||||
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
||||||
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
||||||
* or if specified minPeriod is greater than the sequence length.
|
* or if specified minPeriod is greater than the sequence length.
|
||||||
*
|
*
|
||||||
* @param seq
|
* @param seq
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
||||||
int period = ( minPeriod > seq.length ? seq.length : minPeriod );
|
int period = (minPeriod > seq.length ? seq.length : minPeriod);
|
||||||
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
||||||
// until we find correct period
|
// until we find correct period
|
||||||
|
|
||||||
for ( int pos = period ; pos < seq.length ; pos++ ) {
|
for (int pos = period; pos < seq.length; pos++) {
|
||||||
|
|
||||||
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
|
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
|
||||||
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
||||||
|
|
||||||
if ( Character.toUpperCase( seq[pos] ) !=
|
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
|
||||||
Character.toUpperCase( seq[offset] )
|
|
||||||
) {
|
// period we have been trying so far does not work.
|
||||||
|
// two possibilities:
|
||||||
// period we have been trying so far does not work.
|
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
||||||
// two possibilities:
|
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
||||||
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
||||||
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
// pos will be autoincremented and we will be checking next base
|
||||||
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
||||||
// pos will be autoincremented and we will be checking next base
|
// hence we should first check if it matches the first base of the sequence, and to do that
|
||||||
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
||||||
// hence we should first check if it matches the first base of the sequence, and to do that
|
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
||||||
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
// on the next loop re-entrance after pos is autoincremented)
|
||||||
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
if (offset == 0)
|
||||||
// on the next loop re-entrance after pos is autoincremented)
|
period = pos + 1;
|
||||||
if ( offset == 0 ) period = pos+1;
|
else
|
||||||
else period = pos-- ;
|
period = pos--;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return period;
|
return period;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ public enum NGSPlatform {
|
||||||
/**
|
/**
|
||||||
* Returns the NGSPlatform corresponding to the PL tag in the read group
|
* Returns the NGSPlatform corresponding to the PL tag in the read group
|
||||||
* @param plFromRG -- the PL field (or equivalent) in a ReadGroup object
|
* @param plFromRG -- the PL field (or equivalent) in a ReadGroup object
|
||||||
* @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match
|
* @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match
|
||||||
*/
|
*/
|
||||||
public static final NGSPlatform fromReadGroupPL(final String plFromRG) {
|
public static final NGSPlatform fromReadGroupPL(final String plFromRG) {
|
||||||
if ( plFromRG == null ) return UNKNOWN;
|
if ( plFromRG == null ) return UNKNOWN;
|
||||||
|
|
@ -105,4 +105,14 @@ public enum NGSPlatform {
|
||||||
|
|
||||||
return UNKNOWN;
|
return UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* checks whether or not the requested platform is listed in the set (and is not unknown)
|
||||||
|
*
|
||||||
|
* @param platform the read group string that describes the platform used
|
||||||
|
* @return true if the platform is known (i.e. it's in the list and is not UNKNOWN)
|
||||||
|
*/
|
||||||
|
public static final boolean isKnown (final String platform) {
|
||||||
|
return fromReadGroupPL(platform) != UNKNOWN;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
for (int i = 0; i < reads.size(); i++) {
|
for (int i = 0; i < reads.size(); i++) {
|
||||||
GATKSAMRecord read = reads.get(i);
|
GATKSAMRecord read = reads.get(i);
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||||
}
|
}
|
||||||
|
|
||||||
return pileup;
|
return pileup;
|
||||||
|
|
@ -196,7 +196,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
|
|
||||||
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
|
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
|
||||||
for (GATKSAMRecord read : reads) {
|
for (GATKSAMRecord read : reads) {
|
||||||
pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||||
}
|
}
|
||||||
|
|
||||||
return pileup;
|
return pileup;
|
||||||
|
|
@ -204,7 +204,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
|
|
||||||
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
|
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
|
||||||
|
|
||||||
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip);
|
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip);
|
||||||
|
|
||||||
// --------------------------------------------------------
|
// --------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement {
|
||||||
|
|
||||||
|
|
||||||
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
|
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
|
||||||
super(read, offset, type == Type.DELETION, false, false); // extended events are slated for removal
|
super(read, offset, type == Type.DELETION, false, false, false); // extended events are slated for removal
|
||||||
this.read = read;
|
this.read = read;
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.eventLength = eventLength;
|
this.eventLength = eventLength;
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
protected final GATKSAMRecord read;
|
protected final GATKSAMRecord read;
|
||||||
protected final int offset;
|
protected final int offset;
|
||||||
protected final boolean isDeletion;
|
protected final boolean isDeletion;
|
||||||
|
protected final boolean isBeforeDeletion;
|
||||||
protected final boolean isBeforeInsertion;
|
protected final boolean isBeforeInsertion;
|
||||||
protected final boolean isNextToSoftClip;
|
protected final boolean isNextToSoftClip;
|
||||||
|
|
||||||
|
|
@ -33,6 +34,7 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
* @param read the read we are adding to the pileup
|
* @param read the read we are adding to the pileup
|
||||||
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
|
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
|
||||||
* @param isDeletion whether or not this base is a deletion
|
* @param isDeletion whether or not this base is a deletion
|
||||||
|
* @param isBeforeDeletion whether or not this base is before a deletion
|
||||||
* @param isBeforeInsertion whether or not this base is before an insertion
|
* @param isBeforeInsertion whether or not this base is before an insertion
|
||||||
* @param isNextToSoftClip whether or not this base is next to a soft clipped base
|
* @param isNextToSoftClip whether or not this base is next to a soft clipped base
|
||||||
*/
|
*/
|
||||||
|
|
@ -40,13 +42,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
"read != null",
|
"read != null",
|
||||||
"offset >= -1",
|
"offset >= -1",
|
||||||
"offset <= read.getReadLength()"})
|
"offset <= read.getReadLength()"})
|
||||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) {
|
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) {
|
||||||
if (offset < 0 && isDeletion)
|
if (offset < 0 && isDeletion)
|
||||||
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
|
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
|
||||||
|
|
||||||
this.read = read;
|
this.read = read;
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.isDeletion = isDeletion;
|
this.isDeletion = isDeletion;
|
||||||
|
this.isBeforeDeletion = isBeforeDeletion;
|
||||||
this.isBeforeInsertion = isBeforeInsertion;
|
this.isBeforeInsertion = isBeforeInsertion;
|
||||||
this.isNextToSoftClip = isNextToSoftClip;
|
this.isNextToSoftClip = isNextToSoftClip;
|
||||||
}
|
}
|
||||||
|
|
@ -55,6 +58,10 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
return isDeletion;
|
return isDeletion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isBeforeDeletion() {
|
||||||
|
return isBeforeDeletion;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isBeforeInsertion() {
|
public boolean isBeforeInsertion() {
|
||||||
return isBeforeInsertion;
|
return isBeforeInsertion;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -96,7 +96,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup<
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
||||||
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
||||||
return new PileupElement(read, offset, isDeletion, isBeforeInsertion, isNextToSoftClip);
|
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -361,10 +361,10 @@ public class ArtificialSAMUtils {
|
||||||
final GATKSAMRecord left = pair.get(0);
|
final GATKSAMRecord left = pair.get(0);
|
||||||
final GATKSAMRecord right = pair.get(1);
|
final GATKSAMRecord right = pair.get(1);
|
||||||
|
|
||||||
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false));
|
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false));
|
||||||
|
|
||||||
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
|
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
|
||||||
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false));
|
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ import org.testng.annotations.DataProvider;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
public class GATKReportUnitTest extends BaseTest {
|
public class GATKReportUnitTest extends BaseTest {
|
||||||
@Test
|
@Test(enabled = false)
|
||||||
public void testParse() throws Exception {
|
public void testParse() throws Exception {
|
||||||
String reportPath = validationDataLocation + "exampleGATKReport.eval";
|
String reportPath = validationDataLocation + "exampleGATKReport.eval";
|
||||||
GATKReport report = new GATKReport(reportPath);
|
GATKReport report = new GATKReport(reportPath);
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReducedReadPileupElement() {
|
public void testReducedReadPileupElement() {
|
||||||
PileupElement readp = new PileupElement(read, 0, false, false, false);
|
PileupElement readp = new PileupElement(read, 0, false, false, false, false);
|
||||||
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false);
|
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false);
|
||||||
|
|
||||||
Assert.assertFalse(readp.getRead().isReducedRead());
|
Assert.assertFalse(readp.getRead().isReducedRead());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import collection.JavaConversions._
|
import collection.JavaConversions._
|
||||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils}
|
||||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
||||||
import net.sf.samtools.SAMFileHeader
|
import net.sf.samtools.SAMFileHeader
|
||||||
import java.util.Collections
|
import java.util.Collections
|
||||||
|
|
@ -50,6 +50,8 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) {
|
||||||
IntervalUtils.parseIntervalArguments(parser, intervals)
|
IntervalUtils.parseIntervalArguments(parser, intervals)
|
||||||
Collections.sort(parsedLocs)
|
Collections.sort(parsedLocs)
|
||||||
Collections.unmodifiableList(parsedLocs)
|
Collections.unmodifiableList(parsedLocs)
|
||||||
|
val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY)
|
||||||
|
Collections.unmodifiableList(mergedLocs)
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy val contigs = locs.map(_.getContig).distinct.toSeq
|
lazy val contigs = locs.map(_.getContig).distinct.toSeq
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
||||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile
|
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile
|
||||||
import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser}
|
import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser}
|
||||||
import collection.JavaConversions._
|
import collection.JavaConversions._
|
||||||
|
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||||
|
|
||||||
class GATKIntervalsUnitTest {
|
class GATKIntervalsUnitTest {
|
||||||
private final lazy val hg18Reference = new File(BaseTest.hg18Reference)
|
private final lazy val hg18Reference = new File(BaseTest.hg18Reference)
|
||||||
|
|
@ -60,7 +61,7 @@ class GATKIntervalsUnitTest {
|
||||||
// for(Item item: javaConvertedScalaList)
|
// for(Item item: javaConvertedScalaList)
|
||||||
// This for loop is actually an O(N^2) operation as the iterator calls the
|
// This for loop is actually an O(N^2) operation as the iterator calls the
|
||||||
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
||||||
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
Assert.assertEquals(IntervalUtils.splitFixedIntervals(gi.locs, 189894).size(), 189894)
|
||||||
Assert.assertEquals(gi.contigs.size, 24)
|
Assert.assertEquals(gi.contigs.size, 24)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -77,4 +78,17 @@ class GATKIntervalsUnitTest {
|
||||||
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3"))
|
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3"))
|
||||||
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3"))
|
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testSortAndMergeIntervals() {
|
||||||
|
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10"))
|
||||||
|
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12"))
|
||||||
|
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"))
|
||||||
|
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30"))
|
||||||
|
testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30"))
|
||||||
|
}
|
||||||
|
|
||||||
|
private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) {
|
||||||
|
Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_)))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue