Solve merge conflicts
This commit is contained in:
commit
c1ea53d088
|
|
@ -0,0 +1,762 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package net.sf.samtools;
|
||||||
|
|
||||||
|
|
||||||
|
import net.sf.samtools.util.*;
|
||||||
|
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal class for reading and querying BAM files.
|
||||||
|
*/
|
||||||
|
class BAMFileReader extends SAMFileReader.ReaderImplementation {
|
||||||
|
// True if reading from a File rather than an InputStream
|
||||||
|
private boolean mIsSeekable = false;
|
||||||
|
|
||||||
|
// For converting bytes into other primitive types
|
||||||
|
private BinaryCodec mStream = null;
|
||||||
|
|
||||||
|
// Underlying compressed data stream.
|
||||||
|
private final BAMInputStream mInputStream;
|
||||||
|
private SAMFileHeader mFileHeader = null;
|
||||||
|
|
||||||
|
// Populated if the file is seekable and an index exists
|
||||||
|
private File mIndexFile;
|
||||||
|
private BAMIndex mIndex = null;
|
||||||
|
private long mFirstRecordPointer = 0;
|
||||||
|
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||||
|
|
||||||
|
// If true, all SAMRecords are fully decoded as they are read.
|
||||||
|
private final boolean eagerDecode;
|
||||||
|
|
||||||
|
// For error-checking.
|
||||||
|
private ValidationStringency mValidationStringency;
|
||||||
|
|
||||||
|
// For creating BAMRecords
|
||||||
|
private SAMRecordFactory samRecordFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use the caching index reader implementation rather than the disk-hit-per-file model.
|
||||||
|
*/
|
||||||
|
private boolean mEnableIndexCaching = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
|
||||||
|
*/
|
||||||
|
private boolean mEnableIndexMemoryMapping = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add information about the origin (reader and position) to SAM records.
|
||||||
|
*/
|
||||||
|
private SAMFileReader mFileReader = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to read BAM from a stream (not seekable)
|
||||||
|
* @param stream source of bytes.
|
||||||
|
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||||
|
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
||||||
|
*/
|
||||||
|
BAMFileReader(final InputStream stream,
|
||||||
|
final File indexFile,
|
||||||
|
final boolean eagerDecode,
|
||||||
|
final ValidationStringency validationStringency,
|
||||||
|
final SAMRecordFactory factory)
|
||||||
|
throws IOException {
|
||||||
|
mIndexFile = indexFile;
|
||||||
|
mIsSeekable = false;
|
||||||
|
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
|
||||||
|
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
|
||||||
|
this.eagerDecode = eagerDecode;
|
||||||
|
this.mValidationStringency = validationStringency;
|
||||||
|
this.samRecordFactory = factory;
|
||||||
|
readHeader(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to read BAM from a file (seekable)
|
||||||
|
* @param file source of bytes.
|
||||||
|
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||||
|
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
||||||
|
*/
|
||||||
|
BAMFileReader(final File file,
|
||||||
|
final File indexFile,
|
||||||
|
final boolean eagerDecode,
|
||||||
|
final ValidationStringency validationStringency,
|
||||||
|
final SAMRecordFactory factory)
|
||||||
|
throws IOException {
|
||||||
|
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
|
||||||
|
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
|
||||||
|
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
|
||||||
|
" is older than BAM " + file.getAbsolutePath());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Prepare to read BAM from a seekable stream.
 *
 * @param strm source of bytes; used directly if it already is a BAMInputStream, otherwise wrapped
 *             in a BlockCompressedInputStream. Its source string is used for error reporting.
 * @param indexFile index file to associate with this reader; may be null.
 * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
 * @param validationStringency controls how to handle invalid reads or header lines.
 * @param factory factory used for creating records.
 * @throws IOException if the BAM header cannot be read.
 */
BAMFileReader(final SeekableStream strm,
              final File indexFile,
              final boolean eagerDecode,
              final ValidationStringency validationStringency,
              final SAMRecordFactory factory)
    throws IOException {
    this(toBamInputStream(strm), indexFile, eagerDecode, strm.getSource(), validationStringency, factory);
}

/**
 * Returns the given stream as a BAMInputStream, wrapping it only when it is not one already.
 */
private static BAMInputStream toBamInputStream(final SeekableStream strm) throws IOException {
    if (strm instanceof BAMInputStream) {
        return (BAMInputStream) strm;
    }
    return new BlockCompressedInputStream(strm);
}
|
||||||
|
|
||||||
|
private BAMFileReader(final BAMInputStream inputStream,
|
||||||
|
final File indexFile,
|
||||||
|
final boolean eagerDecode,
|
||||||
|
final String source,
|
||||||
|
final ValidationStringency validationStringency,
|
||||||
|
final SAMRecordFactory factory)
|
||||||
|
throws IOException {
|
||||||
|
mIndexFile = indexFile;
|
||||||
|
mIsSeekable = true;
|
||||||
|
mInputStream = inputStream;
|
||||||
|
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
|
||||||
|
this.eagerDecode = eagerDecode;
|
||||||
|
this.mValidationStringency = validationStringency;
|
||||||
|
this.samRecordFactory = factory;
|
||||||
|
readHeader(source);
|
||||||
|
mFirstRecordPointer = inputStream.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If true, writes the source of every read into the source SAMRecords.
|
||||||
|
* @param enabled true to write source information into each SAMRecord.
|
||||||
|
*/
|
||||||
|
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
|
||||||
|
this.mFileReader = enabled ? reader : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If true, uses the caching version of the index reader.
|
||||||
|
* @param enabled true to write source information into each SAMRecord.
|
||||||
|
*/
|
||||||
|
public void enableIndexCaching(final boolean enabled) {
|
||||||
|
if(mIndex != null)
|
||||||
|
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
|
||||||
|
this.mEnableIndexCaching = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
|
||||||
|
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
|
||||||
|
* @param enabled True to use memory mapping, false to use regular I/O.
|
||||||
|
*/
|
||||||
|
public void enableIndexMemoryMapping(final boolean enabled) {
|
||||||
|
if (mIndex != null) {
|
||||||
|
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
|
||||||
|
}
|
||||||
|
this.mEnableIndexMemoryMapping = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override void enableCrcChecking(final boolean enabled) {
|
||||||
|
this.mInputStream.setCheckCrcs(enabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the factory used for creating records; iterators created afterwards pick it up.
 *
 * @param factory the new record factory.
 */
@Override
void setSAMRecordFactory(final SAMRecordFactory factory) {
    samRecordFactory = factory;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if ths is a BAM file, and has an index
|
||||||
|
*/
|
||||||
|
public boolean hasIndex() {
|
||||||
|
return (mIndexFile != null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
|
||||||
|
* @return An index of the given type.
|
||||||
|
*/
|
||||||
|
public BAMIndex getIndex() {
|
||||||
|
if(mIndexFile == null)
|
||||||
|
throw new SAMException("No index is available for this BAM file.");
|
||||||
|
if(mIndex == null)
|
||||||
|
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
|
||||||
|
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
|
||||||
|
return mIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
void close() {
|
||||||
|
if (mStream != null) {
|
||||||
|
mStream.close();
|
||||||
|
}
|
||||||
|
if (mIndex != null) {
|
||||||
|
mIndex.close();
|
||||||
|
}
|
||||||
|
mStream = null;
|
||||||
|
mFileHeader = null;
|
||||||
|
mIndex = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
SAMFileHeader getFileHeader() {
|
||||||
|
return mFileHeader;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set error-checking level for subsequent SAMRecord reads.
|
||||||
|
*/
|
||||||
|
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
||||||
|
this.mValidationStringency = validationStringency;
|
||||||
|
}
|
||||||
|
|
||||||
|
SAMFileReader.ValidationStringency getValidationStringency() {
|
||||||
|
return this.mValidationStringency;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to iterate through the SAMRecords in file order.
|
||||||
|
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
||||||
|
* that iterator must be closed before getIterator() can be called again.
|
||||||
|
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
||||||
|
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
||||||
|
* done in that situation.
|
||||||
|
*/
|
||||||
|
CloseableIterator<SAMRecord> getIterator() {
|
||||||
|
if (mStream == null) {
|
||||||
|
throw new IllegalStateException("File reader is closed");
|
||||||
|
}
|
||||||
|
if (mCurrentIterator != null) {
|
||||||
|
throw new IllegalStateException("Iteration in progress");
|
||||||
|
}
|
||||||
|
if (mIsSeekable) {
|
||||||
|
try {
|
||||||
|
mInputStream.seek(mFirstRecordPointer);
|
||||||
|
} catch (IOException exc) {
|
||||||
|
throw new RuntimeException(exc.getMessage(), exc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mCurrentIterator = new BAMFileIterator();
|
||||||
|
return mCurrentIterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
|
||||||
|
if (mStream == null) {
|
||||||
|
throw new IllegalStateException("File reader is closed");
|
||||||
|
}
|
||||||
|
if (mCurrentIterator != null) {
|
||||||
|
throw new IllegalStateException("Iteration in progress");
|
||||||
|
}
|
||||||
|
if (!(chunks instanceof BAMFileSpan)) {
|
||||||
|
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create an iterator over the given chunk boundaries.
|
||||||
|
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
|
||||||
|
return mCurrentIterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
|
||||||
|
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
|
||||||
|
* the rightmost bound is guaranteed to be after the last read in the file.
|
||||||
|
* @return An unbounded pointer to the first record in the BAM file.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
SAMFileSpan getFilePointerSpanningReads() {
|
||||||
|
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to iterate through the SAMRecords that match the given interval.
|
||||||
|
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||||
|
* before calling any of the methods that return an iterator.
|
||||||
|
*
|
||||||
|
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||||
|
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||||
|
* matches the specified interval.
|
||||||
|
*
|
||||||
|
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||||
|
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||||
|
*
|
||||||
|
* @param sequence Reference sequence sought.
|
||||||
|
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
||||||
|
* A value of zero implies the start of the reference sequence.
|
||||||
|
* @param end A value of zero implies the end of the reference sequence.
|
||||||
|
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
||||||
|
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
||||||
|
* @return Iterator for the matching SAMRecords
|
||||||
|
*/
|
||||||
|
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||||
|
if (mStream == null) {
|
||||||
|
throw new IllegalStateException("File reader is closed");
|
||||||
|
}
|
||||||
|
if (mCurrentIterator != null) {
|
||||||
|
throw new IllegalStateException("Iteration in progress");
|
||||||
|
}
|
||||||
|
if (!mIsSeekable) {
|
||||||
|
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||||
|
}
|
||||||
|
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
||||||
|
return mCurrentIterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to iterate through the SAMRecords with the given alignment start.
|
||||||
|
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||||
|
* before calling any of the methods that return an iterator.
|
||||||
|
*
|
||||||
|
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||||
|
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||||
|
* matches the specified interval.
|
||||||
|
*
|
||||||
|
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||||
|
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||||
|
*
|
||||||
|
* @param sequence Reference sequence sought.
|
||||||
|
* @param start Alignment start sought.
|
||||||
|
* @return Iterator for the matching SAMRecords.
|
||||||
|
*/
|
||||||
|
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||||
|
if (mStream == null) {
|
||||||
|
throw new IllegalStateException("File reader is closed");
|
||||||
|
}
|
||||||
|
if (mCurrentIterator != null) {
|
||||||
|
throw new IllegalStateException("Iteration in progress");
|
||||||
|
}
|
||||||
|
if (!mIsSeekable) {
|
||||||
|
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||||
|
}
|
||||||
|
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
||||||
|
return mCurrentIterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||||
|
if (mStream == null) {
|
||||||
|
throw new IllegalStateException("File reader is closed");
|
||||||
|
}
|
||||||
|
if (mCurrentIterator != null) {
|
||||||
|
throw new IllegalStateException("Iteration in progress");
|
||||||
|
}
|
||||||
|
if (!mIsSeekable) {
|
||||||
|
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
|
||||||
|
if (startOfLastLinearBin != -1) {
|
||||||
|
mInputStream.seek(startOfLastLinearBin);
|
||||||
|
} else {
|
||||||
|
// No mapped reads in file, just start at the first read in file.
|
||||||
|
mInputStream.seek(mFirstRecordPointer);
|
||||||
|
}
|
||||||
|
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
||||||
|
return mCurrentIterator;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads the header from the file or stream
|
||||||
|
* @param source Note that this is used only for reporting errors.
|
||||||
|
*/
|
||||||
|
private void readHeader(final String source)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
final byte[] buffer = new byte[4];
|
||||||
|
mStream.readBytes(buffer);
|
||||||
|
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||||
|
throw new IOException("Invalid BAM file header");
|
||||||
|
}
|
||||||
|
|
||||||
|
final int headerTextLength = mStream.readInt();
|
||||||
|
final String textHeader = mStream.readString(headerTextLength);
|
||||||
|
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||||
|
headerCodec.setValidationStringency(mValidationStringency);
|
||||||
|
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
||||||
|
source);
|
||||||
|
|
||||||
|
final int sequenceCount = mStream.readInt();
|
||||||
|
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
||||||
|
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||||
|
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
||||||
|
throw new SAMFormatException("Number of sequences in text header (" +
|
||||||
|
mFileHeader.getSequenceDictionary().size() +
|
||||||
|
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < sequenceCount; i++) {
|
||||||
|
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
||||||
|
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
||||||
|
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||||
|
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||||
|
source);
|
||||||
|
}
|
||||||
|
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||||
|
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||||
|
source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If only binary sequences are present, copy them into mFileHeader
|
||||||
|
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||||
|
for (int i = 0; i < sequenceCount; i++) {
|
||||||
|
sequences.add(readSequenceRecord(source));
|
||||||
|
}
|
||||||
|
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a single binary sequence record from the file or stream
|
||||||
|
* @param source Note that this is used only for reporting errors.
|
||||||
|
*/
|
||||||
|
private SAMSequenceRecord readSequenceRecord(final String source) {
|
||||||
|
final int nameLength = mStream.readInt();
|
||||||
|
if (nameLength <= 1) {
|
||||||
|
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
||||||
|
}
|
||||||
|
final String sequenceName = mStream.readString(nameLength - 1);
|
||||||
|
// Skip the null terminator
|
||||||
|
mStream.readByte();
|
||||||
|
final int sequenceLength = mStream.readInt();
|
||||||
|
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
||||||
|
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
||||||
|
*/
|
||||||
|
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
||||||
|
private SAMRecord mNextRecord = null;
|
||||||
|
private final BAMRecordCodec bamRecordCodec;
|
||||||
|
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
||||||
|
|
||||||
|
BAMFileIterator() {
|
||||||
|
this(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param advance Trick to enable subclass to do more setup before advancing
|
||||||
|
*/
|
||||||
|
BAMFileIterator(final boolean advance) {
|
||||||
|
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
|
||||||
|
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
|
||||||
|
|
||||||
|
if (advance) {
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
if (mCurrentIterator != null && this != mCurrentIterator) {
|
||||||
|
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||||
|
}
|
||||||
|
mCurrentIterator = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (mNextRecord != null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord next() {
|
||||||
|
final SAMRecord result = mNextRecord;
|
||||||
|
advance();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Not supported: remove");
|
||||||
|
}
|
||||||
|
|
||||||
|
void advance() {
|
||||||
|
try {
|
||||||
|
mNextRecord = getNextRecord();
|
||||||
|
|
||||||
|
if (mNextRecord != null) {
|
||||||
|
++this.samRecordIndex;
|
||||||
|
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
||||||
|
mNextRecord.setValidationStringency(mValidationStringency);
|
||||||
|
|
||||||
|
if (mValidationStringency != ValidationStringency.SILENT) {
|
||||||
|
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
||||||
|
SAMUtils.processValidationErrors(validationErrors,
|
||||||
|
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (eagerDecode && mNextRecord != null) {
|
||||||
|
mNextRecord.eagerDecode();
|
||||||
|
}
|
||||||
|
} catch (IOException exc) {
|
||||||
|
throw new RuntimeException(exc.getMessage(), exc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the next record from the input stream.
|
||||||
|
*/
|
||||||
|
SAMRecord getNextRecord() throws IOException {
|
||||||
|
final long startCoordinate = mInputStream.getFilePointer();
|
||||||
|
final SAMRecord next = bamRecordCodec.decode();
|
||||||
|
final long stopCoordinate = mInputStream.getFilePointer();
|
||||||
|
|
||||||
|
if(mFileReader != null && next != null)
|
||||||
|
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
|
||||||
|
|
||||||
|
return next;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The record that will be return by the next call to next()
|
||||||
|
*/
|
||||||
|
protected SAMRecord peek() {
|
||||||
|
return mNextRecord;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to iterate through SAMRecords matching the target interval.
|
||||||
|
* @param sequence Desired reference sequence.
|
||||||
|
* @param start 1-based start of target interval, inclusive.
|
||||||
|
* @param end 1-based end of target interval, inclusive.
|
||||||
|
* @param queryType contained, overlapping, or starting-at query.
|
||||||
|
*/
|
||||||
|
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
||||||
|
final int start,
|
||||||
|
final int end,
|
||||||
|
final QueryType queryType) {
|
||||||
|
long[] filePointers = null;
|
||||||
|
|
||||||
|
// Hit the index to determine the chunk boundaries for the required data.
|
||||||
|
final SAMFileHeader fileHeader = getFileHeader();
|
||||||
|
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||||
|
if (referenceIndex != -1) {
|
||||||
|
final BAMIndex fileIndex = getIndex();
|
||||||
|
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
|
||||||
|
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create an iterator over the above chunk boundaries.
|
||||||
|
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
||||||
|
|
||||||
|
// Add some preprocessing filters for edge-case reads that don't fit into this
|
||||||
|
// query type.
|
||||||
|
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * The kinds of interval query supported by the filtering iterator.
 */
enum QueryType {
    /** The alignment must lie completely within the query region. */
    CONTAINED,
    /** The alignment need only overlap the query region. */
    OVERLAPPING,
    /** The alignment start must equal the query start. */
    STARTING_AT
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Look for BAM index file according to standard naming convention.
|
||||||
|
*
|
||||||
|
* @param dataFile BAM file name.
|
||||||
|
* @return Index file name, or null if not found.
|
||||||
|
*/
|
||||||
|
private static File findIndexFile(final File dataFile) {
|
||||||
|
// If input is foo.bam, look for foo.bai
|
||||||
|
final String bamExtension = ".bam";
|
||||||
|
File indexFile;
|
||||||
|
final String fileName = dataFile.getName();
|
||||||
|
if (fileName.endsWith(bamExtension)) {
|
||||||
|
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
|
||||||
|
indexFile = new File(dataFile.getParent(), bai);
|
||||||
|
if (indexFile.exists()) {
|
||||||
|
return indexFile;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If foo.bai doesn't exist look for foo.bam.bai
|
||||||
|
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
||||||
|
if (indexFile.exists()) {
|
||||||
|
return indexFile;
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class BAMFileIndexIterator extends BAMFileIterator {
|
||||||
|
|
||||||
|
private long[] mFilePointers = null;
|
||||||
|
private int mFilePointerIndex = 0;
|
||||||
|
private long mFilePointerLimit = -1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
|
||||||
|
* @param filePointers the block / offset combination, stored in chunk format.
|
||||||
|
*/
|
||||||
|
BAMFileIndexIterator(final long[] filePointers) {
|
||||||
|
super(false); // delay advance() until after construction
|
||||||
|
mFilePointers = filePointers;
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
SAMRecord getNextRecord()
|
||||||
|
throws IOException {
|
||||||
|
// Advance to next file block if necessary
|
||||||
|
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||||
|
if (mFilePointers == null ||
|
||||||
|
mFilePointerIndex >= mFilePointers.length) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
final long startOffset = mFilePointers[mFilePointerIndex++];
|
||||||
|
final long endOffset = mFilePointers[mFilePointerIndex++];
|
||||||
|
mInputStream.seek(startOffset);
|
||||||
|
mFilePointerLimit = endOffset;
|
||||||
|
}
|
||||||
|
// Pull next record from stream
|
||||||
|
return super.getNextRecord();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A decorating iterator that filters out records that are outside the bounds of the
|
||||||
|
* given query parameters.
|
||||||
|
*/
|
||||||
|
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||||
|
/**
|
||||||
|
* The wrapped iterator.
|
||||||
|
*/
|
||||||
|
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The next record to be returned. Will be null if no such record exists.
|
||||||
|
*/
|
||||||
|
private SAMRecord mNextRecord;
|
||||||
|
|
||||||
|
private final int mReferenceIndex;
|
||||||
|
private final int mRegionStart;
|
||||||
|
private final int mRegionEnd;
|
||||||
|
private final QueryType mQueryType;
|
||||||
|
|
||||||
|
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
||||||
|
this.wrappedIterator = iterator;
|
||||||
|
final SAMFileHeader fileHeader = getFileHeader();
|
||||||
|
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||||
|
mRegionStart = start;
|
||||||
|
if (queryType == QueryType.STARTING_AT) {
|
||||||
|
mRegionEnd = mRegionStart;
|
||||||
|
} else {
|
||||||
|
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
||||||
|
}
|
||||||
|
mQueryType = queryType;
|
||||||
|
mNextRecord = advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if a next element exists; false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return mNextRecord != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the next record from the given iterator.
|
||||||
|
* @return The next SAM record in the iterator.
|
||||||
|
*/
|
||||||
|
public SAMRecord next() {
|
||||||
|
if(!hasNext())
|
||||||
|
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
||||||
|
final SAMRecord currentRead = mNextRecord;
|
||||||
|
mNextRecord = advance();
|
||||||
|
return currentRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes down the existing iterator.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
if (this != mCurrentIterator) {
|
||||||
|
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||||
|
}
|
||||||
|
mCurrentIterator = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @throws UnsupportedOperationException always.
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Not supported: remove");
|
||||||
|
}
|
||||||
|
|
||||||
|
SAMRecord advance() {
|
||||||
|
while (true) {
|
||||||
|
// Pull next record from stream
|
||||||
|
if(!wrappedIterator.hasNext())
|
||||||
|
return null;
|
||||||
|
|
||||||
|
final SAMRecord record = wrappedIterator.next();
|
||||||
|
// If beyond the end of this reference sequence, end iteration
|
||||||
|
final int referenceIndex = record.getReferenceIndex();
|
||||||
|
if (referenceIndex != mReferenceIndex) {
|
||||||
|
if (referenceIndex < 0 ||
|
||||||
|
referenceIndex > mReferenceIndex) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// If before this reference sequence, continue
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
||||||
|
// Quick exit to avoid expensive alignment end calculation
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
final int alignmentStart = record.getAlignmentStart();
|
||||||
|
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
||||||
|
// the query region, regardless of whether the mapped mate will be returned.
|
||||||
|
final int alignmentEnd;
|
||||||
|
if (mQueryType == QueryType.STARTING_AT) {
|
||||||
|
alignmentEnd = -1;
|
||||||
|
} else {
|
||||||
|
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
||||||
|
record.getAlignmentEnd(): alignmentStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (alignmentStart > mRegionEnd) {
|
||||||
|
// If scanned beyond target region, end iteration
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// Filter for overlap with region
|
||||||
|
if (mQueryType == QueryType.CONTAINED) {
|
||||||
|
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
} else if (mQueryType == QueryType.OVERLAPPING) {
|
||||||
|
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (alignmentStart == mRegionStart) {
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
||||||
|
private BAMFileIndexUnmappedIterator() {
|
||||||
|
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
package net.sf.samtools;
|
package net.sf.samtools;
|
||||||
|
|
||||||
import net.sf.picard.util.PeekableIterator;
|
import net.sf.picard.util.PeekableIterator;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new GATKBAMFileSpan from an existing BAMFileSpan.
|
||||||
|
* @param sourceFileSpan
|
||||||
|
*/
|
||||||
|
public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
|
||||||
|
if(!(sourceFileSpan instanceof BAMFileSpan))
|
||||||
|
throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
|
||||||
|
BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
|
||||||
|
for(Chunk chunk: sourceBAMFileSpan.getChunks())
|
||||||
|
add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convenience constructor to construct a BAM file span from
|
* Convenience constructor to construct a BAM file span from
|
||||||
* a single chunk.
|
* a single chunk.
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
|
||||||
super.setChunkEnd(value);
|
super.setChunkEnd(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long getBlockStart() {
|
||||||
|
return getChunkStart() >>> 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBlockOffsetStart() {
|
||||||
|
return (int)(getChunkStart() & 0xFFFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getBlockEnd() {
|
||||||
|
return getChunkEnd() >>> 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBlockOffsetEnd() {
|
||||||
|
return ((int)getChunkEnd() & 0xFFFF);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes an approximation of the uncompressed size of the
|
* Computes an approximation of the uncompressed size of the
|
||||||
* chunk, in bytes. Can be used to determine relative weights
|
* chunk, in bytes. Can be used to determine relative weights
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package net.sf.samtools.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
 * An input stream formulated for use reading BAM files.  Supports seeking to, and
 * reporting, virtual file pointers alongside the usual stream read operations.
 */
public interface BAMInputStream {
    /**
     * Seek to the given position in the file.  Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     * @throws IOException if the seek cannot be performed.
     */
    void seek(final long pos) throws IOException;

    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position.  This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    long getFilePointer();

    /**
     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
     * and check it against the value stored in the GZIP header.  CRC checking is an expensive
     * operation and should be used accordingly.
     *
     * @param check true to enable CRC checking, false to disable it.
     */
    void setCheckCrcs(final boolean check);

    // The remaining methods are declared with the same names and signatures as the
    // corresponding methods of java.io.InputStream.

    int read() throws IOException;

    int read(byte[] bytes) throws IOException;

    int read(byte[] bytes, int i, int i1) throws IOException;

    long skip(long l) throws IOException;

    int available() throws IOException;

    void close() throws IOException;

    void mark(int i);

    void reset() throws IOException;

    boolean markSupported();
}
|
||||||
|
|
@ -0,0 +1,483 @@
|
||||||
|
/*
|
||||||
|
* The MIT License
|
||||||
|
*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
* THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package net.sf.samtools.util;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import net.sf.samtools.FileTruncatedException;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
|
||||||
|
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
|
||||||
|
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
|
||||||
|
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
|
||||||
|
*
|
||||||
|
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
|
||||||
|
*/
|
||||||
|
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
|
||||||
|
private InputStream mStream = null;
|
||||||
|
private SeekableStream mFile = null;
|
||||||
|
private byte[] mFileBuffer = null;
|
||||||
|
private byte[] mCurrentBlock = null;
|
||||||
|
private int mCurrentOffset = 0;
|
||||||
|
private long mBlockAddress = 0;
|
||||||
|
private int mLastBlockLength = 0;
|
||||||
|
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note that seek() is not supported if this ctor is used.
|
||||||
|
*/
|
||||||
|
public BlockCompressedInputStream(final InputStream stream) {
|
||||||
|
mStream = IOUtil.toBufferedStream(stream);
|
||||||
|
mFile = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use this ctor if you wish to call seek()
|
||||||
|
*/
|
||||||
|
public BlockCompressedInputStream(final File file)
|
||||||
|
throws IOException {
|
||||||
|
mFile = new SeekableFileStream(file);
|
||||||
|
mStream = null;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockCompressedInputStream(final URL url) {
|
||||||
|
mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
|
||||||
|
mStream = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For providing some arbitrary data source. No additional buffering is
|
||||||
|
* provided, so if the underlying source is not buffered, wrap it in a
|
||||||
|
* SeekableBufferedStream before passing to this ctor.
|
||||||
|
*/
|
||||||
|
public BlockCompressedInputStream(final SeekableStream strm) {
|
||||||
|
mFile = strm;
|
||||||
|
mStream = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether or not the inflater will re-calculate the CRC on the decompressed data
|
||||||
|
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||||
|
* operation and should be used accordingly.
|
||||||
|
*/
|
||||||
|
public void setCheckCrcs(final boolean check) {
|
||||||
|
this.blockGunzipper.setCheckCrcs(check);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
|
||||||
|
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
|
||||||
|
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
|
||||||
|
* may block in order to fill an internal buffer if it has been exhausted.
|
||||||
|
*/
|
||||||
|
public int available()
|
||||||
|
throws IOException {
|
||||||
|
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
|
||||||
|
readBlock();
|
||||||
|
}
|
||||||
|
if (mCurrentBlock == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return mCurrentBlock.length - mCurrentOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the underlying InputStream or SeekableStream and releases the internal buffers
|
||||||
|
*/
|
||||||
|
public void close()
|
||||||
|
throws IOException {
|
||||||
|
if (mFile != null) {
|
||||||
|
mFile.close();
|
||||||
|
mFile = null;
|
||||||
|
} else if (mStream != null) {
|
||||||
|
mStream.close();
|
||||||
|
mStream = null;
|
||||||
|
}
|
||||||
|
// Encourage garbage collection
|
||||||
|
mFileBuffer = null;
|
||||||
|
mCurrentBlock = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
|
||||||
|
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
|
||||||
|
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
|
||||||
|
|
||||||
|
* @return the next byte of data, or -1 if the end of the stream is reached.
|
||||||
|
*/
|
||||||
|
public int read()
|
||||||
|
throws IOException {
|
||||||
|
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
|
||||||
|
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
|
||||||
|
* or an exception is thrown.
|
||||||
|
*
|
||||||
|
* read(buf) has the same effect as read(buf, 0, buf.length).
|
||||||
|
*
|
||||||
|
* @param buffer the buffer into which the data is read.
|
||||||
|
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||||
|
* the stream has been reached.
|
||||||
|
*/
|
||||||
|
public int read(final byte[] buffer)
|
||||||
|
throws IOException {
|
||||||
|
return read(buffer, 0, buffer.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
private volatile ByteArrayOutputStream buf = null;
|
||||||
|
private static final byte eol = '\n';
|
||||||
|
private static final byte eolCr = '\r';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
|
||||||
|
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
|
||||||
|
*
|
||||||
|
* @return A String containing the contents of the line, excluding the line terminating
|
||||||
|
* character, or null if the end of the stream has been reached
|
||||||
|
*
|
||||||
|
* @exception IOException If an I/O error occurs
|
||||||
|
* @
|
||||||
|
*/
|
||||||
|
public String readLine() throws IOException {
|
||||||
|
int available = available();
|
||||||
|
if (available == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if(null == buf){ // lazy initialisation
|
||||||
|
buf = new ByteArrayOutputStream(8192);
|
||||||
|
}
|
||||||
|
buf.reset();
|
||||||
|
boolean done = false;
|
||||||
|
boolean foundCr = false; // \r found flag
|
||||||
|
while (!done) {
|
||||||
|
int linetmpPos = mCurrentOffset;
|
||||||
|
int bCnt = 0;
|
||||||
|
while((available-- > 0)){
|
||||||
|
final byte c = mCurrentBlock[linetmpPos++];
|
||||||
|
if(c == eol){ // found \n
|
||||||
|
done = true;
|
||||||
|
break;
|
||||||
|
} else if(foundCr){ // previous char was \r
|
||||||
|
--linetmpPos; // current char is not \n so put it back
|
||||||
|
done = true;
|
||||||
|
break;
|
||||||
|
} else if(c == eolCr){ // found \r
|
||||||
|
foundCr = true;
|
||||||
|
continue; // no ++bCnt
|
||||||
|
}
|
||||||
|
++bCnt;
|
||||||
|
}
|
||||||
|
if(mCurrentOffset < linetmpPos){
|
||||||
|
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
|
||||||
|
mCurrentOffset = linetmpPos;
|
||||||
|
}
|
||||||
|
available = available();
|
||||||
|
if(available == 0){
|
||||||
|
// EOF
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
|
||||||
|
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
|
||||||
|
*
|
||||||
|
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
|
||||||
|
*
|
||||||
|
* @param buffer buffer into which data is read.
|
||||||
|
* @param offset the start offset in array b at which the data is written.
|
||||||
|
* @param length the maximum number of bytes to read.
|
||||||
|
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||||
|
* the stream has been reached.
|
||||||
|
*/
|
||||||
|
public int read(final byte[] buffer, int offset, int length)
|
||||||
|
throws IOException {
|
||||||
|
final int originalLength = length;
|
||||||
|
while (length > 0) {
|
||||||
|
final int available = available();
|
||||||
|
if (available == 0) {
|
||||||
|
// Signal EOF to caller
|
||||||
|
if (originalLength == length) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
final int copyLength = Math.min(length, available);
|
||||||
|
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
|
||||||
|
mCurrentOffset += copyLength;
|
||||||
|
offset += copyLength;
|
||||||
|
length -= copyLength;
|
||||||
|
}
|
||||||
|
return originalLength - length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||||
|
* not an actual byte offset.
|
||||||
|
*
|
||||||
|
* @param pos virtual file pointer
|
||||||
|
*/
|
||||||
|
public void seek(final long pos)
|
||||||
|
throws IOException {
|
||||||
|
if (mFile == null) {
|
||||||
|
throw new IOException("Cannot seek on stream based file");
|
||||||
|
}
|
||||||
|
// Decode virtual file pointer
|
||||||
|
// Upper 48 bits is the byte offset into the compressed stream of a block.
|
||||||
|
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
|
||||||
|
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
|
||||||
|
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
|
||||||
|
final int available;
|
||||||
|
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
|
||||||
|
available = mCurrentBlock.length;
|
||||||
|
} else {
|
||||||
|
mFile.seek(compressedOffset);
|
||||||
|
mBlockAddress = compressedOffset;
|
||||||
|
mLastBlockLength = 0;
|
||||||
|
readBlock();
|
||||||
|
available = available();
|
||||||
|
}
|
||||||
|
if (uncompressedOffset > available ||
|
||||||
|
(uncompressedOffset == available && !eof())) {
|
||||||
|
throw new IOException("Invalid file pointer: " + pos);
|
||||||
|
}
|
||||||
|
mCurrentOffset = uncompressedOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean eof() throws IOException {
|
||||||
|
if (mFile.eof()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
|
||||||
|
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||||
|
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||||
|
* the two.
|
||||||
|
*/
|
||||||
|
public long getFilePointer() {
|
||||||
|
if (mCurrentOffset == mCurrentBlock.length) {
|
||||||
|
// If current offset is at the end of the current block, file pointer should point
|
||||||
|
// to the beginning of the next block.
|
||||||
|
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
|
||||||
|
}
|
||||||
|
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long getFileBlock(final long bgzfOffset) {
|
||||||
|
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
|
||||||
|
* @return true if the given file looks like a valid BGZF file.
|
||||||
|
*/
|
||||||
|
public static boolean isValidFile(final InputStream stream)
|
||||||
|
throws IOException {
|
||||||
|
if (!stream.markSupported()) {
|
||||||
|
throw new RuntimeException("Cannot test non-buffered stream");
|
||||||
|
}
|
||||||
|
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
||||||
|
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
stream.reset();
|
||||||
|
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isValidBlockHeader(final byte[] buffer) {
|
||||||
|
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
|
||||||
|
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
|
||||||
|
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
|
||||||
|
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
|
||||||
|
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
|
||||||
|
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readBlock()
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
if (mFileBuffer == null) {
|
||||||
|
mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
|
||||||
|
}
|
||||||
|
int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
if (count == 0) {
|
||||||
|
// Handle case where there is no empty gzip block at end.
|
||||||
|
mCurrentOffset = 0;
|
||||||
|
mBlockAddress += mLastBlockLength;
|
||||||
|
mCurrentBlock = new byte[0];
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
|
||||||
|
throw new IOException("Premature end of file");
|
||||||
|
}
|
||||||
|
final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
|
||||||
|
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
|
||||||
|
throw new IOException("Unexpected compressed block length: " + blockLength);
|
||||||
|
}
|
||||||
|
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
|
||||||
|
count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
|
||||||
|
if (count != remaining) {
|
||||||
|
throw new FileTruncatedException("Premature end of file");
|
||||||
|
}
|
||||||
|
inflateBlock(mFileBuffer, blockLength);
|
||||||
|
mCurrentOffset = 0;
|
||||||
|
mBlockAddress += mLastBlockLength;
|
||||||
|
mLastBlockLength = blockLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
|
||||||
|
throws IOException {
|
||||||
|
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
|
||||||
|
byte[] buffer = mCurrentBlock;
|
||||||
|
mCurrentBlock = null;
|
||||||
|
if (buffer == null || buffer.length != uncompressedLength) {
|
||||||
|
try {
|
||||||
|
buffer = new byte[uncompressedLength];
|
||||||
|
} catch (NegativeArraySizeException e) {
|
||||||
|
throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
|
||||||
|
mCurrentBlock = buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readBytes(final byte[] buffer, final int offset, final int length)
|
||||||
|
throws IOException {
|
||||||
|
if (mFile != null) {
|
||||||
|
return readBytes(mFile, buffer, offset, length);
|
||||||
|
} else if (mStream != null) {
|
||||||
|
return readBytes(mStream, buffer, offset, length);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
|
||||||
|
throws IOException {
|
||||||
|
int bytesRead = 0;
|
||||||
|
while (bytesRead < length) {
|
||||||
|
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
|
||||||
|
if (count <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bytesRead += count;
|
||||||
|
}
|
||||||
|
return bytesRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
|
||||||
|
throws IOException {
|
||||||
|
int bytesRead = 0;
|
||||||
|
while (bytesRead < length) {
|
||||||
|
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
|
||||||
|
if (count <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bytesRead += count;
|
||||||
|
}
|
||||||
|
return bytesRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int unpackInt16(final byte[] buffer, final int offset) {
|
||||||
|
return ((buffer[offset] & 0xFF) |
|
||||||
|
((buffer[offset+1] & 0xFF) << 8));
|
||||||
|
}
|
||||||
|
|
||||||
|
private int unpackInt32(final byte[] buffer, final int offset) {
|
||||||
|
return ((buffer[offset] & 0xFF) |
|
||||||
|
((buffer[offset+1] & 0xFF) << 8) |
|
||||||
|
((buffer[offset+2] & 0xFF) << 16) |
|
||||||
|
((buffer[offset+3] & 0xFF) << 24));
|
||||||
|
}
|
||||||
|
|
||||||
|
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
|
||||||
|
|
||||||
|
public static FileTermination checkTermination(final File file)
|
||||||
|
throws IOException {
|
||||||
|
final long fileSize = file.length();
|
||||||
|
if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
|
||||||
|
return FileTermination.DEFECTIVE;
|
||||||
|
}
|
||||||
|
final RandomAccessFile raFile = new RandomAccessFile(file, "r");
|
||||||
|
try {
|
||||||
|
raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||||
|
byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
|
||||||
|
raFile.readFully(buf);
|
||||||
|
if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
|
||||||
|
return FileTermination.HAS_TERMINATOR_BLOCK;
|
||||||
|
}
|
||||||
|
final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
|
||||||
|
buf = new byte[bufsize];
|
||||||
|
raFile.seek(fileSize - bufsize);
|
||||||
|
raFile.read(buf);
|
||||||
|
for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
|
||||||
|
i >= 0; --i) {
|
||||||
|
if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
|
||||||
|
buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
|
||||||
|
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
|
||||||
|
if (buf.length - i == totalBlockSizeMinusOne + 1) {
|
||||||
|
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
|
||||||
|
} else {
|
||||||
|
return FileTermination.DEFECTIVE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return FileTermination.DEFECTIVE;
|
||||||
|
} finally {
|
||||||
|
raFile.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
|
||||||
|
for (int i = 0; i < length; ++i) {
|
||||||
|
if (preamble[i] != buf[i + startOffset]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
|
||||||
* used to indicate an error occured
|
* used to indicate an error occured
|
||||||
*
|
*
|
||||||
* @param msg the message
|
* @param msg the message
|
||||||
* @param e the error
|
* @param t the error
|
||||||
*/
|
*/
|
||||||
public static void exitSystemWithError(String msg, final Exception e) {
|
public static void exitSystemWithError(String msg, final Throwable t) {
|
||||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||||
errorPrintf("stack trace %n");
|
errorPrintf("stack trace %n");
|
||||||
e.printStackTrace();
|
t.printStackTrace();
|
||||||
|
|
||||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||||
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
|
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
|
||||||
|
|
@ -392,10 +392,10 @@ public abstract class CommandLineProgram {
|
||||||
/**
|
/**
|
||||||
* used to indicate an error occured
|
* used to indicate an error occured
|
||||||
*
|
*
|
||||||
* @param e the exception occured
|
* @param t the exception that occurred
|
||||||
*/
|
*/
|
||||||
public static void exitSystemWithError(Exception e) {
|
public static void exitSystemWithError(Throwable t) {
|
||||||
exitSystemWithError(e.getMessage(), e);
|
exitSystemWithError(t.getMessage(), t);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ import java.util.*;
|
||||||
*
|
*
|
||||||
* The IntervalBinding<T> is a formal GATK argument that bridges between a walker and
|
* The IntervalBinding<T> is a formal GATK argument that bridges between a walker and
|
||||||
* the engine to construct intervals for traversal at runtime. The IntervalBinding can
|
* the engine to construct intervals for traversal at runtime. The IntervalBinding can
|
||||||
* either be a RodBinding<T>, a string of one or more intervals, or a file with interval strings.
|
* either be a RodBinding<T>, a string of one interval, or a file with interval strings.
|
||||||
* The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
|
* The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
|
||||||
*
|
*
|
||||||
* Note that this class is immutable.
|
* Note that this class is immutable.
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Attribution;
|
import org.broadinstitute.sting.gatk.walkers.Attribution;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
|
|
@ -97,13 +96,20 @@ public class CommandLineGATK extends CommandLineExecutable {
|
||||||
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
|
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
|
||||||
exitSystemWithUserError(e);
|
exitSystemWithUserError(e);
|
||||||
} catch (net.sf.samtools.SAMException e) {
|
} catch (net.sf.samtools.SAMException e) {
|
||||||
// Let's try this out and see how it is received by our users
|
checkForTooManyOpenFilesProblem(e.getMessage());
|
||||||
exitSystemWithSamError(e);
|
exitSystemWithSamError(e);
|
||||||
} catch (Exception e) {
|
} catch (Throwable t) {
|
||||||
exitSystemWithError(e);
|
checkForTooManyOpenFilesProblem(t.getMessage());
|
||||||
|
exitSystemWithError(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void checkForTooManyOpenFilesProblem(String message) {
|
||||||
|
// Special case the "Too many open files" error because it's a common User Error for which we know what to do
|
||||||
|
if ( message.indexOf("Too many open files") != -1 )
|
||||||
|
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a short blurb about the GATK, copyright info, and where to get documentation.
|
* Creates a short blurb about the GATK, copyright info, and where to get documentation.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||||
|
|
@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
|
||||||
*/
|
*/
|
||||||
private Collection<ReadFilter> filters;
|
private Collection<ReadFilter> filters;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Controls the allocation of threads between CPU vs IO.
|
||||||
|
*/
|
||||||
|
private ThreadAllocation threadAllocation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A currently hacky unique name for this GATK instance
|
* A currently hacky unique name for this GATK instance
|
||||||
*/
|
*/
|
||||||
|
|
@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
|
||||||
if (this.getArguments().nonDeterministicRandomSeed)
|
if (this.getArguments().nonDeterministicRandomSeed)
|
||||||
resetRandomGenerator(System.currentTimeMillis());
|
resetRandomGenerator(System.currentTimeMillis());
|
||||||
|
|
||||||
|
// Determine how the threads should be divided between CPU vs. IO.
|
||||||
|
determineThreadAllocation();
|
||||||
|
|
||||||
// Prepare the data for traversal.
|
// Prepare the data for traversal.
|
||||||
initializeDataSources();
|
initializeDataSources();
|
||||||
|
|
||||||
|
|
@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
|
||||||
// create the output streams "
|
// create the output streams "
|
||||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||||
|
|
||||||
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||||
|
|
||||||
// execute the microscheduler, storing the results
|
// execute the microscheduler, storing the results
|
||||||
return microScheduler.execute(this.walker, shardStrategy);
|
return microScheduler.execute(this.walker, shardStrategy);
|
||||||
|
|
@ -266,6 +275,16 @@ public class GenomeAnalysisEngine {
|
||||||
return Collections.unmodifiableList(filters);
|
return Collections.unmodifiableList(filters);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse out the thread allocation from the given command-line argument.
|
||||||
|
*/
|
||||||
|
private void determineThreadAllocation() {
|
||||||
|
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
|
||||||
|
Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null;
|
||||||
|
Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null;
|
||||||
|
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allow subclasses and others within this package direct access to the walker manager.
|
* Allow subclasses and others within this package direct access to the walker manager.
|
||||||
* @return The walker manager used by this package.
|
* @return The walker manager used by this package.
|
||||||
|
|
@ -286,7 +305,7 @@ public class GenomeAnalysisEngine {
|
||||||
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
|
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
|
||||||
}
|
}
|
||||||
|
|
||||||
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
|
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected DownsamplingMethod getDownsamplingMethod() {
|
protected DownsamplingMethod getDownsamplingMethod() {
|
||||||
|
|
@ -397,103 +416,49 @@ public class GenomeAnalysisEngine {
|
||||||
* @param intervals intervals
|
* @param intervals intervals
|
||||||
* @return the sharding strategy
|
* @return the sharding strategy
|
||||||
*/
|
*/
|
||||||
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||||
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||||
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
||||||
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
|
|
||||||
// sharding system; it's required with the new sharding system only for locus walkers.
|
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
|
||||||
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
|
if(!readsDataSource.isEmpty()) {
|
||||||
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
||||||
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
|
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
|
||||||
if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
||||||
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
|
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
|
||||||
|
|
||||||
Shard.ShardType shardType;
|
|
||||||
if(walker instanceof LocusWalker) {
|
if(walker instanceof LocusWalker) {
|
||||||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||||
shardType = Shard.ShardType.LOCUS;
|
if(intervals == null)
|
||||||
|
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||||
|
else
|
||||||
|
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
|
||||||
|
}
|
||||||
|
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
|
||||||
|
// Apply special validation to read pair walkers.
|
||||||
|
if(walker instanceof ReadPairWalker) {
|
||||||
|
if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
||||||
|
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
|
||||||
|
if(intervals != null && !intervals.isEmpty())
|
||||||
|
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(intervals == null)
|
||||||
|
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||||
|
else
|
||||||
|
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
|
||||||
}
|
}
|
||||||
else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
|
|
||||||
shardType = Shard.ShardType.READ;
|
|
||||||
else
|
else
|
||||||
throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
|
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
|
||||||
|
}
|
||||||
List<GenomeLoc> region;
|
else {
|
||||||
if(intervals != null)
|
final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000;
|
||||||
region = intervals.toList();
|
if(intervals == null)
|
||||||
else {
|
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
|
||||||
region = new ArrayList<GenomeLoc>();
|
else
|
||||||
for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
|
return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
|
||||||
region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
|
|
||||||
}
|
|
||||||
|
|
||||||
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ShardStrategy shardStrategy;
|
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
|
||||||
|
|
||||||
long SHARD_SIZE = 100000L;
|
|
||||||
|
|
||||||
if (walker instanceof LocusWalker) {
|
|
||||||
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
|
|
||||||
|
|
||||||
if (intervals != null && !intervals.isEmpty()) {
|
|
||||||
if (readsDataSource == null)
|
|
||||||
throw new IllegalArgumentException("readsDataSource is null");
|
|
||||||
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
|
||||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
|
||||||
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
|
||||||
referenceDataSource.getReference(),
|
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
|
||||||
drivingDataSource.getSequenceDictionary(),
|
|
||||||
SHARD_SIZE,
|
|
||||||
getGenomeLocParser(),
|
|
||||||
intervals);
|
|
||||||
} else
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
|
||||||
referenceDataSource.getReference(),
|
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
|
||||||
drivingDataSource.getSequenceDictionary(),
|
|
||||||
SHARD_SIZE,getGenomeLocParser());
|
|
||||||
} else if (walker instanceof ReadWalker ||
|
|
||||||
walker instanceof DuplicateWalker) {
|
|
||||||
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
|
|
||||||
|
|
||||||
if (intervals != null && !intervals.isEmpty()) {
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
|
||||||
referenceDataSource.getReference(),
|
|
||||||
shardType,
|
|
||||||
drivingDataSource.getSequenceDictionary(),
|
|
||||||
SHARD_SIZE,
|
|
||||||
getGenomeLocParser(),
|
|
||||||
intervals);
|
|
||||||
} else {
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
|
||||||
referenceDataSource.getReference(),
|
|
||||||
shardType,
|
|
||||||
drivingDataSource.getSequenceDictionary(),
|
|
||||||
SHARD_SIZE,
|
|
||||||
getGenomeLocParser());
|
|
||||||
}
|
|
||||||
} else if (walker instanceof ReadPairWalker) {
|
|
||||||
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
|
||||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
|
|
||||||
if(intervals != null && !intervals.isEmpty())
|
|
||||||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
|
||||||
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
|
||||||
referenceDataSource.getReference(),
|
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
|
|
||||||
drivingDataSource.getSequenceDictionary(),
|
|
||||||
SHARD_SIZE,
|
|
||||||
getGenomeLocParser());
|
|
||||||
} else
|
|
||||||
throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
|
|
||||||
|
|
||||||
return shardStrategy;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean flashbackData() {
|
protected boolean flashbackData() {
|
||||||
|
|
@ -751,6 +716,8 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
return new SAMDataSource(
|
return new SAMDataSource(
|
||||||
samReaderIDs,
|
samReaderIDs,
|
||||||
|
threadAllocation,
|
||||||
|
argCollection.numberOfBAMFileHandles,
|
||||||
genomeLocParser,
|
genomeLocParser,
|
||||||
argCollection.useOriginalBaseQualities,
|
argCollection.useOriginalBaseQualities,
|
||||||
argCollection.strictnessLevel,
|
argCollection.strictnessLevel,
|
||||||
|
|
@ -763,8 +730,7 @@ public class GenomeAnalysisEngine {
|
||||||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||||
getWalkerBAQQualityMode(),
|
getWalkerBAQQualityMode(),
|
||||||
refReader,
|
refReader,
|
||||||
argCollection.defaultBaseQualities,
|
argCollection.defaultBaseQualities);
|
||||||
!argCollection.disableLowMemorySharding);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -194,10 +194,14 @@ public class GATKArgumentCollection {
|
||||||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||||
public ValidationExclusion.TYPE unsafe;
|
public ValidationExclusion.TYPE unsafe;
|
||||||
|
|
||||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
|
/** How many threads should be allocated to this analysis. */
|
||||||
public int numberOfThreads = 1;
|
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
|
||||||
|
public Integer numberOfThreads = 1;
|
||||||
|
|
||||||
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false)
|
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
|
||||||
|
public Integer numberOfBAMFileHandles = null;
|
||||||
|
|
||||||
|
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
|
||||||
public List<String> readGroupBlackList = null;
|
public List<String> readGroupBlackList = null;
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -292,9 +296,6 @@ public class GATKArgumentCollection {
|
||||||
@Hidden
|
@Hidden
|
||||||
public boolean allowIntervalsWithUnindexedBAM = false;
|
public boolean allowIntervalsWithUnindexedBAM = false;
|
||||||
|
|
||||||
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
|
|
||||||
public boolean disableLowMemorySharding = false;
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// methods
|
// methods
|
||||||
|
|
@ -365,7 +366,11 @@ public class GATKArgumentCollection {
|
||||||
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
|
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (other.numberOfThreads != this.numberOfThreads) {
|
if (!other.numberOfThreads.equals(this.numberOfThreads)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
|
||||||
|
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (other.intervalMerging != this.intervalMerging) {
|
if (other.intervalMerging != this.intervalMerging) {
|
||||||
|
|
@ -389,9 +394,6 @@ public class GATKArgumentCollection {
|
||||||
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
|
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (disableLowMemorySharding != other.disableLowMemorySharding)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,128 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.ByteOrder;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: mhanna
|
|
||||||
* Date: Feb 7, 2011
|
|
||||||
* Time: 2:46:34 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
public class BAMBlockStartIterator implements Iterator<Long> {
|
|
||||||
/**
|
|
||||||
* How large is a BGZF header?
|
|
||||||
*/
|
|
||||||
private static int BGZF_HEADER_SIZE = 18;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Where within the header does the BLOCKSIZE actually live?
|
|
||||||
*/
|
|
||||||
private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
|
|
||||||
|
|
||||||
private FileChannel bamInputChannel;
|
|
||||||
private ByteBuffer headerByteBuffer;
|
|
||||||
|
|
||||||
private long nextLocation = 0;
|
|
||||||
|
|
||||||
public BAMBlockStartIterator(File bamFile) {
|
|
||||||
try {
|
|
||||||
FileInputStream bamInputStream = new FileInputStream(bamFile);
|
|
||||||
bamInputChannel = bamInputStream.getChannel();
|
|
||||||
|
|
||||||
headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
|
|
||||||
headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new StingException("Could not open file",ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return nextLocation != -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Long next() {
|
|
||||||
long currentLocation = nextLocation;
|
|
||||||
advance();
|
|
||||||
return currentLocation;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
|
|
||||||
}
|
|
||||||
|
|
||||||
private void advance() {
|
|
||||||
int readStatus;
|
|
||||||
|
|
||||||
headerByteBuffer.clear();
|
|
||||||
try {
|
|
||||||
readStatus = bamInputChannel.read(headerByteBuffer);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new StingException("Could not read header data",ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(readStatus == -1) {
|
|
||||||
nextLocation = -1;
|
|
||||||
try {
|
|
||||||
bamInputChannel.close();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new StingException("Could not close input file",ex);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
|
|
||||||
int blockSize = headerByteBuffer.getShort();
|
|
||||||
|
|
||||||
try {
|
|
||||||
bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
|
|
||||||
nextLocation = bamInputChannel.position();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new StingException("Could not reposition input stream",ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String argv[]) throws IOException {
|
|
||||||
BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
|
|
||||||
int i = 0;
|
|
||||||
while(blockStartIterator.hasNext())
|
|
||||||
System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,195 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.samtools.GATKBin;
|
|
||||||
import net.sf.samtools.GATKChunk;
|
|
||||||
import net.sf.samtools.LinearIndex;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Represents the contents of a bam index file for one reference.
|
|
||||||
* A BAM index (.bai) file contains information for all references in the bam file.
|
|
||||||
* This class describes the data present in the index file for one of these references;
|
|
||||||
* including the bins, chunks, and linear index.
|
|
||||||
*/
|
|
||||||
class BAMIndexContent {
|
|
||||||
/**
|
|
||||||
* The reference sequence for the data currently loaded.
|
|
||||||
*/
|
|
||||||
private final int mReferenceSequence;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A list of all bins in the above reference sequence.
|
|
||||||
*/
|
|
||||||
private final BinList mBinList;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The linear index for the reference sequence above.
|
|
||||||
*/
|
|
||||||
private final LinearIndex mLinearIndex;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param referenceSequence Content corresponds to this reference.
|
|
||||||
* @param bins Array of bins represented by this content, possibly sparse
|
|
||||||
* @param numberOfBins Number of non-null bins
|
|
||||||
* @param linearIndex Additional index used to optimize queries
|
|
||||||
*/
|
|
||||||
BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
|
|
||||||
this.mReferenceSequence = referenceSequence;
|
|
||||||
this.mBinList = new BinList(bins, numberOfBins);
|
|
||||||
this.mLinearIndex = linearIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reference for this Content
|
|
||||||
*/
|
|
||||||
public int getReferenceSequence() {
|
|
||||||
return mReferenceSequence;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Does this content have anything in this bin?
|
|
||||||
*/
|
|
||||||
public boolean containsBin(final GATKBin bin) {
|
|
||||||
return mBinList.getBin(bin.getBinNumber()) != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return iterable list of bins represented by this content
|
|
||||||
*/
|
|
||||||
public BinList getBins() {
|
|
||||||
return mBinList;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the number of non-null bins represented by this content
|
|
||||||
*/
|
|
||||||
int getNumberOfNonNullBins() {
|
|
||||||
return mBinList.getNumberOfNonNullBins();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return all chunks associated with all bins in this content
|
|
||||||
*/
|
|
||||||
public List<GATKChunk> getAllChunks() {
|
|
||||||
List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
|
|
||||||
for (GATKBin b : mBinList)
|
|
||||||
if (b.getChunkList() != null) {
|
|
||||||
allChunks.addAll(Arrays.asList(b.getChunkList()));
|
|
||||||
}
|
|
||||||
return Collections.unmodifiableList(allChunks);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the linear index represented by this content
|
|
||||||
*/
|
|
||||||
public LinearIndex getLinearIndex() {
|
|
||||||
return mLinearIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class is used to encapsulate the list of Bins store in the BAMIndexContent
|
|
||||||
* While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
|
|
||||||
*/
|
|
||||||
class BinList implements Iterable<GATKBin> {
|
|
||||||
|
|
||||||
private final GATKBin[] mBinArray;
|
|
||||||
public final int numberOfNonNullBins;
|
|
||||||
public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param binArray a sparse array representation of the bins. The index into the array is the bin number.
|
|
||||||
* @param numberOfNonNullBins
|
|
||||||
*/
|
|
||||||
BinList(GATKBin[] binArray, int numberOfNonNullBins) {
|
|
||||||
this.mBinArray = binArray;
|
|
||||||
this.numberOfNonNullBins = numberOfNonNullBins;
|
|
||||||
this.maxBinNumber = mBinArray.length - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
GATKBin getBin(int binNumber) {
|
|
||||||
if (binNumber > maxBinNumber) return null;
|
|
||||||
return mBinArray[binNumber];
|
|
||||||
}
|
|
||||||
|
|
||||||
int getNumberOfNonNullBins() {
|
|
||||||
return numberOfNonNullBins;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets an iterator over all non-null bins.
|
|
||||||
*
|
|
||||||
* @return An iterator over all bins.
|
|
||||||
*/
|
|
||||||
public Iterator<GATKBin> iterator() {
|
|
||||||
return new BinIterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
private class BinIterator implements Iterator<GATKBin> {
|
|
||||||
/**
|
|
||||||
* Stores the bin # of the Bin currently in use.
|
|
||||||
*/
|
|
||||||
private int nextBin;
|
|
||||||
|
|
||||||
public BinIterator() {
|
|
||||||
nextBin = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Are there more bins in this set, waiting to be returned?
|
|
||||||
*
|
|
||||||
* @return True if more bins are remaining.
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
while (nextBin <= maxBinNumber) {
|
|
||||||
if (getBin(nextBin) != null) return true;
|
|
||||||
nextBin++;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the next bin in the provided BinList.
|
|
||||||
*
|
|
||||||
* @return the next available bin in the BinList.
|
|
||||||
*/
|
|
||||||
public GATKBin next() {
|
|
||||||
if (!hasNext())
|
|
||||||
throw new NoSuchElementException("This BinIterator is currently empty");
|
|
||||||
GATKBin result = getBin(nextBin);
|
|
||||||
nextBin++;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Unable to remove from a bin iterator");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.samtools.Bin;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Models a bin at which all BAM files in the merged input stream overlap.
|
|
||||||
*/
|
|
||||||
class BAMOverlap {
|
|
||||||
public final int start;
|
|
||||||
public final int stop;
|
|
||||||
|
|
||||||
private final Map<SAMReaderID,Bin> bins = new HashMap<SAMReaderID,Bin>();
|
|
||||||
|
|
||||||
public BAMOverlap(final int start, final int stop) {
|
|
||||||
this.start = start;
|
|
||||||
this.stop = stop;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addBin(final SAMReaderID id, final Bin bin) {
|
|
||||||
bins.put(id,bin);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Bin getBin(final SAMReaderID id) {
|
|
||||||
return bins.get(id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new BAM schedule based on the given index.
|
* Create a new BAM schedule based on the given index.
|
||||||
* @param indexFiles Index files.
|
* @param dataSource The SAM data source to use.
|
||||||
* @param intervals List of
|
* @param intervals List of
|
||||||
*/
|
*/
|
||||||
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
|
public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
|
||||||
if(intervals.isEmpty())
|
if(intervals.isEmpty())
|
||||||
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
||||||
|
|
||||||
referenceSequence = intervals.get(0).getContigIndex();
|
referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
|
||||||
|
|
||||||
createScheduleFile();
|
createScheduleFile();
|
||||||
|
|
||||||
readerIDs.addAll(indexFiles.keySet());
|
readerIDs.addAll(dataSource.getReaderIDs());
|
||||||
|
|
||||||
for(final SAMReaderID reader: readerIDs) {
|
for(final SAMReaderID reader: readerIDs) {
|
||||||
final GATKBAMIndex index = indexFiles.get(reader);
|
final GATKBAMIndex index = dataSource.getIndex(reader);
|
||||||
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
|
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
|
||||||
|
|
||||||
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
||||||
|
|
@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
if(selectedIterators.isEmpty())
|
if(selectedIterators.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
// Create the target schedule entry
|
||||||
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
|
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
|
||||||
|
|
||||||
|
// For each schedule entry with data, load the data into the merged schedule.
|
||||||
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
|
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
|
||||||
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
|
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
|
||||||
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
|
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
|
||||||
|
|
@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
scheduleIterator.next();
|
scheduleIterator.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For each schedule entry without data, add a blank entry.
|
||||||
|
for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
|
||||||
|
mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
|
||||||
|
}
|
||||||
|
|
||||||
nextScheduleEntry = mergedScheduleEntry;
|
nextScheduleEntry = mergedScheduleEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
import net.sf.picard.util.PeekableIterator;
|
import net.sf.picard.util.PeekableIterator;
|
||||||
import net.sf.samtools.GATKBAMFileSpan;
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
import net.sf.samtools.GATKChunk;
|
import net.sf.samtools.GATKChunk;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMFileSpan;
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
|
|
||||||
private FilePointer nextFilePointer = null;
|
private FilePointer nextFilePointer = null;
|
||||||
|
|
||||||
private final GenomeLocSortedSet loci;
|
private GenomeLocSortedSet loci;
|
||||||
|
private PeekableIterator<GenomeLoc> locusIterator;
|
||||||
|
private GenomeLoc currentLocus;
|
||||||
|
|
||||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
|
||||||
|
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||||
|
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
|
||||||
|
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
|
||||||
|
// Match only on sequence name; trust startup validation to make sure all the sequences match.
|
||||||
|
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
|
||||||
|
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||||
|
}
|
||||||
|
scheduler.populateFilteredIntervalList(intervals);
|
||||||
|
return scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
private GenomeLoc currentLocus;
|
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||||
|
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||||
|
scheduler.populateUnfilteredIntervalList(parser);
|
||||||
|
return scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||||
|
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||||
|
scheduler.populateFilteredIntervalList(loci);
|
||||||
|
return scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private BAMScheduler(final SAMDataSource dataSource) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
for(SAMReaderID reader: dataSource.getReaderIDs()) {
|
||||||
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
GATKBAMIndex index = dataSource.getIndex(reader);
|
||||||
|
if(index != null)
|
||||||
|
indexFiles.put(reader,dataSource.getIndex(reader));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
|
||||||
|
* @param loci The list of locations to search and iterate over.
|
||||||
|
*/
|
||||||
|
private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
|
||||||
this.loci = loci;
|
this.loci = loci;
|
||||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
if(!indexFiles.isEmpty()) {
|
||||||
if(locusIterator.hasNext())
|
// If index data is available, start up the iterator.
|
||||||
currentLocus = locusIterator.next();
|
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||||
advance();
|
if(locusIterator.hasNext())
|
||||||
|
currentLocus = locusIterator.next();
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Otherwise, seed the iterator with a single file pointer over the entire region.
|
||||||
|
nextFilePointer = generatePointerOverEntireFileset();
|
||||||
|
for(GenomeLoc locus: loci)
|
||||||
|
nextFilePointer.addLocation(locus);
|
||||||
|
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
|
||||||
|
* from just before the start of the region to the end of the region.
|
||||||
|
*/
|
||||||
|
private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
|
||||||
|
this.loci = new GenomeLocSortedSet(parser);
|
||||||
|
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||||
|
nextFilePointer = generatePointerOverEntireFileset();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a span that runs from the end of the BAM header to the end of the file.
|
||||||
|
* @return A file pointer over the specified region.
|
||||||
|
*/
|
||||||
|
private FilePointer generatePointerOverEntireFileset() {
|
||||||
|
FilePointer filePointer = new FilePointer();
|
||||||
|
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
|
||||||
|
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||||
|
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
|
||||||
|
return filePointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
|
|
@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
if(!hasNext())
|
if(!hasNext())
|
||||||
throw new NoSuchElementException("No next element available in interval sharder");
|
throw new NoSuchElementException("No next element available in interval sharder");
|
||||||
FilePointer currentFilePointer = nextFilePointer;
|
FilePointer currentFilePointer = nextFilePointer;
|
||||||
|
nextFilePointer = null;
|
||||||
advance();
|
advance();
|
||||||
|
|
||||||
return currentFilePointer;
|
return currentFilePointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
if(loci.isEmpty())
|
if(loci.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
nextFilePointer = null;
|
|
||||||
while(nextFilePointer == null && currentLocus != null) {
|
while(nextFilePointer == null && currentLocus != null) {
|
||||||
// special case handling of the unmapped shard.
|
// special case handling of the unmapped shard.
|
||||||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
|
nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
|
||||||
currentLocus = null;
|
currentLocus = null;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
int coveredRegionStop = Integer.MAX_VALUE;
|
int coveredRegionStop = Integer.MAX_VALUE;
|
||||||
GenomeLoc coveredRegion = null;
|
GenomeLoc coveredRegion = null;
|
||||||
|
|
||||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
|
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
|
||||||
|
|
||||||
// No overlapping data at all.
|
// No overlapping data at all.
|
||||||
if(scheduleEntry != null) {
|
if(scheduleEntry != null) {
|
||||||
|
|
@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
|
||||||
for(SAMReaderID reader: indexFiles.keySet())
|
for(SAMReaderID reader: indexFiles.keySet())
|
||||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||||
}
|
}
|
||||||
|
|
@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
// Early exit if no bins were found.
|
// Early exit if no bins were found.
|
||||||
if(coveredRegion == null) {
|
if(coveredRegion == null) {
|
||||||
// for debugging only: maximum split is 16384.
|
// for debugging only: maximum split is 16384.
|
||||||
if(currentLocus.size() > 16384) {
|
nextFilePointer.addLocation(currentLocus);
|
||||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||||
nextFilePointer.addLocation(splitContigs[0]);
|
|
||||||
currentLocus = splitContigs[1];
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
nextFilePointer.addLocation(currentLocus);
|
|
||||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
|
||||||
}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Early exit if only part of the first interval was found.
|
// Early exit if only part of the first interval was found.
|
||||||
if(currentLocus.startsBefore(coveredRegion)) {
|
if(currentLocus.startsBefore(coveredRegion)) {
|
||||||
// for debugging only: maximum split is 16384.
|
|
||||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||||
nextFilePointer.addLocation(splitContigs[0]);
|
nextFilePointer.addLocation(splitContigs[0]);
|
||||||
|
|
@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||||
* @param indices BAM indices.
|
|
||||||
* @param currentLocus The actual locus for which to check overlap.
|
* @param currentLocus The actual locus for which to check overlap.
|
||||||
* @return The next schedule entry overlapping with the given list of loci.
|
* @return The next schedule entry overlapping with the given list of loci.
|
||||||
*/
|
*/
|
||||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
|
||||||
|
// Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
|
||||||
|
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
|
||||||
|
// we'll be using the correct contig index for the BAMs.
|
||||||
|
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
|
||||||
|
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
|
||||||
|
|
||||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
|
||||||
if(bamScheduleIterator != null)
|
if(bamScheduleIterator != null)
|
||||||
bamScheduleIterator.close();
|
bamScheduleIterator.close();
|
||||||
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
|
lastReferenceSequenceLoaded = currentContigIndex;
|
||||||
|
|
||||||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||||
for(GenomeLoc locus: loci) {
|
for(GenomeLoc locus: loci) {
|
||||||
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
|
if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
|
||||||
lociInContig.add(locus);
|
lociInContig.add(locus);
|
||||||
}
|
}
|
||||||
|
|
||||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!bamScheduleIterator.hasNext())
|
if(!bamScheduleIterator.hasNext())
|
||||||
|
|
@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a span from the given start point to the end of the file.
|
||||||
|
* @param startOfRegion Start of the region, in encoded coordinates (block start << 16 | block offset).
|
||||||
|
* @return A file span from the given point to the end of the file.
|
||||||
|
*/
|
||||||
|
private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
|
||||||
|
return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Preloads BGZF blocks in preparation for unzipping and data processing.
|
||||||
|
* TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
|
||||||
|
* TODO: be replaced with a central thread management strategy.
|
||||||
|
*/
|
||||||
|
public class BGZFBlockLoadingDispatcher {
|
||||||
|
/**
|
||||||
|
* The file handle cache, used when allocating blocks from the dispatcher.
|
||||||
|
*/
|
||||||
|
private final FileHandleCache fileHandleCache;
|
||||||
|
|
||||||
|
private final ExecutorService threadPool;
|
||||||
|
|
||||||
|
private final Queue<SAMReaderPosition> inputQueue;
|
||||||
|
|
||||||
|
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
||||||
|
threadPool = Executors.newFixedThreadPool(numThreads);
|
||||||
|
fileHandleCache = new FileHandleCache(numFileHandles);
|
||||||
|
inputQueue = new LinkedList<SAMReaderPosition>();
|
||||||
|
|
||||||
|
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initiates a request for a new block load.
|
||||||
|
* @param readerPosition Position at which to load.
|
||||||
|
*/
|
||||||
|
void queueBlockLoad(final SAMReaderPosition readerPosition) {
|
||||||
|
synchronized(inputQueue) {
|
||||||
|
inputQueue.add(readerPosition);
|
||||||
|
inputQueue.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Claims the next work request from the queue.
|
||||||
|
* @return The next work request, or null if none is available.
|
||||||
|
*/
|
||||||
|
SAMReaderPosition claimNextWorkRequest() {
|
||||||
|
synchronized(inputQueue) {
|
||||||
|
while(inputQueue.isEmpty()) {
|
||||||
|
try {
|
||||||
|
inputQueue.wait();
|
||||||
|
}
|
||||||
|
catch(InterruptedException ex) {
|
||||||
|
throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return inputQueue.poll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,436 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
|
import net.sf.samtools.GATKChunk;
|
||||||
|
import net.sf.samtools.util.BAMInputStream;
|
||||||
|
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||||
|
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||||
|
import net.sf.samtools.util.RuntimeEOFException;
|
||||||
|
import net.sf.samtools.util.SeekableStream;
|
||||||
|
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Presents decompressed blocks to the SAMFileReader.
|
||||||
|
*/
|
||||||
|
public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||||
|
/**
|
||||||
|
* Mechanism for triggering block loads.
|
||||||
|
*/
|
||||||
|
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The reader whose data is supplied by this input stream.
|
||||||
|
*/
|
||||||
|
private final SAMReaderID reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Length of the input stream.
|
||||||
|
*/
|
||||||
|
private final long length;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The latest error reported by an asynchronous block load.
|
||||||
|
*/
|
||||||
|
private Throwable error;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Current position.
|
||||||
|
*/
|
||||||
|
private SAMReaderPosition position;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A stream of compressed data blocks.
|
||||||
|
*/
|
||||||
|
private final ByteBuffer buffer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Offsets of the given blocks in the buffer.
|
||||||
|
*/
|
||||||
|
private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Source positions of the given blocks in the buffer.
|
||||||
|
*/
|
||||||
|
private LinkedList<Long> blockPositions = new LinkedList<Long>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides a lock to wait for more data to arrive.
|
||||||
|
*/
|
||||||
|
private final Object lock = new Object();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An input stream to use when comparing data back to what it should look like.
|
||||||
|
*/
|
||||||
|
private final BlockCompressedInputStream validatingInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Has the buffer been filled since last request?
|
||||||
|
*/
|
||||||
|
private boolean bufferFilled = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new block presenting input stream with a dedicated buffer.
|
||||||
|
* @param dispatcher the block loading messenger.
|
||||||
|
* @param reader the reader for which to load data.
|
||||||
|
* @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
|
||||||
|
*/
|
||||||
|
BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
    this.dispatcher = dispatcher;
    this.reader = reader;
    this.length = reader.samFile.length();

    // Back the buffer with a 64KB array, read as little-endian.
    this.buffer = ByteBuffer.wrap(new byte[64*1024]);
    this.buffer.order(ByteOrder.LITTLE_ENDIAN);

    // Buffer convention: readable data lives in [position,limit) and spare capacity in [limit,capacity).
    // A limit of zero therefore marks the buffer as holding no data yet.
    this.buffer.limit(0);

    // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
    this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));

    try {
        if(!validate) {
            // No cross-checking requested.
            validatingInputStream = null;
        }
        else {
            System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
            validatingInputStream = new BlockCompressedInputStream(reader.samFile);
            // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
            // Poke the stream to start reading data.
            validatingInputStream.available();
        }
    }
    catch(IOException ex) {
        throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
    }
}
|
||||||
|
|
||||||
|
public long length() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getFilePointer() {
|
||||||
|
long filePointer;
|
||||||
|
synchronized(lock) {
|
||||||
|
if(buffer.remaining() > 0) {
|
||||||
|
// If there's data in the buffer, figure out from whence it came.
|
||||||
|
final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
|
||||||
|
final int blockOffset = buffer.position();
|
||||||
|
filePointer = blockAddress << 16 | blockOffset;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Otherwise, find the next position to load.
|
||||||
|
filePointer = position.getBlockAddress() << 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
|
||||||
|
throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
|
||||||
|
BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
|
||||||
|
BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));
|
||||||
|
|
||||||
|
return filePointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void seek(long target) {
|
||||||
|
// TODO: Validate the seek point.
|
||||||
|
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||||
|
synchronized(lock) {
|
||||||
|
clearBuffers();
|
||||||
|
position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
|
||||||
|
waitForBufferFill();
|
||||||
|
buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||||
|
|
||||||
|
if(validatingInputStream != null) {
|
||||||
|
try {
|
||||||
|
validatingInputStream.seek(target);
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void clearBuffers() {
|
||||||
|
this.position.reset();
|
||||||
|
|
||||||
|
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
||||||
|
// Indicate no data to be read.
|
||||||
|
buffer.clear();
|
||||||
|
buffer.limit(0);
|
||||||
|
|
||||||
|
blockOffsets.clear();
|
||||||
|
blockPositions.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean eof() {
|
||||||
|
synchronized(lock) {
|
||||||
|
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
||||||
|
return position != null && position.getBlockAddress() >= length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCheckCrcs(final boolean check) {
|
||||||
|
// TODO: Implement
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Submits a new access plan for the given dataset.
|
||||||
|
* @param position The next seek point for BAM data in this reader.
|
||||||
|
*/
|
||||||
|
public void submitAccessPlan(final SAMReaderPosition position) {
|
||||||
|
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
||||||
|
synchronized(lock) {
|
||||||
|
// Assume that the access plan is going to tell us to start where we are and move forward.
|
||||||
|
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
|
||||||
|
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
|
||||||
|
position.advancePosition(this.position.getBlockAddress());
|
||||||
|
}
|
||||||
|
this.position = position;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void compactBuffer() {
|
||||||
|
// Compact buffer to maximize storage space.
|
||||||
|
int bytesToRemove = 0;
|
||||||
|
|
||||||
|
// Look ahead to see if we can compact away the first block in the series.
|
||||||
|
while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
|
||||||
|
bytesToRemove += blockOffsets.remove();
|
||||||
|
blockPositions.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we end up with an empty block at the end of the series, compact this as well.
|
||||||
|
if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
|
||||||
|
bytesToRemove += buffer.position();
|
||||||
|
blockOffsets.remove();
|
||||||
|
blockPositions.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
int finalBufferStart = buffer.position() - bytesToRemove;
|
||||||
|
int finalBufferSize = buffer.remaining();
|
||||||
|
|
||||||
|
buffer.position(bytesToRemove);
|
||||||
|
buffer.compact();
|
||||||
|
|
||||||
|
buffer.position(finalBufferStart);
|
||||||
|
buffer.limit(finalBufferStart+finalBufferSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Push contents of incomingBuffer into the end of this buffer.
|
||||||
|
* MUST be called from a thread that is NOT the reader thread.
|
||||||
|
* @param incomingBuffer The data being pushed into this input stream.
|
||||||
|
* @param position target position for the data.
|
||||||
|
*/
|
||||||
|
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
|
||||||
|
synchronized(lock) {
|
||||||
|
try {
|
||||||
|
compactBuffer();
|
||||||
|
// Open up the buffer for more reading.
|
||||||
|
buffer.limit(buffer.capacity());
|
||||||
|
|
||||||
|
// Advance the position to take the most recent read into account.
|
||||||
|
long lastReadPosition = position.getBlockAddress();
|
||||||
|
|
||||||
|
byte[] validBytes = null;
|
||||||
|
if(validatingInputStream != null) {
|
||||||
|
validBytes = new byte[incomingBuffer.remaining()];
|
||||||
|
|
||||||
|
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
||||||
|
int pos = incomingBuffer.position();
|
||||||
|
int lim = incomingBuffer.limit();
|
||||||
|
incomingBuffer.get(currentBytes);
|
||||||
|
|
||||||
|
incomingBuffer.limit(lim);
|
||||||
|
incomingBuffer.position(pos);
|
||||||
|
|
||||||
|
long currentFilePointer = validatingInputStream.getFilePointer();
|
||||||
|
validatingInputStream.seek(lastReadPosition << 16);
|
||||||
|
validatingInputStream.read(validBytes);
|
||||||
|
validatingInputStream.seek(currentFilePointer);
|
||||||
|
|
||||||
|
if(!Arrays.equals(validBytes,currentBytes))
|
||||||
|
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
||||||
|
}
|
||||||
|
|
||||||
|
this.position = position;
|
||||||
|
position.advancePosition(filePosition);
|
||||||
|
|
||||||
|
if(buffer.remaining() < incomingBuffer.remaining()) {
|
||||||
|
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
|
||||||
|
lock.wait();
|
||||||
|
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queue list of block offsets / block positions.
|
||||||
|
blockOffsets.add(buffer.position());
|
||||||
|
blockPositions.add(lastReadPosition);
|
||||||
|
|
||||||
|
buffer.put(incomingBuffer);
|
||||||
|
|
||||||
|
// Set up the buffer for reading.
|
||||||
|
buffer.flip();
|
||||||
|
bufferFilled = true;
|
||||||
|
|
||||||
|
lock.notify();
|
||||||
|
}
|
||||||
|
catch(Exception ex) {
|
||||||
|
reportException(ex);
|
||||||
|
lock.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void reportException(Throwable t) {
|
||||||
|
synchronized(lock) {
|
||||||
|
this.error = t;
|
||||||
|
lock.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkForErrors() {
|
||||||
|
synchronized(lock) {
|
||||||
|
if(error != null) {
|
||||||
|
ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
|
||||||
|
toThrow.setStackTrace(error.getStackTrace());
|
||||||
|
throw toThrow;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads the next byte of data from the input stream.
|
||||||
|
* @return Next byte of data, from 0->255, as an int.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int read() {
|
||||||
|
byte[] singleByte = new byte[1];
|
||||||
|
read(singleByte);
|
||||||
|
return singleByte[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills the given byte array to the extent possible.
|
||||||
|
* @param bytes byte array to be filled.
|
||||||
|
* @return The number of bytes actually read.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int read(byte[] bytes) {
|
||||||
|
return read(bytes,0,bytes.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(byte[] bytes, final int offset, final int length) {
|
||||||
|
int remaining = length;
|
||||||
|
synchronized(lock) {
|
||||||
|
while(remaining > 0) {
|
||||||
|
// Check for error conditions during last read.
|
||||||
|
checkForErrors();
|
||||||
|
|
||||||
|
// If completely out of space, queue up another buffer fill.
|
||||||
|
waitForBufferFill();
|
||||||
|
|
||||||
|
// Couldn't manage to load any data at all; abort and return what's available.
|
||||||
|
if(buffer.remaining() == 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
int numBytesToCopy = Math.min(buffer.remaining(),remaining);
|
||||||
|
buffer.get(bytes,length-remaining+offset,numBytesToCopy);
|
||||||
|
remaining -= numBytesToCopy;
|
||||||
|
|
||||||
|
//if(remaining > 0)
|
||||||
|
// System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
|
||||||
|
// TODO: Assert that we don't copy across a block boundary
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify any waiting threads that some of the contents of the buffer were removed.
|
||||||
|
if(length-remaining > 0)
|
||||||
|
lock.notify();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(validatingInputStream != null) {
|
||||||
|
byte[] validBytes = new byte[length];
|
||||||
|
try {
|
||||||
|
validatingInputStream.read(validBytes,offset,length);
|
||||||
|
for(int i = offset; i < offset+length; i++) {
|
||||||
|
if(bytes[i] != validBytes[i]) {
|
||||||
|
System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
|
||||||
|
throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return length - remaining;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
if(validatingInputStream != null) {
|
||||||
|
try {
|
||||||
|
validatingInputStream.close();
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSource() {
|
||||||
|
return reader.getSamFilePath();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForBufferFill() {
|
||||||
|
synchronized(lock) {
|
||||||
|
bufferFilled = false;
|
||||||
|
if(buffer.remaining() == 0 && !eof()) {
|
||||||
|
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
||||||
|
dispatcher.queueBlockLoad(position);
|
||||||
|
try {
|
||||||
|
lock.wait();
|
||||||
|
}
|
||||||
|
catch(InterruptedException ex) {
|
||||||
|
// TODO: handle me.
|
||||||
|
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(bufferFilled && buffer.remaining() == 0)
|
||||||
|
throw new RuntimeEOFException("No more data left in InputStream");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,188 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.util.zip.DataFormatException;
|
||||||
|
import java.util.zip.Inflater;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An engine for loading blocks.
|
||||||
|
*/
|
||||||
|
class BlockLoader implements Runnable {
|
||||||
|
/**
|
||||||
|
* Coordinates the input queue.
|
||||||
|
*/
|
||||||
|
private BGZFBlockLoadingDispatcher dispatcher;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A cache from which to retrieve open file handles.
|
||||||
|
*/
|
||||||
|
private final FileHandleCache fileHandleCache;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether asynchronous decompression should happen.
|
||||||
|
*/
|
||||||
|
private final boolean decompress;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An direct input buffer for incoming data from disk.
|
||||||
|
*/
|
||||||
|
private final ByteBuffer inputBuffer;
|
||||||
|
|
||||||
|
public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
|
||||||
|
this.dispatcher = dispatcher;
|
||||||
|
this.fileHandleCache = fileHandleCache;
|
||||||
|
this.decompress = decompress;
|
||||||
|
|
||||||
|
this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||||
|
inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run() {
|
||||||
|
for(;;) {
|
||||||
|
SAMReaderPosition readerPosition = null;
|
||||||
|
try {
|
||||||
|
readerPosition = dispatcher.claimNextWorkRequest();
|
||||||
|
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
|
||||||
|
|
||||||
|
long blockAddress = readerPosition.getBlockAddress();
|
||||||
|
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
|
||||||
|
|
||||||
|
ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
|
||||||
|
long nextBlockAddress = position(inputStream);
|
||||||
|
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
|
||||||
|
|
||||||
|
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
|
||||||
|
int bytesCopied = block.remaining();
|
||||||
|
|
||||||
|
BlockInputStream bamInputStream = readerPosition.getInputStream();
|
||||||
|
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
|
||||||
|
|
||||||
|
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
|
||||||
|
}
|
||||||
|
catch(Throwable error) {
|
||||||
|
if(readerPosition != null && readerPosition.getInputStream() != null)
|
||||||
|
readerPosition.getInputStream().reportException(error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
|
||||||
|
FileChannel channel = inputStream.getChannel();
|
||||||
|
|
||||||
|
// Read the block header
|
||||||
|
channel.position(blockAddress);
|
||||||
|
|
||||||
|
int uncompressedDataSize = 0;
|
||||||
|
int bufferSize = 0;
|
||||||
|
|
||||||
|
do {
|
||||||
|
inputBuffer.clear();
|
||||||
|
inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
channel.read(inputBuffer);
|
||||||
|
|
||||||
|
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||||
|
// value into an int buffer to transfer the bitwise contents into an int.
|
||||||
|
inputBuffer.flip();
|
||||||
|
if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
|
||||||
|
throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass.");
|
||||||
|
|
||||||
|
// Verify that the file was read at a valid point.
|
||||||
|
if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
|
||||||
|
unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
|
||||||
|
unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
|
||||||
|
unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
|
||||||
|
unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
|
||||||
|
unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
|
||||||
|
throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
|
||||||
|
}
|
||||||
|
|
||||||
|
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
|
||||||
|
bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;
|
||||||
|
|
||||||
|
// Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block.
|
||||||
|
inputBuffer.limit(bufferSize);
|
||||||
|
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
channel.read(inputBuffer);
|
||||||
|
|
||||||
|
// Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block.
|
||||||
|
uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
|
||||||
|
//System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize);
|
||||||
|
}
|
||||||
|
while(uncompressedDataSize == 0 && channel.position() < channel.size());
|
||||||
|
|
||||||
|
// Prepare the buffer for reading.
|
||||||
|
inputBuffer.flip();
|
||||||
|
|
||||||
|
return inputBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
|
||||||
|
final int compressedBufferSize = bgzfBlock.remaining();
|
||||||
|
|
||||||
|
// Determine the uncompressed buffer size (
|
||||||
|
bgzfBlock.position(bgzfBlock.limit()-4);
|
||||||
|
int uncompressedBufferSize = bgzfBlock.getInt();
|
||||||
|
byte[] uncompressedContent = new byte[uncompressedBufferSize];
|
||||||
|
|
||||||
|
// Bound the CDATA section of the buffer.
|
||||||
|
bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
|
||||||
|
bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||||
|
byte[] compressedContent = new byte[bgzfBlock.remaining()];
|
||||||
|
ByteBuffer.wrap(compressedContent).put(bgzfBlock);
|
||||||
|
|
||||||
|
// Decompress the buffer.
|
||||||
|
final Inflater inflater = new Inflater(true);
|
||||||
|
inflater.setInput(compressedContent);
|
||||||
|
int bytesUncompressed = inflater.inflate(uncompressedContent);
|
||||||
|
if(bytesUncompressed != uncompressedBufferSize)
|
||||||
|
throw new ReviewedStingException("Error decompressing block");
|
||||||
|
|
||||||
|
return ByteBuffer.wrap(uncompressedContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
private long position(final FileInputStream inputStream) throws IOException {
|
||||||
|
return inputStream.getChannel().position();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int unpackUByte8(final ByteBuffer buffer,final int position) {
|
||||||
|
return buffer.get(position) & 0xFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int unpackUInt16(final ByteBuffer buffer,final int position) {
|
||||||
|
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||||
|
// value into an int buffer to transfer the bitwise contents into an int.
|
||||||
|
return buffer.getShort(position) & 0xFFFF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,231 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Queue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Caches frequently used file handles. Right now, caches only a single file handle.
|
||||||
|
* TODO: Generalize to support arbitrary file handle caches.
|
||||||
|
*/
|
||||||
|
public class FileHandleCache {
|
||||||
|
/**
|
||||||
|
* The underlying data structure storing file handles.
|
||||||
|
*/
|
||||||
|
private final FileHandleStorage fileHandleStorage;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many file handles should be kept open at once.
|
||||||
|
*/
|
||||||
|
private final int cacheSize;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A uniquifier: assign a unique ID to every instance of a file handle.
|
||||||
|
*/
|
||||||
|
private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A shared lock, private so that outside users cannot notify it.
|
||||||
|
*/
|
||||||
|
private final Object lock = new Object();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates how many file handles are outstanding at this point.
|
||||||
|
*/
|
||||||
|
private int numOutstandingFileHandles = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new file handle cache of the given cache size.
|
||||||
|
* @param cacheSize how many readers to hold open at once.
|
||||||
|
*/
|
||||||
|
public FileHandleCache(final int cacheSize) {
|
||||||
|
this.cacheSize = cacheSize;
|
||||||
|
fileHandleStorage = new FileHandleStorage();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves or opens a file handle for the given reader ID.
|
||||||
|
* @param key The ke
|
||||||
|
* @return A file input stream from the cache, if available, or otherwise newly opened.
|
||||||
|
*/
|
||||||
|
public FileInputStream claimFileInputStream(final SAMReaderID key) {
|
||||||
|
synchronized(lock) {
|
||||||
|
FileInputStream inputStream = findExistingEntry(key);
|
||||||
|
if(inputStream == null) {
|
||||||
|
try {
|
||||||
|
// If the cache is maxed out, wait for another file handle to emerge.
|
||||||
|
if(numOutstandingFileHandles >= cacheSize)
|
||||||
|
lock.wait();
|
||||||
|
}
|
||||||
|
catch(InterruptedException ex) {
|
||||||
|
throw new ReviewedStingException("Interrupted while waiting for a file handle");
|
||||||
|
}
|
||||||
|
inputStream = openInputStream(key);
|
||||||
|
}
|
||||||
|
numOutstandingFileHandles++;
|
||||||
|
|
||||||
|
//System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId());
|
||||||
|
return inputStream;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Releases the current reader and returns it to the cache.
|
||||||
|
* @param key The reader.
|
||||||
|
* @param inputStream The stream being used.
|
||||||
|
*/
|
||||||
|
public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
|
||||||
|
synchronized(lock) {
|
||||||
|
numOutstandingFileHandles--;
|
||||||
|
UniqueKey newID = allocateKey(key);
|
||||||
|
fileHandleStorage.put(newID,inputStream);
|
||||||
|
// Let any listeners know that another file handle has become available.
|
||||||
|
lock.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds an existing entry in the storage mechanism.
|
||||||
|
* @param key Reader.
|
||||||
|
* @return a cached stream, if available. Otherwise,
|
||||||
|
*/
|
||||||
|
private FileInputStream findExistingEntry(final SAMReaderID key) {
|
||||||
|
int existingHandles = getMostRecentUniquifier(key);
|
||||||
|
|
||||||
|
// See if any of the keys currently exist in the repository.
|
||||||
|
for(int i = 0; i <= existingHandles; i++) {
|
||||||
|
UniqueKey uniqueKey = new UniqueKey(key,i);
|
||||||
|
if(fileHandleStorage.containsKey(uniqueKey))
|
||||||
|
return fileHandleStorage.remove(uniqueKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the most recent uniquifier used for the given reader.
|
||||||
|
* @param reader Reader for which to determine uniqueness.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private int getMostRecentUniquifier(final SAMReaderID reader) {
|
||||||
|
if(keyCounter.containsKey(reader))
|
||||||
|
return keyCounter.get(reader);
|
||||||
|
else return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private UniqueKey allocateKey(final SAMReaderID reader) {
|
||||||
|
int uniquifier = getMostRecentUniquifier(reader)+1;
|
||||||
|
keyCounter.put(reader,uniquifier);
|
||||||
|
return new UniqueKey(reader,uniquifier);
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileInputStream openInputStream(final SAMReaderID reader) {
|
||||||
|
try {
|
||||||
|
return new FileInputStream(reader.getSamFilePath());
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new StingException("Unable to open input file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void closeInputStream(final FileInputStream inputStream) {
|
||||||
|
try {
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new StingException("Unable to open input file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Actually contains the file handles, purging them as they get too old.
|
||||||
|
*/
|
||||||
|
private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
|
||||||
|
/**
|
||||||
|
* Remove the oldest entry
|
||||||
|
* @param entry Entry to consider removing.
|
||||||
|
* @return True if the cache size has been exceeded. False otherwise.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
|
||||||
|
synchronized (lock) {
|
||||||
|
if(size() > cacheSize) {
|
||||||
|
keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
|
||||||
|
closeInputStream(entry.getValue());
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uniquifies a key by adding a numerical uniquifier.
|
||||||
|
*/
|
||||||
|
private class UniqueKey {
|
||||||
|
/**
|
||||||
|
* The file handle's key.
|
||||||
|
*/
|
||||||
|
private final SAMReaderID key;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A uniquifier, so that multiple of the same reader can exist in the cache.
|
||||||
|
*/
|
||||||
|
private final int uniqueID;
|
||||||
|
|
||||||
|
public UniqueKey(final SAMReaderID reader, final int uniqueID) {
|
||||||
|
this.key = reader;
|
||||||
|
this.uniqueID = uniqueID;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if(!(other instanceof UniqueKey))
|
||||||
|
return false;
|
||||||
|
UniqueKey otherUniqueKey = (UniqueKey)other;
|
||||||
|
return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return key.hashCode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
|
||||||
import net.sf.samtools.SAMFileSpan;
|
import net.sf.samtools.SAMFileSpan;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||||
|
|
@ -40,28 +41,25 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
public class FilePointer {
|
public class FilePointer {
|
||||||
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
|
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
|
||||||
protected final BAMOverlap overlap;
|
protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||||
protected final List<GenomeLoc> locations;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Does this file pointer point into an unmapped region?
|
* Does this file pointer point into an unmapped region?
|
||||||
*/
|
*/
|
||||||
protected final boolean isRegionUnmapped;
|
protected final boolean isRegionUnmapped;
|
||||||
|
|
||||||
public FilePointer() {
|
public FilePointer(final GenomeLoc... locations) {
|
||||||
this((BAMOverlap)null);
|
this.locations.addAll(Arrays.asList(locations));
|
||||||
}
|
boolean foundMapped = false, foundUnmapped = false;
|
||||||
|
for(GenomeLoc location: locations) {
|
||||||
public FilePointer(final GenomeLoc location) {
|
if(GenomeLoc.isUnmapped(location))
|
||||||
this.overlap = null;
|
foundUnmapped = true;
|
||||||
this.locations = Collections.singletonList(location);
|
else
|
||||||
this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
|
foundMapped = true;
|
||||||
}
|
}
|
||||||
|
if(foundMapped && foundUnmapped)
|
||||||
public FilePointer(final BAMOverlap overlap) {
|
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||||
this.overlap = overlap;
|
this.isRegionUnmapped = foundUnmapped;
|
||||||
this.locations = new ArrayList<GenomeLoc>();
|
|
||||||
this.isRegionUnmapped = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -217,4 +215,20 @@ public class FilePointer {
|
||||||
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
|
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
|
||||||
combined.addFileSpans(initialElement.getKey(),fileSpan);
|
combined.addFileSpans(initialElement.getKey(),fileSpan);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
builder.append("FilePointer:%n");
|
||||||
|
builder.append("\tlocations = {");
|
||||||
|
builder.append(Utils.join(";",locations));
|
||||||
|
builder.append("}%n\tregions = %n");
|
||||||
|
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
|
||||||
|
builder.append(entry.getKey());
|
||||||
|
builder.append("= {");
|
||||||
|
builder.append(entry.getValue());
|
||||||
|
builder.append("}");
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,419 +25,58 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
import net.sf.picard.util.PeekableIterator;
|
import net.sf.picard.util.PeekableIterator;
|
||||||
import net.sf.samtools.AbstractBAMFileIndex;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import net.sf.samtools.Bin;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import net.sf.samtools.BrowseableBAMIndex;
|
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Shard intervals based on position within the BAM file.
|
* Handles the process of aggregating BAM intervals into individual shards.
|
||||||
*
|
* TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
|
||||||
* @author mhanna
|
|
||||||
* @version 0.1
|
|
||||||
*/
|
*/
|
||||||
public class IntervalSharder {
|
public class IntervalSharder implements Iterator<FilePointer> {
|
||||||
private static Logger logger = Logger.getLogger(IntervalSharder.class);
|
/**
|
||||||
|
* The iterator actually laying out the data for BAM scheduling.
|
||||||
|
*/
|
||||||
|
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||||
|
|
||||||
public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
/**
|
||||||
return new IntervalSharder.FilePointerIterator(dataSource,loci);
|
* The parser, for interval manipulation.
|
||||||
|
*/
|
||||||
|
private final GenomeLocParser parser;
|
||||||
|
|
||||||
|
public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||||
|
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
|
||||||
|
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||||
|
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
|
||||||
|
}
|
||||||
|
|
||||||
|
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
    // Keep the parser for later interval manipulation, and make the scheduler peekable so that
    // the next file pointer can be inspected before it is consumed.
    this.parser = parser;
    this.wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return wrappedIterator.hasNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A lazy-loading iterator over file pointers.
|
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||||
|
* @return The next file pointer to process.
|
||||||
*/
|
*/
|
||||||
private static class FilePointerIterator implements Iterator<FilePointer> {
|
public FilePointer next() {
|
||||||
final SAMDataSource dataSource;
|
FilePointer current = wrappedIterator.next();
|
||||||
final GenomeLocSortedSet loci;
|
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||||
final PeekableIterator<GenomeLoc> locusIterator;
|
current = current.combine(parser,wrappedIterator.next());
|
||||||
final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>();
|
return current;
|
||||||
|
|
||||||
public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
|
||||||
this.dataSource = dataSource;
|
|
||||||
this.loci = loci;
|
|
||||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return !cachedFilePointers.isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public FilePointer next() {
|
|
||||||
if(!hasNext())
|
|
||||||
throw new NoSuchElementException("FilePointerIterator iteration is complete");
|
|
||||||
FilePointer filePointer = cachedFilePointers.remove();
|
|
||||||
if(cachedFilePointers.isEmpty())
|
|
||||||
advance();
|
|
||||||
return filePointer;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
|
|
||||||
}
|
|
||||||
|
|
||||||
private void advance() {
|
|
||||||
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
|
|
||||||
String contig = null;
|
|
||||||
|
|
||||||
// If the next section of the BAM to be processed is unmapped, handle this region separately.
|
|
||||||
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
|
|
||||||
contig = null;
|
|
||||||
while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
|
|
||||||
GenomeLoc nextLocus = locusIterator.next();
|
|
||||||
contig = nextLocus.getContig();
|
|
||||||
nextBatch.add(nextLocus);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(nextBatch.size() > 0) {
|
|
||||||
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||||
* Merge / split intervals based on an awareness of the structure of the BAM file.
|
|
||||||
* @param dataSource
|
|
||||||
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
|
|
||||||
* @param loci
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
|
|
||||||
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
|
|
||||||
if(contig == null) {
|
|
||||||
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
|
||||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
|
||||||
filePointer.addFileSpans(id,null);
|
|
||||||
return Collections.singletonList(filePointer);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
|
||||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
|
||||||
FilePointer lastFilePointer = null;
|
|
||||||
BAMOverlap lastBAMOverlap = null;
|
|
||||||
|
|
||||||
Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
|
|
||||||
IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
|
|
||||||
for(SAMReaderID id: dataSource.getReaderIDs()) {
|
|
||||||
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
|
|
||||||
// If this contig can't be found in the reference, skip over it.
|
|
||||||
if(referenceSequence == null && contig != null)
|
|
||||||
continue;
|
|
||||||
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
|
|
||||||
binMerger.addReader(id,
|
|
||||||
index,
|
|
||||||
referenceSequence.getSequenceIndex(),
|
|
||||||
index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
|
|
||||||
// Cache the reader for later data lookup.
|
|
||||||
readerToIndexMap.put(id,index);
|
|
||||||
}
|
|
||||||
|
|
||||||
PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
|
|
||||||
|
|
||||||
for(GenomeLoc location: loci) {
|
|
||||||
if(!location.getContig().equals(contig))
|
|
||||||
throw new ReviewedStingException("Location outside bounds of contig");
|
|
||||||
|
|
||||||
if(!binIterator.hasNext())
|
|
||||||
break;
|
|
||||||
|
|
||||||
int locationStart = location.getStart();
|
|
||||||
final int locationStop = location.getStop();
|
|
||||||
|
|
||||||
// Advance to first bin.
|
|
||||||
while(binIterator.peek().stop < locationStart)
|
|
||||||
binIterator.next();
|
|
||||||
|
|
||||||
// Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
|
|
||||||
// sure the extending bin is not pruned from the list.
|
|
||||||
List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
|
|
||||||
while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
|
|
||||||
bamOverlaps.add(binIterator.next());
|
|
||||||
if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
|
|
||||||
bamOverlaps.add(binIterator.peek());
|
|
||||||
|
|
||||||
// Bins found; try to match bins with locations.
|
|
||||||
Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
|
|
||||||
|
|
||||||
while(locationStop >= locationStart) {
|
|
||||||
int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
|
|
||||||
int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
|
|
||||||
|
|
||||||
while(binStop < locationStart && bamOverlapIterator.hasNext()) {
|
|
||||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
|
||||||
filePointers.add(lastFilePointer);
|
|
||||||
|
|
||||||
lastBAMOverlap = bamOverlapIterator.next();
|
|
||||||
lastFilePointer = new FilePointer(lastBAMOverlap);
|
|
||||||
binStart = lastFilePointer.overlap.start;
|
|
||||||
binStop = lastFilePointer.overlap.stop;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(locationStart < binStart) {
|
|
||||||
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
|
||||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
|
||||||
filePointers.add(lastFilePointer);
|
|
||||||
lastFilePointer = null;
|
|
||||||
lastBAMOverlap = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int regionStop = Math.min(locationStop,binStart-1);
|
|
||||||
|
|
||||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
|
|
||||||
lastFilePointer = new FilePointer(subset);
|
|
||||||
|
|
||||||
locationStart = regionStop + 1;
|
|
||||||
}
|
|
||||||
else if(locationStart > binStop) {
|
|
||||||
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
|
||||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
|
||||||
filePointers.add(lastFilePointer);
|
|
||||||
lastFilePointer = null;
|
|
||||||
lastBAMOverlap = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
|
|
||||||
filePointers.add(new FilePointer(subset));
|
|
||||||
|
|
||||||
locationStart = locationStop + 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if(lastFilePointer == null)
|
|
||||||
throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
|
|
||||||
|
|
||||||
// The start of the region overlaps the bin. Add the overlapping subset.
|
|
||||||
final int regionStop = Math.min(locationStop,binStop);
|
|
||||||
lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
|
|
||||||
locationStart = regionStop + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
|
||||||
filePointers.add(lastFilePointer);
|
|
||||||
|
|
||||||
// Lookup the locations for every file pointer in the index.
|
|
||||||
for(SAMReaderID id: readerToIndexMap.keySet()) {
|
|
||||||
BrowseableBAMIndex index = readerToIndexMap.get(id);
|
|
||||||
for(FilePointer filePointer: filePointers)
|
|
||||||
filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return filePointers;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class BinMergingIterator implements Iterator<BAMOverlap> {
|
|
||||||
private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
|
|
||||||
private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
|
|
||||||
|
|
||||||
public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
|
|
||||||
binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public BAMOverlap next() {
|
|
||||||
if(!hasNext())
|
|
||||||
throw new NoSuchElementException("No elements left in merging iterator");
|
|
||||||
if(pendingOverlaps.isEmpty())
|
|
||||||
advance();
|
|
||||||
return pendingOverlaps.remove();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void advance() {
|
|
||||||
List<ReaderBin> bins = new ArrayList<ReaderBin>();
|
|
||||||
int boundsStart, boundsStop;
|
|
||||||
|
|
||||||
// Prime the pump
|
|
||||||
if(binQueue.isEmpty())
|
|
||||||
return;
|
|
||||||
bins.add(getNextBin());
|
|
||||||
boundsStart = bins.get(0).getStart();
|
|
||||||
boundsStop = bins.get(0).getStop();
|
|
||||||
|
|
||||||
// Accumulate all the bins that overlap the current bin, in sorted order.
|
|
||||||
while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
|
|
||||||
ReaderBin bin = getNextBin();
|
|
||||||
bins.add(bin);
|
|
||||||
boundsStart = Math.min(boundsStart,bin.getStart());
|
|
||||||
boundsStop = Math.max(boundsStop,bin.getStop());
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
|
|
||||||
int start = bins.get(0).getStart();
|
|
||||||
int stop = bins.get(0).getStop();
|
|
||||||
while(start <= boundsStop) {
|
|
||||||
// Find the next stopping point.
|
|
||||||
for(ReaderBin bin: bins) {
|
|
||||||
stop = Math.min(stop,bin.getStop());
|
|
||||||
if(start < bin.getStart())
|
|
||||||
stop = Math.min(stop,bin.getStart()-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
range.add(new Pair<Integer,Integer>(start,stop));
|
|
||||||
// If the last entry added included the last element, stop.
|
|
||||||
if(stop >= boundsStop)
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Find the next start.
|
|
||||||
start = stop + 1;
|
|
||||||
for(ReaderBin bin: bins) {
|
|
||||||
if(start >= bin.getStart() && start <= bin.getStop())
|
|
||||||
break;
|
|
||||||
else if(start < bin.getStart()) {
|
|
||||||
start = bin.getStart();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add the next series of BAM overlaps to the window.
|
|
||||||
for(Pair<Integer,Integer> window: range) {
|
|
||||||
BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
|
|
||||||
for(ReaderBin bin: bins)
|
|
||||||
bamOverlap.addBin(bin.id,bin.bin);
|
|
||||||
pendingOverlaps.add(bamOverlap);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
|
|
||||||
|
|
||||||
private ReaderBin peekNextBin() {
|
|
||||||
if(binQueue.isEmpty())
|
|
||||||
throw new NoSuchElementException("No more bins are available");
|
|
||||||
BinQueueState current = binQueue.peek();
|
|
||||||
return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
|
|
||||||
}
|
|
||||||
|
|
||||||
private ReaderBin getNextBin() {
|
|
||||||
if(binQueue.isEmpty())
|
|
||||||
throw new NoSuchElementException("No more bins are available");
|
|
||||||
BinQueueState current = binQueue.remove();
|
|
||||||
ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
|
|
||||||
if(current.hasNextBin())
|
|
||||||
binQueue.add(current);
|
|
||||||
return readerBin;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Filters out bins not at the lowest level in the tree.
|
|
||||||
*/
|
|
||||||
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
|
|
||||||
private BrowseableBAMIndex index;
|
|
||||||
private Iterator<Bin> wrappedIterator;
|
|
||||||
|
|
||||||
private Bin nextBin;
|
|
||||||
|
|
||||||
public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
|
|
||||||
this.index = index;
|
|
||||||
this.wrappedIterator = iterator;
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return nextBin != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Bin next() {
|
|
||||||
Bin bin = nextBin;
|
|
||||||
advance();
|
|
||||||
return bin;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
|
|
||||||
|
|
||||||
private void advance() {
|
|
||||||
nextBin = null;
|
|
||||||
while(wrappedIterator.hasNext() && nextBin == null) {
|
|
||||||
Bin bin = wrappedIterator.next();
|
|
||||||
if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
|
|
||||||
nextBin = bin;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class BinQueueState implements Comparable<org.broadinstitute.sting.gatk.datasources.reads.BinQueueState> {
|
|
||||||
private final SAMReaderID id;
|
|
||||||
private final BrowseableBAMIndex index;
|
|
||||||
private final int referenceSequence;
|
|
||||||
private final PeekableIterator<Bin> bins;
|
|
||||||
|
|
||||||
private int firstLocusInCurrentBin;
|
|
||||||
private int lastLocusInCurrentBin;
|
|
||||||
|
|
||||||
public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
|
|
||||||
this.id = id;
|
|
||||||
this.index = index;
|
|
||||||
this.referenceSequence = referenceSequence;
|
|
||||||
this.bins = new PeekableIterator<Bin>(bins);
|
|
||||||
refreshLocusInBinCache();
|
|
||||||
}
|
|
||||||
|
|
||||||
public SAMReaderID getReaderID() {
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
public BrowseableBAMIndex getIndex() {
|
|
||||||
return index;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReferenceSequence() {
|
|
||||||
return referenceSequence;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNextBin() {
|
|
||||||
return bins.hasNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Bin peekNextBin() {
|
|
||||||
return bins.peek();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Bin nextBin() {
|
|
||||||
Bin nextBin = bins.next();
|
|
||||||
refreshLocusInBinCache();
|
|
||||||
return nextBin;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
|
|
||||||
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
|
|
||||||
if(!this.bins.hasNext()) return -1;
|
|
||||||
if(!this.bins.hasNext()) return 1;
|
|
||||||
|
|
||||||
// Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
|
|
||||||
if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
|
|
||||||
other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
|
|
||||||
throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
|
|
||||||
if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
|
|
||||||
return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
|
|
||||||
|
|
||||||
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
|
|
||||||
return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void refreshLocusInBinCache() {
|
|
||||||
firstLocusInCurrentBin = -1;
|
|
||||||
lastLocusInCurrentBin = -1;
|
|
||||||
if(bins.hasNext()) {
|
|
||||||
Bin bin = bins.peek();
|
|
||||||
firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
|
|
||||||
lastLocusInCurrentBin = index.getLastLocusInBin(bin);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -22,30 +22,34 @@
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package net.sf.samtools;
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A temporary solution to work around Java access rights issues:
|
* Batch granular file pointers into potentially larger shards.
|
||||||
* override chunk and make it public.
|
|
||||||
* TODO: Eliminate once we determine the final fate of the BAM index reading code.
|
|
||||||
*/
|
*/
|
||||||
public class GATKBinList extends BinList {
|
public class LocusShardBalancer extends ShardBalancer {
|
||||||
/**
|
/**
|
||||||
* Create a new BinList over sequenceCount sequences, consisting of the given bins.
|
* Convert iterators of file pointers into balanced iterators of shards.
|
||||||
* @param referenceSequence Reference sequence to which these bins are relevant.
|
* @return An iterator over balanced shards.
|
||||||
* @param bins The given bins to include.
|
|
||||||
*/
|
*/
|
||||||
public GATKBinList(final int referenceSequence, final BitSet bins) {
|
public Iterator<Shard> iterator() {
|
||||||
super(referenceSequence,bins);
|
return new Iterator<Shard>() {
|
||||||
}
|
public boolean hasNext() {
|
||||||
|
return filePointers.hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
public Shard next() {
|
||||||
* Retrieves the bins stored in this list.
|
FilePointer current = filePointers.next();
|
||||||
* @return A bitset where a bin is present in the list if the bit is true.
|
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
|
||||||
*/
|
current = current.combine(parser,filePointers.next());
|
||||||
public BitSet getBins() {
|
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
|
||||||
return super.getBins();
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,178 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
|
||||||
import net.sf.samtools.SAMFileSpan;
|
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A sharding strategy for loci based on reading of the index.
|
|
||||||
*/
|
|
||||||
public class LocusShardStrategy implements ShardStrategy {
|
|
||||||
/**
|
|
||||||
* The data source to use when performing this sharding.
|
|
||||||
*/
|
|
||||||
private final SAMDataSource reads;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* the parser for creating shards
|
|
||||||
*/
|
|
||||||
private GenomeLocParser genomeLocParser;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An iterator through the available file pointers.
|
|
||||||
*/
|
|
||||||
private final Iterator<FilePointer> filePointerIterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
|
||||||
* @param reads Data source from which to load index data.
|
|
||||||
* @param locations List of locations for which to load data.
|
|
||||||
*/
|
|
||||||
public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
|
|
||||||
this.reads = reads;
|
|
||||||
this.genomeLocParser = genomeLocParser;
|
|
||||||
|
|
||||||
if(!reads.isEmpty()) {
|
|
||||||
GenomeLocSortedSet intervals;
|
|
||||||
if(locations == null) {
|
|
||||||
// If no locations were passed in, shard the entire BAM file.
|
|
||||||
SAMFileHeader header = reads.getHeader();
|
|
||||||
intervals = new GenomeLocSortedSet(genomeLocParser);
|
|
||||||
|
|
||||||
for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
|
|
||||||
// Check this sequence against the reference sequence dictionary.
|
|
||||||
// TODO: Do a better job of merging reads + reference.
|
|
||||||
SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
|
|
||||||
if(refSequenceRecord != null) {
|
|
||||||
final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
|
|
||||||
intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
intervals = locations;
|
|
||||||
|
|
||||||
if(reads.isLowMemoryShardingEnabled()) {
|
|
||||||
/*
|
|
||||||
Iterator<FilePointer> filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
|
||||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
|
||||||
while(filePointerIterator.hasNext())
|
|
||||||
filePointers.add(filePointerIterator.next());
|
|
||||||
this.filePointerIterator = filePointers.iterator();
|
|
||||||
*/
|
|
||||||
this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
final int maxShardSize = 100000;
|
|
||||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
|
||||||
if(locations == null) {
|
|
||||||
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
|
||||||
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
|
||||||
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
|
||||||
filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
for(GenomeLoc interval: locations) {
|
|
||||||
while(interval.size() > maxShardSize) {
|
|
||||||
filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
|
|
||||||
interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
|
||||||
}
|
|
||||||
filePointers.add(new FilePointer(interval));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
filePointerIterator = filePointers.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* returns true if there are additional shards
|
|
||||||
*
|
|
||||||
* @return false if we're done processing shards
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
return filePointerIterator.hasNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
public long shardNumber = 0;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* gets the next Shard
|
|
||||||
*
|
|
||||||
* @return the next shard
|
|
||||||
*/
|
|
||||||
public LocusShard next() {
|
|
||||||
FilePointer nextFilePointer = filePointerIterator.next();
|
|
||||||
Map<SAMReaderID,SAMFileSpan> fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
|
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.printf("Shard %d: interval = {",++shardNumber);
|
|
||||||
for(GenomeLoc locus: nextFilePointer.locations)
|
|
||||||
System.out.printf("%s;",locus);
|
|
||||||
System.out.printf("}; ");
|
|
||||||
|
|
||||||
if(fileSpansBounding == null)
|
|
||||||
System.out.printf("no shard data%n");
|
|
||||||
else {
|
|
||||||
SortedMap<SAMReaderID,SAMFileSpan> sortedSpans = new TreeMap<SAMReaderID,SAMFileSpan>(fileSpansBounding);
|
|
||||||
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: sortedSpans.entrySet()) {
|
|
||||||
System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** we don't support the remove command */
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* makes the IntervalShard iterable, i.e. usable in a for loop.
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public Iterator<Shard> iterator() {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,68 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.picard.util.PeekableIterator;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Handles the process of aggregating BAM intervals into individual shards.
|
|
||||||
*/
|
|
||||||
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
|
||||||
/**
|
|
||||||
* The iterator actually laying out the data for BAM scheduling.
|
|
||||||
*/
|
|
||||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The parser, for interval manipulation.
|
|
||||||
*/
|
|
||||||
private final GenomeLocParser parser;
|
|
||||||
|
|
||||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
|
||||||
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
|
||||||
parser = loci.getGenomeLocParser();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return wrappedIterator.hasNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
|
||||||
* @return The next file pointer to process.
|
|
||||||
*/
|
|
||||||
public FilePointer next() {
|
|
||||||
FilePointer current = wrappedIterator.next();
|
|
||||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
|
||||||
current = current.combine(parser,wrappedIterator.next());
|
|
||||||
return current;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
|
||||||
}
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A single, monolithic shard bridging all available data.
|
|
||||||
* @author mhanna
|
|
||||||
* @version 0.1
|
|
||||||
*/
|
|
||||||
public class MonolithicShard extends Shard {
|
|
||||||
/**
|
|
||||||
* Creates a new monolithic shard of the given type.
|
|
||||||
* @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
|
|
||||||
* @param locs Intervals that this monolithic shard should process.
|
|
||||||
*/
|
|
||||||
public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
|
|
||||||
super(parser, shardType, locs, readsDataSource, null, false);
|
|
||||||
if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
|
|
||||||
throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* String representation of this shard.
|
|
||||||
* @return "entire genome".
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return "entire genome";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,77 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a giant shard representing all the data in the input BAM(s).
|
|
||||||
*
|
|
||||||
* @author mhanna
|
|
||||||
* @version 0.1
|
|
||||||
*/
|
|
||||||
public class MonolithicShardStrategy implements ShardStrategy {
|
|
||||||
/**
|
|
||||||
* The single shard associated with this sharding strategy.
|
|
||||||
*/
|
|
||||||
private MonolithicShard shard;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a new shard strategy for shards of the given type.
|
|
||||||
* @param shardType The shard type.
|
|
||||||
*/
|
|
||||||
public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
|
|
||||||
shard = new MonolithicShard(parser,readsDataSource,shardType,region);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator;
|
|
||||||
* will only return another copy of the active iterator.
|
|
||||||
* @return A copy of this.
|
|
||||||
*/
|
|
||||||
public Iterator<Shard> iterator() {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns true if the monolithic shard has not yet been consumed, or false otherwise.
|
|
||||||
* @return True if shard has been consumed, false otherwise.
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
return shard != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the monolithic shard if it has not already been retrieved.
|
|
||||||
* @return The monolithic shard.
|
|
||||||
* @throws NoSuchElementException if no such data exists.
|
|
||||||
*/
|
|
||||||
public Shard next() {
|
|
||||||
if(shard == null)
|
|
||||||
throw new NoSuchElementException("Monolithic shard has already been retrived.");
|
|
||||||
|
|
||||||
Shard working = shard;
|
|
||||||
shard = null;
|
|
||||||
return working;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
|
||||||
*/
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Cannot remove from a shard strategy");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
|
||||||
* @param size adjust the next size to this
|
|
||||||
*/
|
|
||||||
public void adjustNextShardSize( long size ) {
|
|
||||||
throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
@ -35,10 +35,15 @@ import java.util.Map;
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
*/
|
*/
|
||||||
public class ReadShard extends Shard {
|
public class ReadShard extends Shard {
|
||||||
|
/**
|
||||||
|
* What is the maximum number of reads which should go into a read shard.
|
||||||
|
*/
|
||||||
|
public static final int MAX_READS = 10000;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The reads making up this shard.
|
* The reads making up this shard.
|
||||||
*/
|
*/
|
||||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
|
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
|
||||||
|
|
||||||
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
|
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
|
||||||
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
|
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
|
||||||
|
|
@ -66,7 +71,7 @@ public class ReadShard extends Shard {
|
||||||
* @return True if this shard's buffer is full (and the shard can buffer reads).
|
* @return True if this shard's buffer is full (and the shard can buffer reads).
|
||||||
*/
|
*/
|
||||||
public boolean isBufferFull() {
|
public boolean isBufferFull() {
|
||||||
return reads.size() > ReadShardStrategy.MAX_READS;
|
return reads.size() > ReadShard.MAX_READS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
|
import net.sf.samtools.SAMFileSpan;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||||
|
*/
|
||||||
|
public class ReadShardBalancer extends ShardBalancer {
|
||||||
|
/**
|
||||||
|
* Convert iterators of file pointers into balanced iterators of shards.
|
||||||
|
* @return An iterator over balanced shards.
|
||||||
|
*/
|
||||||
|
public Iterator<Shard> iterator() {
|
||||||
|
return new Iterator<Shard>() {
|
||||||
|
/**
|
||||||
|
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||||
|
*/
|
||||||
|
private Shard nextShard = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The file pointer currently being processed.
|
||||||
|
*/
|
||||||
|
private FilePointer currentFilePointer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ending position of the last shard in the file.
|
||||||
|
*/
|
||||||
|
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||||
|
|
||||||
|
{
|
||||||
|
if(filePointers.hasNext())
|
||||||
|
currentFilePointer = filePointers.next();
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return nextShard != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Shard next() {
|
||||||
|
if(!hasNext())
|
||||||
|
throw new NoSuchElementException("No next read shard available");
|
||||||
|
Shard currentShard = nextShard;
|
||||||
|
advance();
|
||||||
|
return currentShard;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void advance() {
|
||||||
|
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||||
|
nextShard = null;
|
||||||
|
|
||||||
|
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||||
|
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||||
|
shardPosition = currentFilePointer.fileSpans;
|
||||||
|
|
||||||
|
for(SAMReaderID id: shardPosition.keySet()) {
|
||||||
|
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||||
|
if(!fileSpan.isEmpty())
|
||||||
|
selectedReaders.put(id,fileSpan);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(selectedReaders.size() > 0) {
|
||||||
|
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||||
|
readsDataSource.fillShard(shard);
|
||||||
|
|
||||||
|
if(!shard.isBufferEmpty()) {
|
||||||
|
nextShard = shard;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedReaders.clear();
|
||||||
|
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
position = readsDataSource.getCurrentPosition();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,183 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileSpan;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
|
||||||
* has a specific number of reads (default to 10K) which is configured in the constructor.
|
|
||||||
* @author aaron
|
|
||||||
* @version 1.0
|
|
||||||
* @date Apr 14, 2009
|
|
||||||
*/
|
|
||||||
public class ReadShardStrategy implements ShardStrategy {
|
|
||||||
/**
|
|
||||||
* What is the maximum number of reads which should go into a read shard.
|
|
||||||
*/
|
|
||||||
protected static final int MAX_READS = 10000;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The data source used to shard.
|
|
||||||
*/
|
|
||||||
private final SAMDataSource dataSource;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The intervals to be processed.
|
|
||||||
*/
|
|
||||||
private final GenomeLocSortedSet locations;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
|
||||||
*/
|
|
||||||
private Shard nextShard = null;
|
|
||||||
|
|
||||||
/** our storage of the genomic locations they'd like to shard over */
|
|
||||||
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Iterator over the list of file pointers.
|
|
||||||
*/
|
|
||||||
private final Iterator<FilePointer> filePointerIterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The file pointer currently being processed.
|
|
||||||
*/
|
|
||||||
private FilePointer currentFilePointer;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Ending position of the last shard in the file.
|
|
||||||
*/
|
|
||||||
private Map<SAMReaderID,SAMFileSpan> position;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An indicator whether the strategy has sharded into the unmapped region.
|
|
||||||
*/
|
|
||||||
private boolean isIntoUnmappedRegion = false;
|
|
||||||
|
|
||||||
private final GenomeLocParser parser;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a new read shard strategy, loading read shards from the given BAM file.
|
|
||||||
* @param dataSource Data source from which to load shards.
|
|
||||||
* @param locations intervals to use for sharding.
|
|
||||||
*/
|
|
||||||
public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
|
||||||
this.dataSource = dataSource;
|
|
||||||
this.parser = parser;
|
|
||||||
this.position = this.dataSource.getCurrentPosition();
|
|
||||||
this.locations = locations;
|
|
||||||
|
|
||||||
if(locations != null)
|
|
||||||
filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
|
|
||||||
else
|
|
||||||
filePointerIterator = filePointers.iterator();
|
|
||||||
|
|
||||||
if(filePointerIterator.hasNext())
|
|
||||||
currentFilePointer = filePointerIterator.next();
|
|
||||||
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* do we have another read shard?
|
|
||||||
* @return True if any more data is available. False otherwise.
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
return nextShard != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves the next shard, if available.
|
|
||||||
* @return The next shard, if available.
|
|
||||||
* @throws java.util.NoSuchElementException if no such shard is available.
|
|
||||||
*/
|
|
||||||
public Shard next() {
|
|
||||||
if(!hasNext())
|
|
||||||
throw new NoSuchElementException("No next read shard available");
|
|
||||||
Shard currentShard = nextShard;
|
|
||||||
advance();
|
|
||||||
return currentShard;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void advance() {
|
|
||||||
Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
|
|
||||||
nextShard = null;
|
|
||||||
|
|
||||||
if(locations != null) {
|
|
||||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
|
||||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
|
||||||
shardPosition = currentFilePointer.fileSpans;
|
|
||||||
|
|
||||||
for(SAMReaderID id: shardPosition.keySet()) {
|
|
||||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
|
||||||
if(!fileSpan.isEmpty())
|
|
||||||
selectedReaders.put(id,fileSpan);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(selectedReaders.size() > 0) {
|
|
||||||
Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
|
||||||
dataSource.fillShard(shard);
|
|
||||||
|
|
||||||
if(!shard.isBufferEmpty()) {
|
|
||||||
nextShard = shard;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
selectedReaders.clear();
|
|
||||||
currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// todo -- this nulling of intervals is a bit annoying since readwalkers without
|
|
||||||
// todo -- any -L values need to be special cased throughout the code.
|
|
||||||
Shard shard = new ReadShard(parser,dataSource,position,null,false);
|
|
||||||
dataSource.fillShard(shard);
|
|
||||||
nextShard = !shard.isBufferEmpty() ? shard : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
this.position = dataSource.getCurrentPosition();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @throws UnsupportedOperationException always.
|
|
||||||
*/
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Remove not supported");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convenience method for using ShardStrategy in an foreach loop.
|
|
||||||
* @return A iterator over shards.
|
|
||||||
*/
|
|
||||||
public Iterator<Shard> iterator() {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.samtools.Bin;
|
|
||||||
import net.sf.samtools.BrowseableBAMIndex;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: mhanna
|
|
||||||
* Date: Feb 2, 2011
|
|
||||||
* Time: 4:36:40 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
class ReaderBin {
|
|
||||||
public final SAMReaderID id;
|
|
||||||
public final BrowseableBAMIndex index;
|
|
||||||
public final int referenceSequence;
|
|
||||||
public final Bin bin;
|
|
||||||
|
|
||||||
public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
|
|
||||||
this.id = id;
|
|
||||||
this.index = index;
|
|
||||||
this.referenceSequence = referenceSequence;
|
|
||||||
this.bin = bin;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getStart() {
|
|
||||||
return index.getFirstLocusInBin(bin);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getStop() {
|
|
||||||
return index.getLastLocusInBin(bin);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
@ -71,7 +73,7 @@ public class SAMDataSource {
|
||||||
/**
|
/**
|
||||||
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
|
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
|
||||||
*/
|
*/
|
||||||
private final GenomeLocParser genomeLocParser;
|
protected final GenomeLocParser genomeLocParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Identifiers for the readers driving this data source.
|
* Identifiers for the readers driving this data source.
|
||||||
|
|
@ -91,13 +93,18 @@ public class SAMDataSource {
|
||||||
/**
|
/**
|
||||||
* How far along is each reader?
|
* How far along is each reader?
|
||||||
*/
|
*/
|
||||||
private final Map<SAMReaderID, SAMFileSpan> readerPositions = new HashMap<SAMReaderID,SAMFileSpan>();
|
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The merged header.
|
* The merged header.
|
||||||
*/
|
*/
|
||||||
private final SAMFileHeader mergedHeader;
|
private final SAMFileHeader mergedHeader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The constituent headers of the unmerged files.
|
||||||
|
*/
|
||||||
|
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The sort order of the BAM files. Files without a sort order tag are assumed to be
|
* The sort order of the BAM files. Files without a sort order tag are assumed to be
|
||||||
* in coordinate order.
|
* in coordinate order.
|
||||||
|
|
@ -131,17 +138,24 @@ public class SAMDataSource {
|
||||||
private final SAMResourcePool resourcePool;
|
private final SAMResourcePool resourcePool;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whether to enable the new low-memory sharding mechanism.
|
* Asynchronously loads BGZF blocks.
|
||||||
*/
|
*/
|
||||||
private boolean enableLowMemorySharding = false;
|
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How are threads allocated.
|
||||||
|
*/
|
||||||
|
private final ThreadAllocation threadAllocation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new SAM data source given the supplied read metadata.
|
* Create a new SAM data source given the supplied read metadata.
|
||||||
* @param samFiles list of reads files.
|
* @param samFiles list of reads files.
|
||||||
*/
|
*/
|
||||||
public SAMDataSource(Collection<SAMReaderID> samFiles,GenomeLocParser genomeLocParser) {
|
public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
|
||||||
this(
|
this(
|
||||||
samFiles,
|
samFiles,
|
||||||
|
threadAllocation,
|
||||||
|
numFileHandles,
|
||||||
genomeLocParser,
|
genomeLocParser,
|
||||||
false,
|
false,
|
||||||
SAMFileReader.ValidationStringency.STRICT,
|
SAMFileReader.ValidationStringency.STRICT,
|
||||||
|
|
@ -150,8 +164,7 @@ public class SAMDataSource {
|
||||||
new ValidationExclusion(),
|
new ValidationExclusion(),
|
||||||
new ArrayList<ReadFilter>(),
|
new ArrayList<ReadFilter>(),
|
||||||
false,
|
false,
|
||||||
false,
|
false);
|
||||||
true);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -159,6 +172,8 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
public SAMDataSource(
|
public SAMDataSource(
|
||||||
Collection<SAMReaderID> samFiles,
|
Collection<SAMReaderID> samFiles,
|
||||||
|
ThreadAllocation threadAllocation,
|
||||||
|
Integer numFileHandles,
|
||||||
GenomeLocParser genomeLocParser,
|
GenomeLocParser genomeLocParser,
|
||||||
boolean useOriginalBaseQualities,
|
boolean useOriginalBaseQualities,
|
||||||
SAMFileReader.ValidationStringency strictness,
|
SAMFileReader.ValidationStringency strictness,
|
||||||
|
|
@ -167,9 +182,10 @@ public class SAMDataSource {
|
||||||
ValidationExclusion exclusionList,
|
ValidationExclusion exclusionList,
|
||||||
Collection<ReadFilter> supplementalFilters,
|
Collection<ReadFilter> supplementalFilters,
|
||||||
boolean includeReadsWithDeletionAtLoci,
|
boolean includeReadsWithDeletionAtLoci,
|
||||||
boolean generateExtendedEvents,
|
boolean generateExtendedEvents) {
|
||||||
boolean enableLowMemorySharding) {
|
|
||||||
this( samFiles,
|
this( samFiles,
|
||||||
|
threadAllocation,
|
||||||
|
numFileHandles,
|
||||||
genomeLocParser,
|
genomeLocParser,
|
||||||
useOriginalBaseQualities,
|
useOriginalBaseQualities,
|
||||||
strictness,
|
strictness,
|
||||||
|
|
@ -182,8 +198,7 @@ public class SAMDataSource {
|
||||||
BAQ.CalculationMode.OFF,
|
BAQ.CalculationMode.OFF,
|
||||||
BAQ.QualityMode.DONT_MODIFY,
|
BAQ.QualityMode.DONT_MODIFY,
|
||||||
null, // no BAQ
|
null, // no BAQ
|
||||||
(byte) -1,
|
(byte) -1);
|
||||||
enableLowMemorySharding);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -205,6 +220,8 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
public SAMDataSource(
|
public SAMDataSource(
|
||||||
Collection<SAMReaderID> samFiles,
|
Collection<SAMReaderID> samFiles,
|
||||||
|
ThreadAllocation threadAllocation,
|
||||||
|
Integer numFileHandles,
|
||||||
GenomeLocParser genomeLocParser,
|
GenomeLocParser genomeLocParser,
|
||||||
boolean useOriginalBaseQualities,
|
boolean useOriginalBaseQualities,
|
||||||
SAMFileReader.ValidationStringency strictness,
|
SAMFileReader.ValidationStringency strictness,
|
||||||
|
|
@ -217,13 +234,19 @@ public class SAMDataSource {
|
||||||
BAQ.CalculationMode cmode,
|
BAQ.CalculationMode cmode,
|
||||||
BAQ.QualityMode qmode,
|
BAQ.QualityMode qmode,
|
||||||
IndexedFastaSequenceFile refReader,
|
IndexedFastaSequenceFile refReader,
|
||||||
byte defaultBaseQualities,
|
byte defaultBaseQualities) {
|
||||||
boolean enableLowMemorySharding) {
|
|
||||||
this.enableLowMemorySharding(enableLowMemorySharding);
|
|
||||||
this.readMetrics = new ReadMetrics();
|
this.readMetrics = new ReadMetrics();
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
|
|
||||||
readerIDs = samFiles;
|
readerIDs = samFiles;
|
||||||
|
|
||||||
|
this.threadAllocation = threadAllocation;
|
||||||
|
// TODO: Consider a borrowed-thread dispatcher implementation.
|
||||||
|
if(this.threadAllocation.getNumIOThreads() > 0)
|
||||||
|
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
|
||||||
|
else
|
||||||
|
dispatcher = null;
|
||||||
|
|
||||||
validationStringency = strictness;
|
validationStringency = strictness;
|
||||||
for (SAMReaderID readerID : samFiles) {
|
for (SAMReaderID readerID : samFiles) {
|
||||||
if (!readerID.samFile.canRead())
|
if (!readerID.samFile.canRead())
|
||||||
|
|
@ -235,10 +258,13 @@ public class SAMDataSource {
|
||||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||||
|
|
||||||
// Determine the sort order.
|
// Determine the sort order.
|
||||||
for(SAMFileReader reader: readers.values()) {
|
for(SAMReaderID readerID: readerIDs) {
|
||||||
// Get the sort order, forcing it to coordinate if unsorted.
|
// Get the sort order, forcing it to coordinate if unsorted.
|
||||||
|
SAMFileReader reader = readers.getReader(readerID);
|
||||||
SAMFileHeader header = reader.getFileHeader();
|
SAMFileHeader header = reader.getFileHeader();
|
||||||
|
|
||||||
|
headers.put(readerID,header);
|
||||||
|
|
||||||
if ( header.getReadGroups().isEmpty() ) {
|
if ( header.getReadGroups().isEmpty() ) {
|
||||||
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
|
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
|
||||||
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
|
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
|
||||||
|
|
@ -275,7 +301,7 @@ public class SAMDataSource {
|
||||||
qmode,
|
qmode,
|
||||||
refReader,
|
refReader,
|
||||||
defaultBaseQualities);
|
defaultBaseQualities);
|
||||||
|
|
||||||
// cache the read group id (original) -> read group id (merged)
|
// cache the read group id (original) -> read group id (merged)
|
||||||
// and read group id (merged) -> read group id (original) mappings.
|
// and read group id (merged) -> read group id (original) mappings.
|
||||||
for(SAMReaderID id: readerIDs) {
|
for(SAMReaderID id: readerIDs) {
|
||||||
|
|
@ -296,12 +322,10 @@ public class SAMDataSource {
|
||||||
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(enableLowMemorySharding) {
|
for(SAMReaderID id: readerIDs) {
|
||||||
for(SAMReaderID id: readerIDs) {
|
File indexFile = findIndexFile(id.samFile);
|
||||||
File indexFile = findIndexFile(id.samFile);
|
if(indexFile != null)
|
||||||
if(indexFile != null)
|
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
resourcePool.releaseReaders(readers);
|
resourcePool.releaseReaders(readers);
|
||||||
|
|
@ -314,22 +338,6 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
public ReadProperties getReadsInfo() { return readProperties; }
|
public ReadProperties getReadsInfo() { return readProperties; }
|
||||||
|
|
||||||
/**
|
|
||||||
* Enable experimental low-memory sharding.
|
|
||||||
* @param enable True to enable sharding. False otherwise.
|
|
||||||
*/
|
|
||||||
public void enableLowMemorySharding(final boolean enable) {
|
|
||||||
enableLowMemorySharding = enable;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns whether low-memory sharding is enabled.
|
|
||||||
* @return True if enabled, false otherwise.
|
|
||||||
*/
|
|
||||||
public boolean isLowMemoryShardingEnabled() {
|
|
||||||
return enableLowMemorySharding;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks to see whether any reads files are supplying data.
|
* Checks to see whether any reads files are supplying data.
|
||||||
* @return True if no reads files are supplying data to the traversal; false otherwise.
|
* @return True if no reads files are supplying data to the traversal; false otherwise.
|
||||||
|
|
@ -368,7 +376,7 @@ public class SAMDataSource {
|
||||||
* Retrieves the current position within the BAM file.
|
* Retrieves the current position within the BAM file.
|
||||||
* @return A mapping of reader to current position.
|
* @return A mapping of reader to current position.
|
||||||
*/
|
*/
|
||||||
public Map<SAMReaderID,SAMFileSpan> getCurrentPosition() {
|
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
|
||||||
return readerPositions;
|
return readerPositions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -381,7 +389,7 @@ public class SAMDataSource {
|
||||||
}
|
}
|
||||||
|
|
||||||
public SAMFileHeader getHeader(SAMReaderID id) {
|
public SAMFileHeader getHeader(SAMReaderID id) {
|
||||||
return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader();
|
return headers.get(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -404,45 +412,21 @@ public class SAMDataSource {
|
||||||
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
|
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* No read group collisions at this time because only one SAM file is currently supported.
|
|
||||||
* @return False always.
|
|
||||||
*/
|
|
||||||
public boolean hasReadGroupCollisions() {
|
|
||||||
return hasReadGroupCollisions;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* True if all readers have an index.
|
* True if all readers have an index.
|
||||||
* @return True if all readers have an index.
|
* @return True if all readers have an index.
|
||||||
*/
|
*/
|
||||||
public boolean hasIndex() {
|
public boolean hasIndex() {
|
||||||
if(enableLowMemorySharding)
|
return readerIDs.size() == bamIndices.size();
|
||||||
return readerIDs.size() == bamIndices.size();
|
|
||||||
else {
|
|
||||||
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
|
|
||||||
if(!reader.hasIndex())
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the index for a particular reader. Always preloaded.
|
* Gets the index for a particular reader. Always preloaded.
|
||||||
* TODO: Should return object of type GATKBAMIndex, but cannot because there
|
|
||||||
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
|
|
||||||
* TODO: sharding system goes live.
|
|
||||||
* @param id Id of the reader.
|
* @param id Id of the reader.
|
||||||
* @return The index. Will preload the index if necessary.
|
* @return The index. Will preload the index if necessary.
|
||||||
*/
|
*/
|
||||||
public Object getIndex(final SAMReaderID id) {
|
public GATKBAMIndex getIndex(final SAMReaderID id) {
|
||||||
if(enableLowMemorySharding)
|
return bamIndices.get(id);
|
||||||
return bamIndices.get(id);
|
|
||||||
else {
|
|
||||||
SAMReaders readers = resourcePool.getReadersWithoutLocking();
|
|
||||||
return readers.getReader(id).getBrowseableIndex();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -454,7 +438,7 @@ public class SAMDataSource {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the cumulative read metrics for shards already processed.
|
* Gets the cumulative read metrics for shards already processed.
|
||||||
* @return Cumulative read metrics.
|
* @return Cumulative read metrics.
|
||||||
*/
|
*/
|
||||||
public ReadMetrics getCumulativeReadMetrics() {
|
public ReadMetrics getCumulativeReadMetrics() {
|
||||||
|
|
@ -507,10 +491,6 @@ public class SAMDataSource {
|
||||||
}
|
}
|
||||||
|
|
||||||
public StingSAMIterator seek(Shard shard) {
|
public StingSAMIterator seek(Shard shard) {
|
||||||
// todo: refresh monolithic sharding implementation
|
|
||||||
if(shard instanceof MonolithicShard)
|
|
||||||
return seekMonolithic(shard);
|
|
||||||
|
|
||||||
if(shard.buffersReads()) {
|
if(shard.buffersReads()) {
|
||||||
return shard.iterator();
|
return shard.iterator();
|
||||||
}
|
}
|
||||||
|
|
@ -540,7 +520,7 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
private void initializeReaderPositions(SAMReaders readers) {
|
private void initializeReaderPositions(SAMReaders readers) {
|
||||||
for(SAMReaderID id: getReaderIDs())
|
for(SAMReaderID id: getReaderIDs())
|
||||||
readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads());
|
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -548,7 +528,6 @@ public class SAMDataSource {
|
||||||
* @param readers Readers from which to load data.
|
* @param readers Readers from which to load data.
|
||||||
* @param shard The shard specifying the data limits.
|
* @param shard The shard specifying the data limits.
|
||||||
* @param enableVerification True to verify. For compatibility with old sharding strategy.
|
* @param enableVerification True to verify. For compatibility with old sharding strategy.
|
||||||
* TODO: Collapse this flag when the two sharding systems are merged.
|
|
||||||
* @return An iterator over the selected data.
|
* @return An iterator over the selected data.
|
||||||
*/
|
*/
|
||||||
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
|
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
|
||||||
|
|
@ -559,14 +538,20 @@ public class SAMDataSource {
|
||||||
|
|
||||||
for(SAMReaderID id: getReaderIDs()) {
|
for(SAMReaderID id: getReaderIDs()) {
|
||||||
CloseableIterator<SAMRecord> iterator = null;
|
CloseableIterator<SAMRecord> iterator = null;
|
||||||
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
|
|
||||||
continue;
|
// TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
|
||||||
iterator = shard.getFileSpans().get(id) != null ?
|
// TODO: Kill this check once we've proven that the design elements are gone.
|
||||||
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
|
if(shard.getFileSpans().get(id) == null)
|
||||||
readers.getReader(id).queryUnmapped();
|
throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
|
||||||
|
|
||||||
|
if(threadAllocation.getNumIOThreads() > 0) {
|
||||||
|
BlockInputStream inputStream = readers.getInputStream(id);
|
||||||
|
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
|
||||||
|
}
|
||||||
|
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||||
if(readProperties.getReadBufferSize() != null)
|
if(readProperties.getReadBufferSize() != null)
|
||||||
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
||||||
if(shard.getGenomeLocs() != null)
|
if(shard.getGenomeLocs().size() > 0)
|
||||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||||
mergingIterator.addIterator(readers.getReader(id),iterator);
|
mergingIterator.addIterator(readers.getReader(id),iterator);
|
||||||
}
|
}
|
||||||
|
|
@ -584,33 +569,6 @@ public class SAMDataSource {
|
||||||
readProperties.defaultBaseQualities());
|
readProperties.defaultBaseQualities());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* A stopgap measure to handle monolithic sharding
|
|
||||||
* @param shard the (monolithic) shard.
|
|
||||||
* @return An iterator over the monolithic shard.
|
|
||||||
*/
|
|
||||||
private StingSAMIterator seekMonolithic(Shard shard) {
|
|
||||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
|
||||||
|
|
||||||
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
|
||||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
|
|
||||||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
|
||||||
for(SAMReaderID id: getReaderIDs())
|
|
||||||
mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
|
|
||||||
|
|
||||||
return applyDecoratingIterators(shard.getReadMetrics(),
|
|
||||||
shard instanceof ReadShard,
|
|
||||||
readProperties.useOriginalBaseQualities(),
|
|
||||||
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
|
||||||
readProperties.getDownsamplingMethod().toFraction,
|
|
||||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
|
||||||
readProperties.getSupplementalFilters(),
|
|
||||||
readProperties.getBAQCalculationMode(),
|
|
||||||
readProperties.getBAQQualityMode(),
|
|
||||||
readProperties.getRefReader(),
|
|
||||||
readProperties.defaultBaseQualities());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds this read to the given shard.
|
* Adds this read to the given shard.
|
||||||
* @param shard The shard to which to add the read.
|
* @param shard The shard to which to add the read.
|
||||||
|
|
@ -618,7 +576,7 @@ public class SAMDataSource {
|
||||||
* @param read The read to add to the shard.
|
* @param read The read to add to the shard.
|
||||||
*/
|
*/
|
||||||
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
|
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
|
||||||
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
|
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
|
||||||
shard.addRead(read);
|
shard.addRead(read);
|
||||||
readerPositions.put(id,endChunk);
|
readerPositions.put(id,endChunk);
|
||||||
}
|
}
|
||||||
|
|
@ -689,19 +647,6 @@ public class SAMDataSource {
|
||||||
this.maxEntries = maxEntries;
|
this.maxEntries = maxEntries;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Dangerous internal method; retrieves any set of readers, whether in iteration or not.
|
|
||||||
* Used to handle non-exclusive, stateless operations, such as index queries.
|
|
||||||
* @return Any collection of SAMReaders, whether in iteration or not.
|
|
||||||
*/
|
|
||||||
protected SAMReaders getReadersWithoutLocking() {
|
|
||||||
synchronized(this) {
|
|
||||||
if(allResources.size() == 0)
|
|
||||||
createNewResource();
|
|
||||||
}
|
|
||||||
return allResources.get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Choose a set of readers from the pool to use for this query. When complete,
|
* Choose a set of readers from the pool to use for this query. When complete,
|
||||||
* @return
|
* @return
|
||||||
|
|
@ -753,6 +698,11 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
|
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The input streams backing the readers, keyed by reader ID.
|
||||||
|
*/
|
||||||
|
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Derive a new set of readers from the Reads metadata.
|
* Derive a new set of readers from the Reads metadata.
|
||||||
* @param readerIDs reads to load.
|
* @param readerIDs reads to load.
|
||||||
|
|
@ -760,12 +710,20 @@ public class SAMDataSource {
|
||||||
*/
|
*/
|
||||||
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
|
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
|
||||||
for(SAMReaderID readerID: readerIDs) {
|
for(SAMReaderID readerID: readerIDs) {
|
||||||
SAMFileReader reader = new SAMFileReader(readerID.samFile);
|
File indexFile = findIndexFile(readerID.samFile);
|
||||||
|
|
||||||
|
SAMFileReader reader = null;
|
||||||
|
|
||||||
|
if(threadAllocation.getNumIOThreads() > 0) {
|
||||||
|
BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||||
|
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
||||||
|
inputStreams.put(readerID,blockInputStream);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||||
reader.setSAMRecordFactory(factory);
|
reader.setSAMRecordFactory(factory);
|
||||||
|
|
||||||
reader.enableFileSource(true);
|
reader.enableFileSource(true);
|
||||||
reader.enableIndexMemoryMapping(false);
|
|
||||||
if(!enableLowMemorySharding)
|
|
||||||
reader.enableIndexCaching(true);
|
|
||||||
reader.setValidationStringency(validationStringency);
|
reader.setValidationStringency(validationStringency);
|
||||||
|
|
||||||
final SAMFileHeader header = reader.getFileHeader();
|
final SAMFileHeader header = reader.getFileHeader();
|
||||||
|
|
@ -786,6 +744,15 @@ public class SAMDataSource {
|
||||||
return readers.get(id);
|
return readers.get(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieve the input stream backing a reader.
|
||||||
|
* @param id The ID of the reader whose input stream should be retrieved.
|
||||||
|
* @return the input stream associated with the given id.
|
||||||
|
*/
|
||||||
|
public BlockInputStream getInputStream(final SAMReaderID id) {
|
||||||
|
return inputStreams.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Searches for the reader id of this reader.
|
* Searches for the reader id of this reader.
|
||||||
* @param reader Reader for which to search.
|
* @param reader Reader for which to search.
|
||||||
|
|
@ -883,7 +850,7 @@ public class SAMDataSource {
|
||||||
* Filters out reads that do not overlap the current GenomeLoc.
|
* Filters out reads that do not overlap the current GenomeLoc.
|
||||||
* Note the custom implementation: BAM index querying returns all reads that could
|
* Note the custom implementation: BAM index querying returns all reads that could
|
||||||
* possibly overlap the given region (and quite a few extras). In order not to drag
|
* possibly overlap the given region (and quite a few extras). In order not to drag
|
||||||
* down performance, this implementation is highly customized to its task.
|
* down performance, this implementation is highly customized to its task.
|
||||||
*/
|
*/
|
||||||
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||||
/**
|
/**
|
||||||
|
|
@ -903,7 +870,7 @@ public class SAMDataSource {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Custom representation of interval bounds.
|
* Custom representation of interval bounds.
|
||||||
* Makes it simpler to track current position.
|
* Makes it simpler to track current position.
|
||||||
*/
|
*/
|
||||||
private int[] intervalContigIndices;
|
private int[] intervalContigIndices;
|
||||||
private int[] intervalStarts;
|
private int[] intervalStarts;
|
||||||
|
|
@ -941,7 +908,7 @@ public class SAMDataSource {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
advance();
|
advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1070,6 +1037,40 @@ public class SAMDataSource {
|
||||||
|
|
||||||
return indexFile;
|
return indexFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
|
||||||
|
* will be as granular as possible given our current knowledge of the best ways to split up BAM files.
|
||||||
|
* @return An iterable of shards that spans all reads in all BAM files.
|
||||||
|
*/
|
||||||
|
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
|
||||||
|
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
|
||||||
|
return shardBalancer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BAM schedule over all mapped reads in the BAM file, where a 'mapped' read is defined as any
|
||||||
|
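* read that has been assigned an alignment position (NOTE(review): the original sentence was left unfinished; confirm the intended definition).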
|
||||||
|
* @return An iterable of shards covering the mapped reads.
|
||||||
|
*/
|
||||||
|
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
|
||||||
|
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
|
||||||
|
return shardBalancer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a schedule for processing the initialized BAM file using the given interval list.
|
||||||
|
* The returned schedule should be as granular as possible.
|
||||||
|
* @param intervals The list of intervals for which to create the schedule.
|
||||||
|
* @return A granular iterable of shards covering the given intervals.
|
||||||
|
*/
|
||||||
|
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
|
||||||
|
if(intervals == null)
|
||||||
|
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
|
||||||
|
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
|
||||||
|
return shardBalancer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,120 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import net.sf.picard.util.PeekableIterator;
|
||||||
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
|
import net.sf.samtools.GATKChunk;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: mhanna
|
||||||
|
* Date: 10/14/11
|
||||||
|
* Time: 10:47 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
class SAMReaderPosition {
|
||||||
|
private final SAMReaderID reader;
|
||||||
|
private final BlockInputStream inputStream;
|
||||||
|
|
||||||
|
private final List<GATKChunk> positions;
|
||||||
|
private PeekableIterator<GATKChunk> positionIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stores the next block address to read, or -1 if no such block is available.
|
||||||
|
*/
|
||||||
|
private long nextBlockAddress;
|
||||||
|
|
||||||
|
|
||||||
|
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||||
|
this.reader = reader;
|
||||||
|
this.inputStream = inputStream;
|
||||||
|
|
||||||
|
this.positions = fileSpan.getGATKChunks();
|
||||||
|
initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMReaderID getReader() {
|
||||||
|
return reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockInputStream getInputStream() {
|
||||||
|
return inputStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the next block address to be read.
|
||||||
|
* @return Next block address to be read.
|
||||||
|
*/
|
||||||
|
public long getBlockAddress() {
|
||||||
|
return nextBlockAddress;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets the SAM reader position to its original state.
|
||||||
|
*/
|
||||||
|
private void initialize() {
|
||||||
|
this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
|
||||||
|
if(positionIterator.hasNext())
|
||||||
|
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||||
|
else
|
||||||
|
nextBlockAddress = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advances the current position to the next block to read, given the current position in the file.
|
||||||
|
* @param filePosition The current position within the file.
|
||||||
|
*/
|
||||||
|
void advancePosition(final long filePosition) {
|
||||||
|
nextBlockAddress = filePosition;
|
||||||
|
|
||||||
|
// Check the current file position against the iterator; if the iterator is before the current file position,
|
||||||
|
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
||||||
|
try {
|
||||||
|
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
|
||||||
|
positionIterator.next();
|
||||||
|
// Check to see if the iterator has more data available.
|
||||||
|
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
|
||||||
|
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch(Exception ex) {
|
||||||
|
throw new ReviewedStingException("");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) {
|
||||||
|
return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
|
import net.sf.picard.util.PeekableIterator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Balances maximally granular file pointers into shards of reasonable size.
|
||||||
|
*/
|
||||||
|
public abstract class ShardBalancer implements Iterable<Shard> {
|
||||||
|
protected SAMDataSource readsDataSource;
|
||||||
|
protected PeekableIterator<FilePointer> filePointers;
|
||||||
|
protected GenomeLocParser parser;
|
||||||
|
|
||||||
|
public void initialize(final SAMDataSource readsDataSource, final Iterator<FilePointer> filePointers, final GenomeLocParser parser) {
|
||||||
|
this.readsDataSource = readsDataSource;
|
||||||
|
this.filePointers = new PeekableIterator<FilePointer>(filePointers);
|
||||||
|
this.parser = parser;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* User: aaron
|
|
||||||
* Date: Apr 10, 2009
|
|
||||||
* Time: 4:55:37 PM
|
|
||||||
*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author aaron
|
|
||||||
* @version 1.0
|
|
||||||
* @date Apr 10, 2009
|
|
||||||
* <p/>
|
|
||||||
* Interface ShardStrategy
|
|
||||||
* <p/>
|
|
||||||
* The base interface for the sharding strategy; before we had a base abstract
|
|
||||||
* class, but now this will be an interface to accommodate read-based sharding
|
|
||||||
*/
|
|
||||||
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
|
||||||
}
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* User: aaron
|
|
||||||
* Date: Apr 6, 2009
|
|
||||||
* Time: 7:09:22 PM
|
|
||||||
*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author aaron
|
|
||||||
* @version 1.0
|
|
||||||
* @date Apr 6, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class ShardStrategyFactory
|
|
||||||
* <p/>
|
|
||||||
* The Shard Strategy Factory, use this class to create and transfer shard strategies
|
|
||||||
* between different approaches.
|
|
||||||
*/
|
|
||||||
public class ShardStrategyFactory {
|
|
||||||
public enum SHATTER_STRATEGY {
|
|
||||||
MONOLITHIC, // Put all of the available data into one shard.
|
|
||||||
LOCUS_EXPERIMENTAL,
|
|
||||||
READS_EXPERIMENTAL
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* get a new shatter strategy
|
|
||||||
*
|
|
||||||
* @param readsDataSource File pointer to BAM.
|
|
||||||
* @param referenceDataSource File pointer to reference.
|
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
|
||||||
* @param dic the seq dictionary
|
|
||||||
* @param startingSize the starting size
|
|
||||||
* @return a shard strategy capable of dividing input data into shards.
|
|
||||||
*/
|
|
||||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
|
|
||||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* get a new shatter strategy
|
|
||||||
*
|
|
||||||
* @param readsDataSource File pointer to BAM.
|
|
||||||
* @param referenceDataSource File pointer to reference.
|
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
|
||||||
* @param dic the seq dictionary
|
|
||||||
* @param startingSize the starting size
|
|
||||||
* @return a shard strategy capable of dividing input data into shards.
|
|
||||||
*/
|
|
||||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
|
|
||||||
switch (strat) {
|
|
||||||
case LOCUS_EXPERIMENTAL:
|
|
||||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null);
|
|
||||||
case READS_EXPERIMENTAL:
|
|
||||||
return new ReadShardStrategy(genomeLocParser,readsDataSource,null);
|
|
||||||
default:
|
|
||||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* get a new shatter strategy
|
|
||||||
*
|
|
||||||
* @param readsDataSource File pointer to BAM.
|
|
||||||
* @param referenceDataSource File pointer to reference.
|
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
|
||||||
* @param dic the seq dictionary
|
|
||||||
* @param startingSize the starting size
|
|
||||||
* @return a shard strategy capable of dividing input data into shards.
|
|
||||||
*/
|
|
||||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
|
|
||||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* get a new shatter strategy
|
|
||||||
*
|
|
||||||
* @param readsDataSource The reads used to shatter this file.
|
|
||||||
* @param referenceDataSource The reference used to shatter this file.
|
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
|
||||||
* @param dic the seq dictionary
|
|
||||||
* @param startingSize the starting size
|
|
||||||
* @return A strategy for shattering this data.
|
|
||||||
*/
|
|
||||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
|
|
||||||
switch (strat) {
|
|
||||||
case LOCUS_EXPERIMENTAL:
|
|
||||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst);
|
|
||||||
case READS_EXPERIMENTAL:
|
|
||||||
return new ReadShardStrategy(genomeLocParser, readsDataSource,lst);
|
|
||||||
default:
|
|
||||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||||
import org.broadinstitute.sting.commandline.Input;
|
import org.broadinstitute.sting.commandline.Input;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
|
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||||
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
|
||||||
|
|
||||||
// initialize reads
|
// initialize reads
|
||||||
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
|
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
|
||||||
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
|
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
|
||||||
|
|
||||||
// intervals
|
// intervals
|
||||||
GenomeLocSortedSet intervalSortedSet = null;
|
GenomeLocSortedSet intervalSortedSet = null;
|
||||||
|
|
@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
|
||||||
|
|
||||||
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||||
|
|
||||||
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||||
while(sharder.hasNext()) {
|
while(sharder.hasNext()) {
|
||||||
FilePointer filePointer = sharder.next();
|
FilePointer filePointer = sharder.next();
|
||||||
|
|
||||||
|
|
@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
|
||||||
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||||
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
|
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
|
||||||
|
|
||||||
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||||
while(sharder.hasNext()) {
|
while(sharder.hasNext()) {
|
||||||
FilePointer filePointer = sharder.next();
|
FilePointer filePointer = sharder.next();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex;
|
||||||
import net.sf.picard.reference.FastaSequenceIndexBuilder;
|
import net.sf.picard.reference.FastaSequenceIndexBuilder;
|
||||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
import net.sf.picard.sam.CreateSequenceDictionary;
|
import net.sf.picard.sam.CreateSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||||
|
|
@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
||||||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads reference data from fasta file
|
* Loads reference data from fasta file
|
||||||
* Looks for fai and dict files, and tries to create them if they don't exist
|
* Looks for fai and dict files, and tries to create them if they don't exist
|
||||||
*/
|
*/
|
||||||
public class ReferenceDataSource {
|
public class ReferenceDataSource {
|
||||||
private IndexedFastaSequenceFile index;
|
private IndexedFastaSequenceFile reference;
|
||||||
|
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
|
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
|
||||||
|
|
@ -173,7 +185,7 @@ public class ReferenceDataSource {
|
||||||
logger.info("Treating existing index file as complete.");
|
logger.info("Treating existing index file as complete.");
|
||||||
}
|
}
|
||||||
|
|
||||||
index = new CachingIndexedFastaSequenceFile(fastaFile);
|
reference = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||||
|
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
|
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
|
||||||
|
|
@ -192,6 +204,52 @@ public class ReferenceDataSource {
|
||||||
* @return IndexedFastaSequenceFile that was created from file
|
* @return IndexedFastaSequenceFile that was created from file
|
||||||
*/
|
*/
|
||||||
public IndexedFastaSequenceFile getReference() {
|
public IndexedFastaSequenceFile getReference() {
|
||||||
return this.index;
|
return this.reference;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an iterator for processing the entire reference.
|
||||||
|
* @param readsDataSource the reads datasource to embed in the locus shard.
|
||||||
|
* @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||||
|
* @param maxShardSize The maximum shard size which can be used to create this list.
|
||||||
|
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||||
|
*/
|
||||||
|
public Iterable<Shard> createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
|
||||||
|
List<Shard> shards = new ArrayList<Shard>();
|
||||||
|
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
||||||
|
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
||||||
|
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
||||||
|
shards.add(new LocusShard(parser,
|
||||||
|
readsDataSource,
|
||||||
|
Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
|
||||||
|
null));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return shards;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an iterator for processing the entire reference.
|
||||||
|
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||||
|
* @param intervals the list of intervals to use when processing the reference.
|
||||||
|
* @param maxShardSize The maximum shard size which can be used to create this list.
|
||||||
|
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||||
|
*/
|
||||||
|
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
|
||||||
|
List<Shard> shards = new ArrayList<Shard>();
|
||||||
|
for(GenomeLoc interval: intervals) {
|
||||||
|
while(interval.size() > maxShardSize) {
|
||||||
|
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||||
|
readsDataSource,
|
||||||
|
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
|
||||||
|
null));
|
||||||
|
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
||||||
|
}
|
||||||
|
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||||
|
readsDataSource,
|
||||||
|
Collections.singletonList(interval),
|
||||||
|
null));
|
||||||
|
}
|
||||||
|
return shards;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
||||||
|
|
@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
||||||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
|
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
|
||||||
// Fast fail for walkers not supporting TreeReducible interface.
|
// Fast fail for walkers not supporting TreeReducible interface.
|
||||||
if (!( walker instanceof TreeReducible ))
|
if (!( walker instanceof TreeReducible ))
|
||||||
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
|
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
|
|
@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
||||||
* @param walker Computation to perform over dataset.
|
* @param walker Computation to perform over dataset.
|
||||||
* @param shardStrategy A strategy for sharding the data.
|
* @param shardStrategy A strategy for sharding the data.
|
||||||
*/
|
*/
|
||||||
public Object execute(Walker walker, ShardStrategy shardStrategy) {
|
public Object execute(Walker walker, Iterable<Shard> shardStrategy) {
|
||||||
walker.initialize();
|
walker.initialize();
|
||||||
Accumulator accumulator = Accumulator.create(engine,walker);
|
Accumulator accumulator = Accumulator.create(engine,walker);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
|
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||||
import org.broadinstitute.sting.gatk.traversals.*;
|
import org.broadinstitute.sting.gatk.traversals.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
||||||
* @param reads the informations associated with the reads
|
* @param reads the informations associated with the reads
|
||||||
* @param reference the reference file
|
* @param reference the reference file
|
||||||
* @param rods the rods to include in the traversal
|
* @param rods the rods to include in the traversal
|
||||||
* @param nThreadsToUse Number of threads to utilize.
|
* @param threadAllocation How threads are allocated; parallel execution is selected when more than one CPU thread is allocated.
|
||||||
*
|
*
|
||||||
* @return The best-fit microscheduler.
|
* @return The best-fit microscheduler.
|
||||||
*/
|
*/
|
||||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
|
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
|
||||||
if (walker instanceof TreeReducible && nThreadsToUse > 1) {
|
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
|
||||||
if(walker.isReduceByInterval())
|
if(walker.isReduceByInterval())
|
||||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||||
if(walker instanceof ReadWalker)
|
if(walker instanceof ReadWalker)
|
||||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse));
|
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
|
||||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse);
|
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
|
||||||
} else {
|
} else {
|
||||||
if(nThreadsToUse > 1)
|
if(threadAllocation.getNumCPUThreads() > 1)
|
||||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
|
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
|
||||||
}
|
}
|
||||||
|
|
@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
||||||
*
|
*
|
||||||
* @return the return type of the walker
|
* @return the return type of the walker
|
||||||
*/
|
*/
|
||||||
public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
|
public abstract Object execute(Walker walker, Iterable<Shard> shardStrategy);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the object responsible for tracking and managing output.
|
* Retrieves the object responsible for tracking and managing output.
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,9 @@ import org.broad.tribble.annotation.Strand;
|
||||||
import org.broad.tribble.dbsnp.OldDbSNPFeature;
|
import org.broad.tribble.dbsnp.OldDbSNPFeature;
|
||||||
import org.broad.tribble.gelitext.GeliTextFeature;
|
import org.broad.tribble.gelitext.GeliTextFeature;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
|
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -187,30 +186,23 @@ public class VariantContextAdaptors {
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
|
||||||
|
|
||||||
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
|
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
|
||||||
if ( index < 0 )
|
if ( index < 0 )
|
||||||
return null; // we weren't given enough reference context to create the VariantContext
|
return null; // we weren't given enough reference context to create the VariantContext
|
||||||
Byte refBaseForIndel = new Byte(ref.getBases()[index]);
|
Byte refBaseForIndel = new Byte(ref.getBases()[index]);
|
||||||
|
|
||||||
Map<String, Genotype> genotypes = null;
|
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||||
VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel);
|
builder.source(name).id(dbsnp.getRsID());
|
||||||
return vc;
|
builder.loc(dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0));
|
||||||
|
builder.alleles(alleles);
|
||||||
|
builder.referenceBaseForIndel(refBaseForIndel);
|
||||||
|
return builder.make();
|
||||||
} else
|
} else
|
||||||
return null; // can't handle anything else
|
return null; // can't handle anything else
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static VCFHeader createVCFHeader(Set<VCFHeaderLine> hInfo, VariantContext vc) {
|
|
||||||
HashSet<String> names = new LinkedHashSet<String>();
|
|
||||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
|
||||||
names.add(g.getSampleName());
|
|
||||||
}
|
|
||||||
|
|
||||||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// GELI to VariantContext
|
// GELI to VariantContext
|
||||||
|
|
@ -257,20 +249,15 @@ public class VariantContextAdaptors {
|
||||||
else genotypeAlleles.add(refAllele);
|
else genotypeAlleles.add(refAllele);
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, String> attributes = new HashMap<String, String>();
|
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||||
MutableGenotype call = new MutableGenotype(name, genotypeAlleles);
|
Genotype call = new Genotype(name, genotypeAlleles);
|
||||||
|
|
||||||
// set the likelihoods, depth, and RMS mapping quality values
|
|
||||||
//call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.getLikelihoods());
|
|
||||||
//call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaximumMappingQual());
|
|
||||||
//call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.getDepthOfCoverage());
|
|
||||||
|
|
||||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||||
genotypes.add(call);
|
genotypes.add(call);
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
VariantContext vc = VariantContextUtils.toVC(name, ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()), alleles, genotypes, geli.getLODBestToReference(), null, attributes);
|
GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart());
|
||||||
return vc;
|
return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make();
|
||||||
} else
|
} else
|
||||||
return null; // can't handle anything else
|
return null; // can't handle anything else
|
||||||
}
|
}
|
||||||
|
|
@ -329,7 +316,7 @@ public class VariantContextAdaptors {
|
||||||
String[] samples = hapmap.getSampleIDs();
|
String[] samples = hapmap.getSampleIDs();
|
||||||
String[] genotypeStrings = hapmap.getGenotypes();
|
String[] genotypeStrings = hapmap.getGenotypes();
|
||||||
|
|
||||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
|
GenotypesContext genotypes = GenotypesContext.create(samples.length);
|
||||||
for ( int i = 0; i < samples.length; i++ ) {
|
for ( int i = 0; i < samples.length; i++ ) {
|
||||||
// ignore bad genotypes
|
// ignore bad genotypes
|
||||||
if ( genotypeStrings[i].contains("N") )
|
if ( genotypeStrings[i].contains("N") )
|
||||||
|
|
@ -358,16 +345,13 @@ public class VariantContextAdaptors {
|
||||||
}
|
}
|
||||||
|
|
||||||
Genotype g = new Genotype(samples[i], myAlleles);
|
Genotype g = new Genotype(samples[i], myAlleles);
|
||||||
genotypes.put(samples[i], g);
|
genotypes.add(g);
|
||||||
}
|
}
|
||||||
|
|
||||||
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
|
|
||||||
attrs.put(VariantContext.ID_KEY, hapmap.getName());
|
|
||||||
|
|
||||||
long end = hapmap.getEnd();
|
long end = hapmap.getEnd();
|
||||||
if ( deletionLength > 0 )
|
if ( deletionLength > 0 )
|
||||||
end += deletionLength;
|
end += deletionLength;
|
||||||
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel);
|
VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).referenceBaseForIndel(refBaseForIndel).make();
|
||||||
return vc;
|
return vc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,93 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.resourcemanagement;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models how threads are distributed between various components of the GATK.
|
||||||
|
*/
|
||||||
|
public class ThreadAllocation {
|
||||||
|
/**
|
||||||
|
* The number of CPU threads to be used by the GATK.
|
||||||
|
*/
|
||||||
|
private final int numCPUThreads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of threads to devote exclusively to IO. Default is 0.
|
||||||
|
*/
|
||||||
|
private final int numIOThreads;
|
||||||
|
|
||||||
|
public int getNumCPUThreads() {
|
||||||
|
return numCPUThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNumIOThreads() {
|
||||||
|
return numIOThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct the default thread allocation.
|
||||||
|
*/
|
||||||
|
public ThreadAllocation() {
|
||||||
|
this(1,null,null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
|
||||||
|
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
|
||||||
|
* @param totalThreads Complete number of threads to allocate.
|
||||||
|
* @param numCPUThreads Total number of threads allocated to the traversal.
|
||||||
|
* @param numIOThreads Total number of threads allocated exclusively to IO.
|
||||||
|
*/
|
||||||
|
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
|
||||||
|
// If no allocation information is present, allocate all threads to CPU
|
||||||
|
if(numCPUThreads == null && numIOThreads == null) {
|
||||||
|
this.numCPUThreads = totalThreads;
|
||||||
|
this.numIOThreads = 0;
|
||||||
|
}
|
||||||
|
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
|
||||||
|
else if(numIOThreads == null) {
|
||||||
|
if(numCPUThreads > totalThreads)
|
||||||
|
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
|
||||||
|
this.numCPUThreads = numCPUThreads;
|
||||||
|
this.numIOThreads = totalThreads - numCPUThreads;
|
||||||
|
}
|
||||||
|
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
|
||||||
|
else if(numCPUThreads == null) {
|
||||||
|
if(numIOThreads > totalThreads)
|
||||||
|
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
|
||||||
|
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
|
||||||
|
this.numIOThreads = numIOThreads;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if(numCPUThreads + numIOThreads != totalThreads)
|
||||||
|
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
|
||||||
|
this.numCPUThreads = numCPUThreads;
|
||||||
|
this.numIOThreads = numIOThreads;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -54,18 +55,18 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
||||||
|
|
||||||
if ( !vc.isBiallelic() )
|
if ( !vc.isBiallelic() )
|
||||||
return null;
|
return null;
|
||||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
final GenotypesContext genotypes = vc.getGenotypes();
|
||||||
if ( !vc.hasGenotypes() )
|
if ( !vc.hasGenotypes() )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
double ratio = 0.0;
|
double ratio = 0.0;
|
||||||
double totalWeights = 0.0;
|
double totalWeights = 0.0;
|
||||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
for ( Genotype genotype : genotypes ) {
|
||||||
// we care only about het calls
|
// we care only about het calls
|
||||||
if ( !genotype.getValue().isHet() )
|
if ( !genotype.isHet() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||||
if ( context == null )
|
if ( context == null )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
@ -84,8 +85,8 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
|
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
|
||||||
ratio += genotype.getValue().getNegLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
||||||
totalWeights += genotype.getValue().getNegLog10PError();
|
totalWeights += genotype.getLog10PError();
|
||||||
} else if ( vc.isIndel() && context.hasExtendedEventPileup() ) {
|
} else if ( vc.isIndel() && context.hasExtendedEventPileup() ) {
|
||||||
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
||||||
if ( indelPileup == null ) {
|
if ( indelPileup == null ) {
|
||||||
|
|
|
||||||
|
|
@ -59,10 +59,8 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
if ( ! vc.hasGenotypes() )
|
if ( ! vc.hasGenotypes() )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
Map<String, Object> map = new HashMap<String, Object>();
|
return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true);
|
||||||
VariantContextUtils.calculateChromosomeCounts(vc, map, true);
|
|
||||||
return map;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getKeyNames() {
|
public List<String> getKeyNames() {
|
||||||
|
|
|
||||||
|
|
@ -89,9 +89,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
||||||
|
|
||||||
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
||||||
if (haplotypes != null) {
|
if (haplotypes != null) {
|
||||||
final Set<Map.Entry<String, Genotype>> genotypes = vc.getGenotypes().entrySet();
|
for ( final Genotype genotype : vc.getGenotypes()) {
|
||||||
for ( final Map.Entry<String, Genotype> genotype : genotypes ) {
|
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
|
||||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getKey());
|
|
||||||
if ( thisContext != null ) {
|
if ( thisContext != null ) {
|
||||||
final ReadBackedPileup thisPileup;
|
final ReadBackedPileup thisPileup;
|
||||||
if (thisContext.hasExtendedEventPileup())
|
if (thisContext.hasExtendedEventPileup())
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -26,20 +27,18 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
||||||
|
|
||||||
private static final int MIN_SAMPLES = 10;
|
private static final int MIN_SAMPLES = 10;
|
||||||
private static final int MIN_GENOTYPE_QUALITY = 10;
|
private static final int MIN_GENOTYPE_QUALITY = 10;
|
||||||
private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
||||||
|
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
||||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
final GenotypesContext genotypes = vc.getGenotypes();
|
||||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
int refCount = 0;
|
int refCount = 0;
|
||||||
int hetCount = 0;
|
int hetCount = 0;
|
||||||
int homCount = 0;
|
int homCount = 0;
|
||||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
for ( final Genotype g : genotypes ) {
|
||||||
Genotype g = genotype.getValue();
|
|
||||||
|
|
||||||
if ( g.isNoCall() )
|
if ( g.isNoCall() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
@ -47,7 +46,7 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
||||||
// Right now we just ignore genotypes that are not confident, but this throws off
|
// Right now we just ignore genotypes that are not confident, but this throws off
|
||||||
// our HW ratios. More analysis is needed to determine the right thing to do when
|
// our HW ratios. More analysis is needed to determine the right thing to do when
|
||||||
// the genotyper cannot decide whether a given sample is het or hom var.
|
// the genotyper cannot decide whether a given sample is het or hom var.
|
||||||
if ( g.getNegLog10PError() < MIN_NEG_LOG10_PERROR )
|
if ( g.getLog10PError() > MIN_LOG10_PERROR )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if ( g.isHomRef() )
|
if ( g.isHomRef() )
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -32,7 +33,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
||||||
|
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
||||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
final GenotypesContext genotypes = vc.getGenotypes();
|
||||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
|
@ -51,8 +52,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
||||||
double hetCount = 0.0;
|
double hetCount = 0.0;
|
||||||
double homCount = 0.0;
|
double homCount = 0.0;
|
||||||
int N = 0; // number of samples that have likelihoods
|
int N = 0; // number of samples that have likelihoods
|
||||||
for ( final Map.Entry<String, Genotype> genotypeMap : genotypes.entrySet() ) {
|
for ( final Genotype g : genotypes ) {
|
||||||
Genotype g = genotypeMap.getValue();
|
|
||||||
if ( g.isNoCall() || !g.hasLikelihoods() )
|
if ( g.isNoCall() || !g.hasLikelihoods() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -28,19 +29,19 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
||||||
if ( stratifiedContexts.size() == 0 )
|
if ( stratifiedContexts.size() == 0 )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
final GenotypesContext genotypes = vc.getGenotypes();
|
||||||
if ( genotypes == null || genotypes.size() == 0 )
|
if ( genotypes == null || genotypes.size() == 0 )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
|
|
||||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
for ( final Genotype genotype : genotypes ) {
|
||||||
|
|
||||||
// we care only about variant calls with likelihoods
|
// we care only about variant calls with likelihoods
|
||||||
if ( genotype.getValue().isHomRef() )
|
if ( genotype.isHomRef() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||||
if ( context == null )
|
if ( context == null )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
@ -50,7 +51,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
||||||
if ( depth == 0 )
|
if ( depth == 0 )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
double QD = 10.0 * vc.getNegLog10PError() / (double)depth;
|
double QD = -10.0 * vc.getLog10PError() / (double)depth;
|
||||||
|
|
||||||
Map<String, Object> map = new HashMap<String, Object>();
|
Map<String, Object> map = new HashMap<String, Object>();
|
||||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -32,7 +33,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
||||||
if ( stratifiedContexts.size() == 0 )
|
if ( stratifiedContexts.size() == 0 )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
final GenotypesContext genotypes = vc.getGenotypes();
|
||||||
if ( genotypes == null || genotypes.size() == 0 )
|
if ( genotypes == null || genotypes.size() == 0 )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
|
@ -42,8 +43,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
||||||
|
|
||||||
if (vc.isSNP() && vc.isBiallelic()) {
|
if (vc.isSNP() && vc.isBiallelic()) {
|
||||||
// todo - no current support for multiallelic snps
|
// todo - no current support for multiallelic snps
|
||||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||||
if ( context == null ) {
|
if ( context == null ) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -52,8 +53,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
||||||
}
|
}
|
||||||
else if (vc.isIndel() || vc.isMixed()) {
|
else if (vc.isIndel() || vc.isMixed()) {
|
||||||
|
|
||||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||||
if ( context == null ) {
|
if ( context == null ) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,11 +47,11 @@ import java.util.Map;
|
||||||
public class SampleList extends InfoFieldAnnotation {
|
public class SampleList extends InfoFieldAnnotation {
|
||||||
|
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
if ( vc.isMonomorphic() || !vc.hasGenotypes() )
|
if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
StringBuffer samples = new StringBuffer();
|
StringBuffer samples = new StringBuffer();
|
||||||
for ( Genotype genotype : vc.getGenotypesSortedByName() ) {
|
for ( Genotype genotype : vc.getGenotypesOrderedByName() ) {
|
||||||
if ( genotype.isCalled() && !genotype.isHomRef() ){
|
if ( genotype.isCalled() && !genotype.isHomRef() ){
|
||||||
if ( samples.length() > 0 )
|
if ( samples.length() > 0 )
|
||||||
samples.append(",");
|
samples.append(",");
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,9 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -162,11 +164,10 @@ public class VariantAnnotatorEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
||||||
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
|
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
|
||||||
|
|
||||||
// annotate db occurrences
|
// annotate db occurrences
|
||||||
annotateDBs(tracker, ref, vc, infoAnnotations);
|
vc = annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||||
|
|
||||||
// annotate expressions where available
|
// annotate expressions where available
|
||||||
annotateExpressions(tracker, ref, infoAnnotations);
|
annotateExpressions(tracker, ref, infoAnnotations);
|
||||||
|
|
@ -179,20 +180,20 @@ public class VariantAnnotatorEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
// generate a new annotated VC
|
// generate a new annotated VC
|
||||||
final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations);
|
VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
|
||||||
|
|
||||||
// annotate genotypes, creating another new VC in the process
|
// annotate genotypes, creating another new VC in the process
|
||||||
return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc));
|
return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||||
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
||||||
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
||||||
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||||
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
|
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
|
||||||
// annotate dbsnp id if available and not already there
|
// annotate dbsnp id if available and not already there
|
||||||
if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )
|
if ( rsID != null && vc.emptyID() )
|
||||||
infoAnnotations.put(VariantContext.ID_KEY, rsID);
|
vc = new VariantContextBuilder(vc).id(rsID).make();
|
||||||
} else {
|
} else {
|
||||||
boolean overlapsComp = false;
|
boolean overlapsComp = false;
|
||||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
||||||
|
|
@ -204,6 +205,8 @@ public class VariantAnnotatorEngine {
|
||||||
infoAnnotations.put(dbSet.getValue(), overlapsComp);
|
infoAnnotations.put(dbSet.getValue(), overlapsComp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return vc;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
|
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
|
||||||
|
|
@ -223,16 +226,15 @@ public class VariantAnnotatorEngine {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Map<String, Genotype> annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
if ( requestedGenotypeAnnotations.size() == 0 )
|
if ( requestedGenotypeAnnotations.size() == 0 )
|
||||||
return vc.getGenotypes();
|
return vc.getGenotypes();
|
||||||
|
|
||||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc.getNSamples());
|
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
|
||||||
for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
|
for ( final Genotype genotype : vc.getGenotypes() ) {
|
||||||
Genotype genotype = g.getValue();
|
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||||
AlignmentContext context = stratifiedContexts.get(g.getKey());
|
|
||||||
if ( context == null ) {
|
if ( context == null ) {
|
||||||
genotypes.put(g.getKey(), genotype);
|
genotypes.add(genotype);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -242,7 +244,7 @@ public class VariantAnnotatorEngine {
|
||||||
if ( result != null )
|
if ( result != null )
|
||||||
genotypeAnnotations.putAll(result);
|
genotypeAnnotations.putAll(result);
|
||||||
}
|
}
|
||||||
genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return genotypes;
|
return genotypes;
|
||||||
|
|
|
||||||
|
|
@ -36,10 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -125,7 +122,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
protected static String line = null;
|
protected static String line = null;
|
||||||
|
|
||||||
private final double MIN_PROB_ERROR = 0.000001;
|
private final double MIN_PROB_ERROR = 0.000001;
|
||||||
private final double MAX_GENOTYPE_QUALITY = 6.0;
|
private final double MAX_GENOTYPE_QUALITY = -6.0;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
|
|
@ -181,8 +178,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
// ignore places where we don't have a variant
|
// ignore places where we don't have a variant
|
||||||
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
||||||
{
|
{
|
||||||
vcfWriter.add(vc_input);
|
vcfWriter.add(vc_input);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -190,8 +187,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
byte refByte = ref.getBase();
|
byte refByte = ref.getBase();
|
||||||
|
|
||||||
// make new Genotypes based on Beagle results
|
// make new Genotypes based on Beagle results
|
||||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
|
GenotypesContext genotypes = GenotypesContext.create(vc_input.getGenotypes().size());
|
||||||
|
|
||||||
|
|
||||||
// for each genotype, create a new object with Beagle information on it
|
// for each genotype, create a new object with Beagle information on it
|
||||||
|
|
||||||
|
|
@ -200,15 +196,13 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
Double alleleFrequencyH = 0.0;
|
Double alleleFrequencyH = 0.0;
|
||||||
int beagleVarCounts = 0;
|
int beagleVarCounts = 0;
|
||||||
|
|
||||||
Map<String,Genotype> hapmapGenotypes = null;
|
GenotypesContext hapmapGenotypes = null;
|
||||||
|
|
||||||
if (vc_comp != null) {
|
if (vc_comp != null) {
|
||||||
hapmapGenotypes = vc_comp.getGenotypes();
|
hapmapGenotypes = vc_comp.getGenotypes();
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
|
for ( final Genotype g : vc_input.getGenotypes() ) {
|
||||||
|
|
||||||
Genotype g = originalGenotypes.getValue();
|
|
||||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||||
|
|
||||||
boolean genotypeIsPhased = true;
|
boolean genotypeIsPhased = true;
|
||||||
|
|
@ -218,7 +212,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
// use sample as key into genotypes structure
|
// use sample as key into genotypes structure
|
||||||
if (vc_comp != null) {
|
if (vc_comp != null) {
|
||||||
|
|
||||||
if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) {
|
if (vc_input.getGenotypes().containsSample(sample) && hapmapGenotypes.containsSample(sample)) {
|
||||||
|
|
||||||
Genotype hapmapGenotype = hapmapGenotypes.get(sample);
|
Genotype hapmapGenotype = hapmapGenotypes.get(sample);
|
||||||
if (hapmapGenotype.isCalled()){
|
if (hapmapGenotype.isCalled()){
|
||||||
|
|
@ -255,9 +249,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
Allele bglAlleleA, bglAlleleB;
|
Allele bglAlleleA, bglAlleleB;
|
||||||
|
|
||||||
if (alleleA.matches(refString))
|
if (alleleA.matches(refString))
|
||||||
bglAlleleA = Allele.create(alleleA,true);
|
bglAlleleA = Allele.create(alleleA,true);
|
||||||
else
|
else
|
||||||
bglAlleleA = Allele.create(alleleA,false);
|
bglAlleleA = Allele.create(alleleA,false);
|
||||||
|
|
||||||
if (alleleB.matches(refString))
|
if (alleleB.matches(refString))
|
||||||
bglAlleleB = Allele.create(alleleB,true);
|
bglAlleleB = Allele.create(alleleB,true);
|
||||||
|
|
@ -286,7 +280,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
// deal with numerical errors coming from limited formatting value on Beagle output files
|
// deal with numerical errors coming from limited formatting value on Beagle output files
|
||||||
if (probWrongGenotype > 1 - MIN_PROB_ERROR)
|
if (probWrongGenotype > 1 - MIN_PROB_ERROR)
|
||||||
probWrongGenotype = 1 - MIN_PROB_ERROR;
|
probWrongGenotype = 1 - MIN_PROB_ERROR;
|
||||||
|
|
||||||
if (1-probWrongGenotype < noCallThreshold) {
|
if (1-probWrongGenotype < noCallThreshold) {
|
||||||
// quality is bad: don't call genotype
|
// quality is bad: don't call genotype
|
||||||
alleles.clear();
|
alleles.clear();
|
||||||
|
|
@ -298,7 +292,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
if (probWrongGenotype < MIN_PROB_ERROR)
|
if (probWrongGenotype < MIN_PROB_ERROR)
|
||||||
genotypeQuality = MAX_GENOTYPE_QUALITY;
|
genotypeQuality = MAX_GENOTYPE_QUALITY;
|
||||||
else
|
else
|
||||||
genotypeQuality = -log10(probWrongGenotype);
|
genotypeQuality = log10(probWrongGenotype);
|
||||||
|
|
||||||
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
|
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
|
||||||
|
|
||||||
|
|
@ -329,47 +323,40 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
||||||
else {
|
else {
|
||||||
originalAttributes.put("OG",".");
|
originalAttributes.put("OG",".");
|
||||||
}
|
}
|
||||||
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||||
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
|
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
|
||||||
beagleVarCounts++;
|
beagleVarCounts++;
|
||||||
}
|
}
|
||||||
|
|
||||||
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
|
genotypes.add(imputedGenotype);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
VariantContext filteredVC;
|
final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes);
|
||||||
if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
|
if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) {
|
||||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
|
|
||||||
else {
|
|
||||||
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
|
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
|
||||||
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
|
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
|
||||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
|
builder.alleles(new HashSet<Allele>(Arrays.asList(vc_input.getReference()))).filters(removedFilters);
|
||||||
}
|
}
|
||||||
|
|
||||||
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
|
|
||||||
// re-compute chromosome counts
|
// re-compute chromosome counts
|
||||||
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
|
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||||
|
|
||||||
// Get Hapmap AC and AF
|
// Get Hapmap AC and AF
|
||||||
if (vc_comp != null) {
|
if (vc_comp != null) {
|
||||||
attributes.put("ACH", alleleCountH.toString() );
|
builder.attribute("ACH", alleleCountH.toString() );
|
||||||
attributes.put("ANH", chrCountH.toString() );
|
builder.attribute("ANH", chrCountH.toString() );
|
||||||
attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
builder.attribute("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
builder.attribute("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
||||||
if( !beagleR2Feature.getR2value().equals(Double.NaN) ) {
|
if( !beagleR2Feature.getR2value().equals(Double.NaN) ) {
|
||||||
attributes.put("R2", beagleR2Feature.getR2value().toString() );
|
builder.attribute("R2", beagleR2Feature.getR2value().toString() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vcfWriter.add(builder.make());
|
||||||
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes));
|
|
||||||
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
|
||||||
|
|
@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -204,7 +201,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
|
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
|
||||||
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
|
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
|
||||||
if ( bootstrapVCFOutput != null ) {
|
if ( bootstrapVCFOutput != null ) {
|
||||||
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER));
|
bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
|
||||||
}
|
}
|
||||||
bootstrapSetSize++;
|
bootstrapSetSize++;
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -245,18 +242,18 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
if ( markers != null ) markers.append("\n");
|
if ( markers != null ) markers.append("\n");
|
||||||
|
|
||||||
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
|
||||||
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||||
for ( String sample : samples ) {
|
for ( String sample : samples ) {
|
||||||
boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
|
boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
|
||||||
|
|
||||||
Genotype genotype;
|
Genotype genotype;
|
||||||
boolean isValidation;
|
boolean isValidation;
|
||||||
// use sample as key into genotypes structure
|
// use sample as key into genotypes structure
|
||||||
if ( preferredGenotypes.keySet().contains(sample) ) {
|
if ( preferredGenotypes.containsSample(sample) ) {
|
||||||
genotype = preferredGenotypes.get(sample);
|
genotype = preferredGenotypes.get(sample);
|
||||||
isValidation = isValidationSite;
|
isValidation = isValidationSite;
|
||||||
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
|
} else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) {
|
||||||
genotype = otherGenotypes.get(sample);
|
genotype = otherGenotypes.get(sample);
|
||||||
isValidation = ! isValidationSite;
|
isValidation = ! isValidationSite;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.diffengine;
|
package org.broadinstitute.sting.gatk.walkers.diffengine;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import org.broad.tribble.readers.AsciiLineReader;
|
import org.broad.tribble.readers.AsciiLineReader;
|
||||||
import org.broad.tribble.readers.LineReader;
|
import org.broad.tribble.readers.LineReader;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
|
|
@ -46,6 +47,8 @@ import java.util.Map;
|
||||||
* Class implementing diffnode reader for VCF
|
* Class implementing diffnode reader for VCF
|
||||||
*/
|
*/
|
||||||
public class VCFDiffableReader implements DiffableReader {
|
public class VCFDiffableReader implements DiffableReader {
|
||||||
|
private static Logger logger = Logger.getLogger(VCFDiffableReader.class);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getName() { return "VCF"; }
|
public String getName() { return "VCF"; }
|
||||||
|
|
||||||
|
|
@ -68,7 +71,10 @@ public class VCFDiffableReader implements DiffableReader {
|
||||||
String key = headerLine.getKey();
|
String key = headerLine.getKey();
|
||||||
if ( headerLine instanceof VCFNamedHeaderLine )
|
if ( headerLine instanceof VCFNamedHeaderLine )
|
||||||
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
||||||
root.add(key, headerLine.toString());
|
if ( root.hasElement(key) )
|
||||||
|
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
||||||
|
else
|
||||||
|
root.add(key, headerLine.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
String line = lineReader.readLine();
|
String line = lineReader.readLine();
|
||||||
|
|
@ -90,22 +96,22 @@ public class VCFDiffableReader implements DiffableReader {
|
||||||
// add fields
|
// add fields
|
||||||
vcRoot.add("CHROM", vc.getChr());
|
vcRoot.add("CHROM", vc.getChr());
|
||||||
vcRoot.add("POS", vc.getStart());
|
vcRoot.add("POS", vc.getStart());
|
||||||
vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4);
|
vcRoot.add("ID", vc.getID());
|
||||||
vcRoot.add("REF", vc.getReference());
|
vcRoot.add("REF", vc.getReference());
|
||||||
vcRoot.add("ALT", vc.getAlternateAlleles());
|
vcRoot.add("ALT", vc.getAlternateAlleles());
|
||||||
vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4);
|
vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4);
|
||||||
vcRoot.add("FILTER", vc.getFilters());
|
vcRoot.add("FILTER", vc.getFilters());
|
||||||
|
|
||||||
// add info fields
|
// add info fields
|
||||||
for (Map.Entry<String, Object> attribute : vc.getAttributes().entrySet()) {
|
for (Map.Entry<String, Object> attribute : vc.getAttributes().entrySet()) {
|
||||||
if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY))
|
if ( ! attribute.getKey().startsWith("_") )
|
||||||
vcRoot.add(attribute.getKey(), attribute.getValue());
|
vcRoot.add(attribute.getKey(), attribute.getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Genotype g : vc.getGenotypes().values() ) {
|
for (Genotype g : vc.getGenotypes() ) {
|
||||||
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
|
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
|
||||||
gRoot.add("GT", g.getGenotypeString());
|
gRoot.add("GT", g.getGenotypeString());
|
||||||
gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 );
|
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 );
|
||||||
|
|
||||||
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
|
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
|
||||||
if ( ! attribute.getKey().startsWith("_") )
|
if ( ! attribute.getKey().startsWith("_") )
|
||||||
|
|
|
||||||
|
|
@ -36,9 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -224,7 +222,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||||
filters.add(MASK_NAME);
|
filters.add(MASK_NAME);
|
||||||
vc = VariantContext.modifyFilters(vc, filters);
|
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
||||||
|
|
@ -267,7 +265,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||||
filters.add(MASK_NAME);
|
filters.add(MASK_NAME);
|
||||||
vc = VariantContext.modifyFilters(vc, filters);
|
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
return vc;
|
return vc;
|
||||||
|
|
@ -279,20 +277,15 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||||
if ( context == null )
|
if ( context == null )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
VariantContext vc = context.getVariantContext();
|
final VariantContext vc = context.getVariantContext();
|
||||||
|
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||||
|
|
||||||
// make new Genotypes based on filters
|
// make new Genotypes based on filters
|
||||||
Map<String, Genotype> genotypes;
|
if ( genotypeFilterExps.size() > 0 ) {
|
||||||
if ( genotypeFilterExps.size() == 0 ) {
|
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
||||||
genotypes = null;
|
|
||||||
} else {
|
|
||||||
genotypes = new HashMap<String, Genotype>(vc.getGenotypes().size());
|
|
||||||
|
|
||||||
// for each genotype, check filters then create a new object
|
// for each genotype, check filters then create a new object
|
||||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
|
||||||
Genotype g = genotype.getValue();
|
|
||||||
|
|
||||||
if ( g.isCalled() ) {
|
if ( g.isCalled() ) {
|
||||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||||
|
|
||||||
|
|
@ -300,11 +293,13 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||||
if ( VariantContextUtils.match(vc, g, exp) )
|
if ( VariantContextUtils.match(vc, g, exp) )
|
||||||
filters.add(exp.name);
|
filters.add(exp.name);
|
||||||
}
|
}
|
||||||
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
||||||
} else {
|
} else {
|
||||||
genotypes.put(genotype.getKey(), g);
|
genotypes.add(g);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
builder.genotypes(genotypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// make a new variant context based on filters
|
// make a new variant context based on filters
|
||||||
|
|
@ -324,14 +319,9 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||||
filters.add(exp.name);
|
filters.add(exp.name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
builder.filters(filters);
|
||||||
|
|
||||||
VariantContext filteredVC;
|
writer.add(builder.make());
|
||||||
if ( genotypes == null )
|
|
||||||
filteredVC = VariantContext.modifyFilters(vc, filters);
|
|
||||||
else
|
|
||||||
filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
|
|
||||||
|
|
||||||
writer.add(filteredVC);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
|
|
||||||
|
|
@ -26,16 +26,12 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -47,8 +43,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
public enum Model {
|
public enum Model {
|
||||||
/** The default model with the best performance in all cases */
|
/** The default model with the best performance in all cases */
|
||||||
EXACT,
|
EXACT,
|
||||||
/** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */
|
|
||||||
GRID_SEARCH
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int N;
|
protected int N;
|
||||||
|
|
@ -73,7 +67,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
* @param log10AlleleFrequencyPriors priors
|
* @param log10AlleleFrequencyPriors priors
|
||||||
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
||||||
*/
|
*/
|
||||||
protected abstract void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> Alleles,
|
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
|
||||||
double[] log10AlleleFrequencyPriors,
|
double[] log10AlleleFrequencyPriors,
|
||||||
double[] log10AlleleFrequencyPosteriors);
|
double[] log10AlleleFrequencyPosteriors);
|
||||||
|
|
||||||
|
|
@ -85,7 +79,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
*
|
*
|
||||||
* @return calls
|
* @return calls
|
||||||
*/
|
*/
|
||||||
protected abstract Map<String, Genotype> assignGenotypes(VariantContext vc,
|
protected abstract GenotypesContext assignGenotypes(VariantContext vc,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int AFofMaxLikelihood);
|
int AFofMaxLikelihood);
|
||||||
}
|
}
|
||||||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
* Time: 6:46:09 PM
|
* Time: 6:46:09 PM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
enum DiploidGenotype {
|
public enum DiploidGenotype {
|
||||||
AA ('A', 'A'),
|
AA ('A', 'A'),
|
||||||
AC ('A', 'C'),
|
AC ('A', 'C'),
|
||||||
AG ('A', 'G'),
|
AG ('A', 'G'),
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
import net.sf.samtools.SAMUtils;
|
import net.sf.samtools.SAMUtils;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
||||||
|
|
||||||
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||||
byte obsBase = elt.getBase();
|
byte obsBase = elt.getBase();
|
||||||
|
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||||
|
|
||||||
if ( elt.isReducedRead() ) {
|
if ( elt.isReducedRead() ) {
|
||||||
// reduced read representation
|
// reduced read representation
|
||||||
byte qual = elt.getQual();
|
if ( BaseUtils.isRegularBase( obsBase )) {
|
||||||
if ( BaseUtils.isRegularBase( elt.getBase() )) {
|
|
||||||
add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
|
add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
|
||||||
return elt.getRepresentativeCount(); // we added nObs bases here
|
return elt.getRepresentativeCount(); // we added nObs bases here
|
||||||
} else // odd bases or deletions => don't use them
|
}
|
||||||
return 0;
|
|
||||||
} else {
|
// odd bases or deletions => don't use them
|
||||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
return 0;
|
||||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||||
|
|
@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) {
|
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
|
||||||
return 0;
|
return 0;
|
||||||
} else {
|
|
||||||
byte qual = p.getQual();
|
|
||||||
|
|
||||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
byte qual = p.getQual();
|
||||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
|
||||||
if ( capBaseQualsAtMappingQual )
|
|
||||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
|
||||||
if ( (int)qual < minBaseQual )
|
|
||||||
qual = (byte)0;
|
|
||||||
|
|
||||||
return qual;
|
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||||
}
|
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||||
|
if ( capBaseQualsAtMappingQual )
|
||||||
|
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||||
|
if ( (int)qual < minBaseQual )
|
||||||
|
qual = (byte)0;
|
||||||
|
|
||||||
|
return qual;
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -26,14 +26,10 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -46,12 +42,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||||
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||||
|
private final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||||
|
|
||||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||||
super(UAC, N, logger, verboseWriter);
|
super(UAC, N, logger, verboseWriter);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
|
||||||
double[] log10AlleleFrequencyPriors,
|
double[] log10AlleleFrequencyPriors,
|
||||||
double[] log10AlleleFrequencyPosteriors) {
|
double[] log10AlleleFrequencyPosteriors) {
|
||||||
final int numAlleles = alleles.size();
|
final int numAlleles = alleles.size();
|
||||||
|
|
@ -95,11 +92,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||||
|
|
||||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||||
for ( Genotype sample : GLs.values() ) {
|
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||||
if ( sample.hasLikelihoods() ) {
|
if ( sample.hasLikelihoods() ) {
|
||||||
double[] gls = sample.getLikelihoods().getAsVector();
|
double[] gls = sample.getLikelihoods().getAsVector();
|
||||||
|
|
||||||
|
|
@ -155,7 +152,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public int linearExact(Map<String, Genotype> GLs,
|
public int linearExact(GenotypesContext GLs,
|
||||||
double[] log10AlleleFrequencyPriors,
|
double[] log10AlleleFrequencyPriors,
|
||||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||||
|
|
@ -268,14 +265,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
*
|
*
|
||||||
* @return calls
|
* @return calls
|
||||||
*/
|
*/
|
||||||
public Map<String, Genotype> assignGenotypes(VariantContext vc,
|
public GenotypesContext assignGenotypes(VariantContext vc,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int AFofMaxLikelihood) {
|
int AFofMaxLikelihood) {
|
||||||
if ( !vc.isVariant() )
|
if ( !vc.isVariant() )
|
||||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||||
|
|
||||||
|
|
||||||
Map<String, Genotype> GLs = vc.getGenotypes();
|
GenotypesContext GLs = vc.getGenotypes();
|
||||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||||
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
||||||
|
|
||||||
|
|
@ -291,16 +288,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||||
sampleIndices.addAll(GLs.keySet());
|
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
|
||||||
sampleIdx = GLs.size();
|
sampleIdx = GLs.size();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||||
if ( !sample.getValue().hasLikelihoods() )
|
if ( !genotype.hasLikelihoods() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||||
|
|
||||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||||
//System.out.print(sample.getKey()+":");
|
//System.out.print(sample.getKey()+":");
|
||||||
|
|
@ -312,7 +309,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
sampleIndices.add(sample.getKey());
|
sampleIndices.add(genotype.getSampleName());
|
||||||
|
|
||||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||||
|
|
||||||
|
|
@ -342,7 +339,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
GenotypesContext calls = GenotypesContext.create();
|
||||||
|
|
||||||
int startIdx = AFofMaxLikelihood;
|
int startIdx = AFofMaxLikelihood;
|
||||||
for (int k = sampleIdx; k > 0; k--) {
|
for (int k = sampleIdx; k > 0; k--) {
|
||||||
|
|
@ -355,11 +352,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
// and will add no-call genotype to GL's in a second pass
|
// and will add no-call genotype to GL's in a second pass
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||||
|
|
||||||
double qual = Double.NEGATIVE_INFINITY;
|
|
||||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||||
|
|
||||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int newIdx = tracebackArray[k][startIdx];;
|
int newIdx = tracebackArray[k][startIdx];;
|
||||||
|
|
@ -367,20 +363,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
startIdx = newIdx;
|
startIdx = newIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* System.out.format("Sample: %s GL:",sample);
|
|
||||||
for (int i=0; i < likelihoods.length; i++)
|
|
||||||
System.out.format("%1.4f, ",likelihoods[i]);
|
|
||||||
*/
|
|
||||||
|
|
||||||
for (int i=0; i < likelihoods.length; i++) {
|
|
||||||
if (i==bestGTguess)
|
|
||||||
continue;
|
|
||||||
if (likelihoods[i] >= qual)
|
|
||||||
qual = likelihoods[i];
|
|
||||||
}
|
|
||||||
// qual contains now max(likelihoods[k]) for all k != bestGTguess
|
|
||||||
qual = likelihoods[bestGTguess] - qual;
|
|
||||||
|
|
||||||
// likelihoods are stored row-wise in lower triangular matrix. IE
|
// likelihoods are stored row-wise in lower triangular matrix. IE
|
||||||
// for 2 alleles they have ordering AA,AB,BB
|
// for 2 alleles they have ordering AA,AB,BB
|
||||||
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
|
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
|
||||||
|
|
@ -408,37 +390,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (qual < 0) {
|
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
|
||||||
// QUAL can be negative if the chosen genotype is not the most likely one individually.
|
|
||||||
// In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on
|
|
||||||
double[] normalized = MathUtils.normalizeFromLog10(likelihoods);
|
|
||||||
double chosenGenotype = normalized[bestGTguess];
|
|
||||||
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
|
|
||||||
}
|
|
||||||
//System.out.println(myAlleles.toString());
|
//System.out.println(myAlleles.toString());
|
||||||
calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||||
|
if ( !genotype.hasLikelihoods() )
|
||||||
if ( !sample.getValue().hasLikelihoods() )
|
|
||||||
continue;
|
continue;
|
||||||
Genotype g = GLs.get(sample.getKey());
|
|
||||||
|
|
||||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
final Genotype g = GLs.get(genotype.getSampleName());
|
||||||
|
final double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||||
|
|
||||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||||
continue; // regular likelihoods
|
continue; // regular likelihoods
|
||||||
|
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
final double qual = Genotype.NO_LOG10_PERROR;
|
||||||
|
calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false));
|
||||||
double qual = Genotype.NO_NEG_LOG_10PERROR;
|
|
||||||
myAlleles.add(Allele.NO_CALL);
|
|
||||||
myAlleles.add(Allele.NO_CALL);
|
|
||||||
//System.out.println(myAlleles.toString());
|
|
||||||
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return calls;
|
return calls;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,271 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
||||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
|
||||||
|
|
||||||
// for use in optimizing the P(D|AF) calculations:
|
|
||||||
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
|
||||||
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
|
||||||
|
|
||||||
private AlleleFrequencyMatrix AFMatrix;
|
|
||||||
|
|
||||||
protected GridSearchAFEstimation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
|
||||||
super(UAC, N, logger, verboseWriter);
|
|
||||||
AFMatrix = new AlleleFrequencyMatrix(N);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
|
||||||
double[] log10AlleleFrequencyPriors,
|
|
||||||
double[] log10AlleleFrequencyPosteriors) {
|
|
||||||
initializeAFMatrix(GLs);
|
|
||||||
|
|
||||||
// first, calculate for AF=0 (no change to matrix)
|
|
||||||
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
|
|
||||||
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
|
|
||||||
|
|
||||||
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
|
|
||||||
|
|
||||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
|
||||||
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
|
|
||||||
// add one more alternate allele
|
|
||||||
AFMatrix.incrementFrequency();
|
|
||||||
|
|
||||||
// calculate new likelihoods
|
|
||||||
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
|
|
||||||
|
|
||||||
// an optimization to speed up the calculation: if we are beyond the local maximum such
|
|
||||||
// that subsequent likelihoods won't factor into the confidence score, just quit
|
|
||||||
if ( maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON )
|
|
||||||
return;
|
|
||||||
|
|
||||||
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
|
|
||||||
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Overrides the super class
|
|
||||||
* @param vc variant context with genotype likelihoods
|
|
||||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
|
||||||
* @param AFofMaxLikelihood allele frequency of max likelihood
|
|
||||||
*
|
|
||||||
* @return calls
|
|
||||||
*/
|
|
||||||
protected Map<String, Genotype> assignGenotypes(VariantContext vc,
|
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
|
||||||
int AFofMaxLikelihood) {
|
|
||||||
if ( !vc.isVariant() )
|
|
||||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
|
||||||
|
|
||||||
Allele refAllele = vc.getReference();
|
|
||||||
Allele altAllele = vc.getAlternateAllele(0);
|
|
||||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
|
||||||
|
|
||||||
// first, the potential alt calls
|
|
||||||
for ( String sample : AFMatrix.getSamples() ) {
|
|
||||||
Genotype g = vc.getGenotype(sample);
|
|
||||||
|
|
||||||
// set the genotype and confidence
|
|
||||||
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
|
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
|
||||||
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
|
|
||||||
myAlleles.add(refAllele);
|
|
||||||
myAlleles.add(refAllele);
|
|
||||||
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
|
|
||||||
myAlleles.add(refAllele);
|
|
||||||
myAlleles.add(altAllele);
|
|
||||||
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
|
|
||||||
myAlleles.add(altAllele);
|
|
||||||
myAlleles.add(altAllele);
|
|
||||||
}
|
|
||||||
|
|
||||||
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, g.getAttributes(), false));
|
|
||||||
}
|
|
||||||
|
|
||||||
return calls;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeAFMatrix(Map<String, Genotype> GLs) {
|
|
||||||
AFMatrix.clear();
|
|
||||||
|
|
||||||
for ( Genotype g : GLs.values() ) {
|
|
||||||
if ( g.hasLikelihoods() )
|
|
||||||
AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static class AlleleFrequencyMatrix {
|
|
||||||
|
|
||||||
private double[][] matrix; // allele frequency matrix
|
|
||||||
private int[] indexes; // matrix to maintain which genotype is active
|
|
||||||
private int maxN; // total possible frequencies in data
|
|
||||||
private int frequency; // current frequency
|
|
||||||
|
|
||||||
// data structures necessary to maintain a list of the best genotypes and their scores
|
|
||||||
private ArrayList<String> samples = new ArrayList<String>();
|
|
||||||
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
|
|
||||||
|
|
||||||
public AlleleFrequencyMatrix(int N) {
|
|
||||||
maxN = N;
|
|
||||||
matrix = new double[N][3];
|
|
||||||
indexes = new int[N];
|
|
||||||
clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getSamples() { return samples; }
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
frequency = 0;
|
|
||||||
for (int i = 0; i < maxN; i++)
|
|
||||||
indexes[i] = 0;
|
|
||||||
samples.clear();
|
|
||||||
samplesToGenotypesPerAF.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLikelihoods(double[] GLs, String sample) {
|
|
||||||
int index = samples.size();
|
|
||||||
samples.add(sample);
|
|
||||||
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
|
|
||||||
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
|
|
||||||
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
public void incrementFrequency() {
|
|
||||||
int N = samples.size();
|
|
||||||
if ( frequency == 2 * N )
|
|
||||||
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
|
|
||||||
frequency++;
|
|
||||||
|
|
||||||
double greedy = VALUE_NOT_CALCULATED;
|
|
||||||
int greedyIndex = -1;
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
|
|
||||||
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
|
|
||||||
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
|
|
||||||
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
|
|
||||||
greedyIndex = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
|
|
||||||
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
|
|
||||||
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
|
|
||||||
greedyIndex = i;
|
|
||||||
}
|
|
||||||
// note that we currently don't bother with breaking ties between samples
|
|
||||||
// (which would be done by looking at the HOM_VAR value) because it's highly
|
|
||||||
// unlikely that a collision will both occur and that the difference will
|
|
||||||
// be significant at HOM_VAR...
|
|
||||||
}
|
|
||||||
// if this person is already hom var, he can't add another alternate allele
|
|
||||||
// so we can ignore that case
|
|
||||||
}
|
|
||||||
if ( greedyIndex == -1 )
|
|
||||||
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
|
|
||||||
|
|
||||||
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
|
|
||||||
indexes[greedyIndex] = GenotypeType.BB.ordinal();
|
|
||||||
else
|
|
||||||
indexes[greedyIndex] = GenotypeType.AB.ordinal();
|
|
||||||
}
|
|
||||||
|
|
||||||
public double getLikelihoodsOfFrequency() {
|
|
||||||
double likelihoods = 0.0;
|
|
||||||
int N = samples.size();
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
likelihoods += matrix[i][indexes[i]];
|
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println(frequency);
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
System.out.print(samples.get(i));
|
|
||||||
for (int j=0; j < 3; j++) {
|
|
||||||
System.out.print(String.valueOf(matrix[i][j]));
|
|
||||||
System.out.print(indexes[i] == j ? "* " : " ");
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
}
|
|
||||||
System.out.println(likelihoods);
|
|
||||||
System.out.println();
|
|
||||||
*/
|
|
||||||
|
|
||||||
recordGenotypes();
|
|
||||||
|
|
||||||
return likelihoods;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
|
|
||||||
return samplesToGenotypesPerAF.get(frequency).get(sample);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void recordGenotypes() {
|
|
||||||
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
|
|
||||||
|
|
||||||
int index = 0;
|
|
||||||
for ( String sample : samples ) {
|
|
||||||
int genotype = indexes[index];
|
|
||||||
|
|
||||||
double score;
|
|
||||||
|
|
||||||
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
|
|
||||||
// if the max value is for the most likely genotype, we can compute next vs. next best
|
|
||||||
if ( genotype == maxEntry ) {
|
|
||||||
if ( genotype == GenotypeType.AA.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
|
||||||
else if ( genotype == GenotypeType.AB.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
|
||||||
else // ( genotype == GenotypeType.HOM.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
|
|
||||||
}
|
|
||||||
// otherwise, we need to calculate the probability of the genotype
|
|
||||||
else {
|
|
||||||
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
|
|
||||||
double chosenGenotype = normalized[genotype];
|
|
||||||
score = -1.0 * Math.log10(1.0 - chosenGenotype);
|
|
||||||
}
|
|
||||||
|
|
||||||
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
|
|
||||||
index++;
|
|
||||||
}
|
|
||||||
|
|
||||||
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -35,9 +35,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -108,9 +106,9 @@ public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
||||||
return sum;
|
return sum;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Map<String, Object> attrs = new HashMap<String, Object>(value.getAttributes());
|
VariantContextBuilder builder = new VariantContextBuilder(value);
|
||||||
VariantContextUtils.calculateChromosomeCounts(value, attrs, true);
|
VariantContextUtils.calculateChromosomeCounts(builder, true);
|
||||||
writer.add(VariantContext.modifyAttributes(value, attrs));
|
writer.add(builder.make());
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||||
}
|
}
|
||||||
|
|
@ -128,27 +126,27 @@ public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
VariantContext variantVC = null;
|
VariantContext variantVC = null;
|
||||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
for ( VariantContext vc : VCs ) {
|
for ( VariantContext vc : VCs ) {
|
||||||
if ( variantVC == null && vc.isVariant() )
|
if ( variantVC == null && vc.isVariant() )
|
||||||
variantVC = vc;
|
variantVC = vc;
|
||||||
genotypes.putAll(getGenotypesWithGLs(vc.getGenotypes()));
|
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( variantVC == null ) {
|
if ( variantVC == null ) {
|
||||||
VariantContext vc = VCs.get(0);
|
VariantContext vc = VCs.get(0);
|
||||||
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
|
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
|
||||||
}
|
}
|
||||||
return new VariantContext("VCwithGLs", variantVC.getChr(), variantVC.getStart(), variantVC.getEnd(), variantVC.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null);
|
|
||||||
|
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, Genotype> getGenotypesWithGLs(Map<String, Genotype> genotypes) {
|
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
|
||||||
Map<String, Genotype> genotypesWithGLs = new HashMap<String, Genotype>();
|
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
|
||||||
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
|
for ( final Genotype g : genotypes ) {
|
||||||
if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null )
|
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
|
||||||
genotypesWithGLs.put(g.getKey(), g.getValue());
|
genotypesWithGLs.add(g);
|
||||||
}
|
}
|
||||||
|
|
||||||
return genotypesWithGLs;
|
return genotypesWithGLs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -229,8 +229,7 @@ public class UnifiedGenotyperEngine {
|
||||||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||||
if ( vcInput == null )
|
if ( vcInput == null )
|
||||||
return null;
|
return null;
|
||||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
|
vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).make();
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// deal with bad/non-standard reference bases
|
// deal with bad/non-standard reference bases
|
||||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||||
|
|
@ -238,7 +237,7 @@ public class UnifiedGenotyperEngine {
|
||||||
|
|
||||||
Set<Allele> alleles = new HashSet<Allele>();
|
Set<Allele> alleles = new HashSet<Allele>();
|
||||||
alleles.add(Allele.create(ref.getBase(), true));
|
alleles.add(Allele.create(ref.getBase(), true));
|
||||||
vc = new VariantContext("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles);
|
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( annotationEngine != null ) {
|
if ( annotationEngine != null ) {
|
||||||
|
|
@ -265,7 +264,7 @@ public class UnifiedGenotyperEngine {
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
boolean addedAltAlleles = false;
|
boolean addedAltAlleles = false;
|
||||||
|
|
||||||
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
||||||
if ( !addedAltAlleles ) {
|
if ( !addedAltAlleles ) {
|
||||||
addedAltAlleles = true;
|
addedAltAlleles = true;
|
||||||
|
|
@ -281,22 +280,13 @@ public class UnifiedGenotyperEngine {
|
||||||
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
||||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||||
|
|
||||||
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
|
genotypes.add(new Genotype(GL.getSample(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
GenomeLoc loc = refContext.getLocus();
|
GenomeLoc loc = refContext.getLocus();
|
||||||
int endLoc = calculateEndPos(alleles, refAllele, loc);
|
int endLoc = calculateEndPos(alleles, refAllele, loc);
|
||||||
|
|
||||||
return new VariantContext("UG_call",
|
return new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make();
|
||||||
loc.getContig(),
|
|
||||||
loc.getStart(),
|
|
||||||
endLoc,
|
|
||||||
alleles,
|
|
||||||
genotypes,
|
|
||||||
VariantContext.NO_NEG_LOG_10PERROR,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
refContext.getBase());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
|
// private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
|
||||||
|
|
@ -354,7 +344,7 @@ public class UnifiedGenotyperEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
// create the genotypes
|
// create the genotypes
|
||||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||||
|
|
||||||
// print out stats if we have a writer
|
// print out stats if we have a writer
|
||||||
if ( verboseWriter != null )
|
if ( verboseWriter != null )
|
||||||
|
|
@ -420,8 +410,14 @@ public class UnifiedGenotyperEngine {
|
||||||
myAlleles = new HashSet<Allele>(1);
|
myAlleles = new HashSet<Allele>(1);
|
||||||
myAlleles.add(vc.getReference());
|
myAlleles.add(vc.getReference());
|
||||||
}
|
}
|
||||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
|
||||||
myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, refContext.getBase());
|
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
|
||||||
|
builder.genotypes(genotypes);
|
||||||
|
builder.log10PError(phredScaledConfidence/-10.0);
|
||||||
|
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
|
||||||
|
builder.attributes(attributes);
|
||||||
|
builder.referenceBaseForIndel(refContext.getBase());
|
||||||
|
VariantContext vcCall = builder.make();
|
||||||
|
|
||||||
if ( annotationEngine != null ) {
|
if ( annotationEngine != null ) {
|
||||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||||
|
|
@ -491,7 +487,7 @@ public class UnifiedGenotyperEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
// create the genotypes
|
// create the genotypes
|
||||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||||
|
|
||||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
|
@ -504,10 +500,15 @@ public class UnifiedGenotyperEngine {
|
||||||
myAlleles = new HashSet<Allele>(1);
|
myAlleles = new HashSet<Allele>(1);
|
||||||
myAlleles.add(vc.getReference());
|
myAlleles.add(vc.getReference());
|
||||||
}
|
}
|
||||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
|
||||||
myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, vc.getReferenceBaseForIndel());
|
|
||||||
|
|
||||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
|
||||||
|
builder.genotypes(genotypes);
|
||||||
|
builder.log10PError(phredScaledConfidence/-10.0);
|
||||||
|
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
|
||||||
|
builder.attributes(attributes);
|
||||||
|
builder.referenceBaseForIndel(vc.getReferenceBaseForIndel());
|
||||||
|
|
||||||
|
return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF));
|
||||||
}
|
}
|
||||||
|
|
||||||
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||||
|
|
@ -811,9 +812,6 @@ public class UnifiedGenotyperEngine {
|
||||||
case EXACT:
|
case EXACT:
|
||||||
afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter);
|
afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter);
|
||||||
break;
|
break;
|
||||||
case GRID_SEARCH:
|
|
||||||
afcm = new GridSearchAFEstimation(UAC, N, logger, verboseWriter);
|
|
||||||
break;
|
|
||||||
default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -556,40 +556,51 @@ public class PairHMMIndelErrorModel {
|
||||||
long indStart = start - haplotype.getStartPosition();
|
long indStart = start - haplotype.getStartPosition();
|
||||||
long indStop = stop - haplotype.getStartPosition();
|
long indStop = stop - haplotype.getStartPosition();
|
||||||
|
|
||||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
|
||||||
(int)indStart, (int)indStop);
|
|
||||||
|
|
||||||
double readLikelihood;
|
double readLikelihood;
|
||||||
if (matchMetricArray == null) {
|
if (DEBUG)
|
||||||
final int X_METRIC_LENGTH = readBases.length+1;
|
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
|
||||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString());
|
||||||
|
|
||||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) {
|
||||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
// read spanned more than allowed reference context: we currently can't deal with this
|
||||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
readLikelihood =0;
|
||||||
}
|
} else
|
||||||
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
{
|
||||||
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||||
if (previousHaplotypeSeen == null)
|
(int)indStart, (int)indStop);
|
||||||
startIdx = 0;
|
|
||||||
else {
|
if (matchMetricArray == null) {
|
||||||
final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
final int X_METRIC_LENGTH = readBases.length+1;
|
||||||
final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
|
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||||
final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP);
|
|
||||||
startIdx = Math.min(Math.min(s1, s2), s3);
|
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||||
}
|
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||||
previousHaplotypeSeen = haplotypeBases.clone();
|
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||||
previousGOP = currentContextGOP.clone();
|
}
|
||||||
previousGCP = currentContextGCP.clone();
|
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||||
|
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||||
|
if (previousHaplotypeSeen == null)
|
||||||
|
startIdx = 0;
|
||||||
|
else {
|
||||||
|
final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||||
|
final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
|
||||||
|
final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP);
|
||||||
|
startIdx = Math.min(Math.min(s1, s2), s3);
|
||||||
|
}
|
||||||
|
previousHaplotypeSeen = haplotypeBases.clone();
|
||||||
|
previousGOP = currentContextGOP.clone();
|
||||||
|
previousGCP = currentContextGCP.clone();
|
||||||
|
|
||||||
|
|
||||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
|
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
|
||||||
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
|
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
|
||||||
if (DEBUG) {
|
|
||||||
System.out.println("H:"+new String(haplotypeBases));
|
if (DEBUG) {
|
||||||
System.out.println("R:"+new String(readBases));
|
System.out.println("H:"+new String(haplotypeBases));
|
||||||
System.out.format("L:%4.2f\n",readLikelihood);
|
System.out.println("R:"+new String(readBases));
|
||||||
System.out.format("StPos:%d\n", startIdx);
|
System.out.format("L:%4.2f\n",readLikelihood);
|
||||||
|
System.out.format("StPos:%d\n", startIdx);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
readEl.put(a,readLikelihood);
|
readEl.put(a,readLikelihood);
|
||||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||||
|
|
|
||||||
|
|
@ -58,9 +58,7 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||||
import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator;
|
import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -1057,16 +1055,15 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
stop += event_length;
|
stop += event_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String,Genotype> genotypes = new HashMap<String,Genotype>();
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
|
|
||||||
for ( String sample : normalSamples ) {
|
for ( String sample : normalSamples ) {
|
||||||
|
|
||||||
Map<String,?> attrs = call.makeStatsAttributes(null);
|
Map<String,Object> attrs = call.makeStatsAttributes(null);
|
||||||
|
|
||||||
if ( call.isCall() ) // we made a call - put actual het genotype here:
|
if ( call.isCall() ) // we made a call - put actual het genotype here:
|
||||||
genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
|
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
|
||||||
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
|
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
|
||||||
genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
|
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
|
||||||
|
|
||||||
}
|
}
|
||||||
Set<String> filters = null;
|
Set<String> filters = null;
|
||||||
|
|
@ -1074,8 +1071,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
filters = new HashSet<String>();
|
filters = new HashSet<String>();
|
||||||
filters.add("NoCall");
|
filters.add("NoCall");
|
||||||
}
|
}
|
||||||
VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
|
VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
|
||||||
-1.0 /* log error */, filters, null, refBases[(int)start-1]);
|
.genotypes(genotypes).filters(filters).referenceBaseForIndel(refBases[(int)start-1]).make();
|
||||||
vcf.add(vc);
|
vcf.add(vc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1147,14 +1144,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
homRefAlleles.add( alleles.get(0));
|
homRefAlleles.add( alleles.get(0));
|
||||||
homRefAlleles.add( alleles.get(0));
|
homRefAlleles.add( alleles.get(0));
|
||||||
|
|
||||||
Map<String,Genotype> genotypes = new HashMap<String,Genotype>();
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
|
|
||||||
for ( String sample : normalSamples ) {
|
for ( String sample : normalSamples ) {
|
||||||
genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false));
|
genotypes.add(new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false));
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( String sample : tumorSamples ) {
|
for ( String sample : tumorSamples ) {
|
||||||
genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) );
|
genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) );
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> filters = null;
|
Set<String> filters = null;
|
||||||
|
|
@ -1171,8 +1168,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
filters.add("TCov");
|
filters.add("TCov");
|
||||||
}
|
}
|
||||||
|
|
||||||
VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
|
VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
|
||||||
-1.0 /* log error */, filters, attrs, refBases[(int)start-1]);
|
.genotypes(genotypes).filters(filters).attributes(attrs).referenceBaseForIndel(refBases[(int)start-1]).make();
|
||||||
vcf.add(vc);
|
vcf.add(vc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ import java.util.Arrays;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public abstract class BaseArray implements Comparable<BaseArray> {
|
abstract class BaseArray implements Comparable<BaseArray> {
|
||||||
protected Byte[] bases;
|
protected Byte[] bases;
|
||||||
|
|
||||||
public BaseArray(byte[] bases) {
|
public BaseArray(byte[] bases) {
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ import java.util.Iterator;
|
||||||
/*
|
/*
|
||||||
* CardinalityCounter object allows user to iterate over all assignment of arbitrary-cardinality variables.
|
* CardinalityCounter object allows user to iterate over all assignment of arbitrary-cardinality variables.
|
||||||
*/
|
*/
|
||||||
public class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
|
class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
|
||||||
private int[] cards;
|
private int[] cards;
|
||||||
private int[] valList;
|
private int[] valList;
|
||||||
private boolean hasNext;
|
private boolean hasNext;
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ import java.util.NoSuchElementException;
|
||||||
It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned
|
It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned
|
||||||
to save the current pointer for a later time (while the original iterator can continue to iterate).
|
to save the current pointer for a later time (while the original iterator can continue to iterate).
|
||||||
*/
|
*/
|
||||||
public class CloneableIteratorLinkedList<E> {
|
class CloneableIteratorLinkedList<E> {
|
||||||
private CloneableIteratorDoublyLinkedNode<E> first;
|
private CloneableIteratorDoublyLinkedNode<E> first;
|
||||||
private CloneableIteratorDoublyLinkedNode<E> last;
|
private CloneableIteratorDoublyLinkedNode<E> last;
|
||||||
private int size;
|
private int size;
|
||||||
|
|
|
||||||
|
|
@ -21,13 +21,13 @@
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
package org.broadinstitute.sting.utils;
|
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
public class DisjointSet {
|
class DisjointSet {
|
||||||
private ItemNode[] nodes;
|
private ItemNode[] nodes;
|
||||||
|
|
||||||
public DisjointSet(int numItems) {
|
public DisjointSet(int numItems) {
|
||||||
|
|
@ -27,7 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class Haplotype extends BaseArray implements Cloneable {
|
class Haplotype extends BaseArray implements Cloneable {
|
||||||
public Haplotype(byte[] bases) {
|
public Haplotype(byte[] bases) {
|
||||||
super(bases);
|
super(bases);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,133 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
|
||||||
import org.broadinstitute.sting.commandline.Input;
|
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
|
||||||
import org.broadinstitute.sting.commandline.RodBinding;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD.
|
|
||||||
*/
|
|
||||||
@Allows(value = {DataSource.REFERENCE})
|
|
||||||
@Requires(value = {DataSource.REFERENCE})
|
|
||||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
|
||||||
|
|
||||||
public class MergeMNPsWalker extends RodWalker<Integer, Integer> {
|
|
||||||
|
|
||||||
@Output(doc = "File to which variants should be written", required = true)
|
|
||||||
protected VCFWriter writer = null;
|
|
||||||
private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
|
|
||||||
|
|
||||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
|
||||||
protected int maxGenomicDistanceForMNP = 1;
|
|
||||||
|
|
||||||
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
|
|
||||||
public RodBinding<VariantContext> variants;
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
initializeVcfWriter();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeVcfWriter() {
|
|
||||||
// false <-> don't take control of writer, since didn't create it:
|
|
||||||
vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false);
|
|
||||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
|
||||||
|
|
||||||
// setup the header fields:
|
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
|
||||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
|
||||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
|
||||||
|
|
||||||
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
|
|
||||||
vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(variants.getName()).getGenotypeSamples())));
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean generateExtendedEvents() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduceInit() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For each site, send it to be (possibly) merged with previously observed sites.
|
|
||||||
*
|
|
||||||
* @param tracker the meta-data tracker
|
|
||||||
* @param ref the reference base
|
|
||||||
* @param context the context for the given locus
|
|
||||||
* @return dummy Integer
|
|
||||||
*/
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if (tracker == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
for (VariantContext vc : tracker.getValues(variants, context.getLocation()))
|
|
||||||
writeVCF(vc);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeVCF(VariantContext vc) {
|
|
||||||
WriteVCF.writeVCF(vc, vcMergerWriter, logger);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer result, Integer total) {
|
|
||||||
if (result == null)
|
|
||||||
return total;
|
|
||||||
|
|
||||||
return total + result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Release any VariantContexts not yet processed.
|
|
||||||
*
|
|
||||||
* @param result Empty for now...
|
|
||||||
*/
|
|
||||||
public void onTraversalDone(Integer result) {
|
|
||||||
vcMergerWriter.close();
|
|
||||||
|
|
||||||
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
|
||||||
System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
|
|
||||||
System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
|
|
||||||
System.out.println(vcMergerWriter.getAltAlleleStats());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010, The Broad Institute
|
* Copyright (c) 2011, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
|
|
@ -33,10 +33,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
@ -44,7 +41,7 @@ import java.util.*;
|
||||||
|
|
||||||
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
|
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
|
||||||
|
|
||||||
public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
private VCFWriter innerWriter;
|
private VCFWriter innerWriter;
|
||||||
|
|
||||||
private GenomeLocParser genomeLocParser;
|
private GenomeLocParser genomeLocParser;
|
||||||
|
|
@ -52,7 +49,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
private ReferenceSequenceFile referenceFileForMNPmerging;
|
private ReferenceSequenceFile referenceFileForMNPmerging;
|
||||||
|
|
||||||
private VariantContextMergeRule vcMergeRule;
|
private VariantContextMergeRule vcMergeRule;
|
||||||
private VariantContextUtils.AlleleMergeRule alleleMergeRule;
|
private PhasingUtils.AlleleMergeRule alleleMergeRule;
|
||||||
|
|
||||||
private String useSingleSample = null;
|
private String useSingleSample = null;
|
||||||
|
|
||||||
|
|
@ -71,7 +68,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
// Should we call innerWriter.close() in close()
|
// Should we call innerWriter.close() in close()
|
||||||
private boolean takeOwnershipOfInner;
|
private boolean takeOwnershipOfInner;
|
||||||
|
|
||||||
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, PhasingUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||||
this.innerWriter = innerWriter;
|
this.innerWriter = innerWriter;
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
try {
|
try {
|
||||||
|
|
@ -122,7 +119,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
if (useSingleSample != null) { // only want to output context for one sample
|
if (useSingleSample != null) { // only want to output context for one sample
|
||||||
Genotype sampGt = vc.getGenotype(useSingleSample);
|
Genotype sampGt = vc.getGenotype(useSingleSample);
|
||||||
if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP!
|
if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP!
|
||||||
vc = vc.subContextFromGenotypes(sampGt);
|
vc = vc.subContextFromSample(sampGt.getSampleName());
|
||||||
else // asked for a sample that this vc does not contain, so ignore this vc:
|
else // asked for a sample that this vc does not contain, so ignore this vc:
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -179,14 +176,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
boolean mergedRecords = false;
|
boolean mergedRecords = false;
|
||||||
if (shouldAttemptToMerge) {
|
if (shouldAttemptToMerge) {
|
||||||
numRecordsSatisfyingMergeRule++;
|
numRecordsSatisfyingMergeRule++;
|
||||||
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
|
VariantContext mergedVc = PhasingUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
|
||||||
|
|
||||||
if (mergedVc != null) {
|
if (mergedVc != null) {
|
||||||
mergedRecords = true;
|
mergedRecords = true;
|
||||||
|
|
||||||
Map<String, Object> addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc);
|
Map<String, Object> addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc);
|
||||||
addedAttribs.putAll(mergedVc.getAttributes());
|
addedAttribs.putAll(mergedVc.getAttributes());
|
||||||
mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs);
|
mergedVc = new VariantContextBuilder(mergedVc).attributes(addedAttribs).make();
|
||||||
|
|
||||||
vcfrWaitingToMerge = new VCFRecord(mergedVc, true);
|
vcfrWaitingToMerge = new VCFRecord(mergedVc, true);
|
||||||
numMergedRecords++;
|
numMergedRecords++;
|
||||||
|
|
@ -218,26 +215,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
filteredVcfrList.clear();
|
filteredVcfrList.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getNumRecordsAttemptToMerge() {
|
|
||||||
return numRecordsAttemptToMerge;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getNumRecordsSatisfyingMergeRule() {
|
|
||||||
return numRecordsSatisfyingMergeRule;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getNumMergedRecords() {
|
|
||||||
return numMergedRecords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VariantContextMergeRule getVcMergeRule() {
|
|
||||||
return vcMergeRule;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() {
|
|
||||||
return alleleMergeRule;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets a string representation of this object.
|
* Gets a string representation of this object.
|
||||||
*
|
*
|
||||||
|
|
@ -248,13 +225,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
return getClass().getName();
|
return getClass().getName();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getAltAlleleStats() {
|
|
||||||
if (altAlleleStats == null)
|
|
||||||
return "";
|
|
||||||
|
|
||||||
return "\n" + altAlleleStats.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class VCFRecord {
|
private static class VCFRecord {
|
||||||
public VariantContext vc;
|
public VariantContext vc;
|
||||||
public boolean resultedFromMerge;
|
public boolean resultedFromMerge;
|
||||||
|
|
@ -373,7 +343,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||||
if (shouldAttemptToMerge) {
|
if (shouldAttemptToMerge) {
|
||||||
aas.numSuccessiveGenotypesAttemptedToBeMerged++;
|
aas.numSuccessiveGenotypesAttemptedToBeMerged++;
|
||||||
|
|
||||||
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
if (!PhasingUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
||||||
aas.segregationUnknown++;
|
aas.segregationUnknown++;
|
||||||
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
|
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
|
||||||
}
|
}
|
||||||
|
|
@ -498,9 +468,9 @@ class DistanceMergeRule extends VariantContextMergeRule {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule {
|
class ExistsDoubleAltAlleleMergeRule extends PhasingUtils.AlleleMergeRule {
|
||||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
||||||
return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
|
return PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
@ -515,7 +485,7 @@ class SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule {
|
||||||
|
|
||||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
||||||
// Must be interesting AND consistent:
|
// Must be interesting AND consistent:
|
||||||
return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
|
return super.allelesShouldBeMerged(vc1, vc2) && PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
|
||||||
|
|
@ -1,236 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.*;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD.
|
|
||||||
*/
|
|
||||||
@Allows(value = {DataSource.REFERENCE})
|
|
||||||
@Requires(value = {DataSource.REFERENCE})
|
|
||||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
|
||||||
|
|
||||||
public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, Integer> {
|
|
||||||
|
|
||||||
@Output(doc = "File to which variants should be written", required = true)
|
|
||||||
protected VCFWriter writer = null;
|
|
||||||
private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
|
|
||||||
|
|
||||||
@Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false)
|
|
||||||
protected int maxGenomicDistance = 1;
|
|
||||||
|
|
||||||
@Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false)
|
|
||||||
protected String useSingleSample = null;
|
|
||||||
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName = "emitOnlyMergedRecords", shortName = "emitOnlyMerged", doc = "Only output records that resulted from merging [For DEBUGGING purposes only - DO NOT USE, since it disregards the semantics of '|' as 'phased relative to previous non-filtered VC']; [default:false]", required = false)
|
|
||||||
protected boolean emitOnlyMergedRecords = false;
|
|
||||||
|
|
||||||
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
|
||||||
protected boolean disablePrintAlternateAlleleStatistics = false;
|
|
||||||
|
|
||||||
public final static String IGNORE_REFSEQ = "IGNORE";
|
|
||||||
public final static String UNION_REFSEQ = "UNION";
|
|
||||||
public final static String INTERSECT_REFSEQ = "INTERSECT";
|
|
||||||
|
|
||||||
@Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false)
|
|
||||||
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;
|
|
||||||
|
|
||||||
@Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false)
|
|
||||||
protected boolean dontRequireSomeSampleHasDoubleAltAllele = false;
|
|
||||||
|
|
||||||
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
|
|
||||||
public RodBinding<VariantContext> variants;
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
initializeVcfWriter();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeVcfWriter() {
|
|
||||||
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
|
|
||||||
|
|
||||||
VariantContextMergeRule vcMergeRule;
|
|
||||||
if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
|
|
||||||
vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser);
|
|
||||||
else
|
|
||||||
vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation);
|
|
||||||
|
|
||||||
VariantContextUtils.AlleleMergeRule alleleMergeRule;
|
|
||||||
if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them if there is a trailing prefix of polymorphisms (i.e., upstream polymorphic site):
|
|
||||||
alleleMergeRule = new PrefixPolymorphismMergeAllelesRule();
|
|
||||||
else
|
|
||||||
alleleMergeRule = new ExistsDoubleAltAlleleMergeRule();
|
|
||||||
|
|
||||||
// false <-> don't take control of writer, since didn't create it:
|
|
||||||
vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
|
|
||||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
|
||||||
|
|
||||||
// setup the header fields:
|
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
|
||||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
|
||||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
|
||||||
|
|
||||||
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
|
|
||||||
vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(variants.getName()).getGenotypeSamples())));
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean generateExtendedEvents() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduceInit() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For each site, send it to be (possibly) merged with previously observed sites.
|
|
||||||
*
|
|
||||||
* @param tracker the meta-data tracker
|
|
||||||
* @param ref the reference base
|
|
||||||
* @param context the context for the given locus
|
|
||||||
* @return dummy Integer
|
|
||||||
*/
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if (tracker == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
for (VariantContext vc : tracker.getValues(variants, context.getLocation()))
|
|
||||||
writeVCF(vc);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeVCF(VariantContext vc) {
|
|
||||||
WriteVCF.writeVCF(vc, vcMergerWriter, logger);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer result, Integer total) {
|
|
||||||
if (result == null)
|
|
||||||
return total;
|
|
||||||
|
|
||||||
return total + result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Release any VariantContexts not yet processed.
|
|
||||||
*
|
|
||||||
* @param result Empty for now...
|
|
||||||
*/
|
|
||||||
public void onTraversalDone(Integer result) {
|
|
||||||
vcMergerWriter.close();
|
|
||||||
|
|
||||||
if (useSingleSample != null)
|
|
||||||
System.out.println("Only considered single sample: " + useSingleSample);
|
|
||||||
|
|
||||||
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
|
||||||
System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
|
|
||||||
System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
|
|
||||||
System.out.println(vcMergerWriter.getAltAlleleStats());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * How the same-gene test is combined with the genomic-distance test when deciding whether two
 * records are candidates for merging.
 */
enum MergeBasedOnRefSeqAnnotation {
    /** Candidates if they are within distance OR lie on the same gene. */
    UNION_WITH_DIST,

    /** Candidates only if they are within distance AND lie on the same gene. */
    INTERSECT_WITH_DIST
}
|
|
||||||
|
|
||||||
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
|
|
||||||
private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;
|
|
||||||
|
|
||||||
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
|
|
||||||
super(maxGenomicDistanceForMNP, genomeLocParser);
|
|
||||||
|
|
||||||
if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
|
|
||||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
|
|
||||||
else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
|
|
||||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
|
|
||||||
else
|
|
||||||
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) {
|
|
||||||
boolean withinDistance = super.shouldAttemptToMerge(vc1, vc2);
|
|
||||||
|
|
||||||
if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
|
|
||||||
return withinDistance || sameGene(vc1, vc2);
|
|
||||||
else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST
|
|
||||||
return withinDistance && sameGene(vc1, vc2);
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
|
|
||||||
Set<String> names_vc1 = RefSeqDataParser.getRefSeqNames(vc1);
|
|
||||||
Set<String> names_vc2 = RefSeqDataParser.getRefSeqNames(vc2);
|
|
||||||
names_vc1.retainAll(names_vc2);
|
|
||||||
|
|
||||||
if (!names_vc1.isEmpty())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// Check refseq.name2:
|
|
||||||
Set<String> names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true);
|
|
||||||
Set<String> names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true);
|
|
||||||
names2_vc1.retainAll(names2_vc2);
|
|
||||||
|
|
||||||
return !names2_vc1.isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, Object> addToMergedAttributes(VariantContext vc1, VariantContext vc2) {
|
|
||||||
Map<String, Object> addedAttribs = super.addToMergedAttributes(vc1, vc2);
|
|
||||||
addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2));
|
|
||||||
return addedAttribs;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PrefixPolymorphismMergeAllelesRule extends VariantContextUtils.AlleleMergeRule {
|
|
||||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
|
||||||
return vc1.isPolymorphic();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return super.toString() + ", there exists a polymorphism at the start of the merged allele";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -13,10 +13,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -135,7 +132,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
private final Allele NO_CALL = Allele.create(".",false);
|
private final Allele NO_CALL = Allele.create(".",false);
|
||||||
private final String DUMMY_NAME = "DummySample";
|
private final String DUMMY_NAME = "DummySample";
|
||||||
|
|
||||||
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
|
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
|
||||||
|
|
||||||
private ArrayList<Allele> getAlleles(Genotype.Type genotype){
|
private ArrayList<Allele> getAlleles(Genotype.Type genotype){
|
||||||
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
|
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
|
||||||
|
|
@ -165,10 +162,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
//Homozygous genotypes will be set as phased, heterozygous won't be
|
//Homozygous genotypes will be set as phased, heterozygous won't be
|
||||||
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
|
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
|
||||||
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
|
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
|
||||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
}
|
}
|
||||||
|
|
||||||
//Find the phase for a parent/child pair
|
//Find the phase for a parent/child pair
|
||||||
|
|
@ -176,8 +173,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
|
|
||||||
//Special case for Het/Het as it is ambiguous
|
//Special case for Het/Het as it is ambiguous
|
||||||
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
|
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
|
||||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
|
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false));
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -189,23 +186,23 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
//If there is a possible phasing between the mother and child => phase
|
//If there is a possible phasing between the mother and child => phase
|
||||||
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
|
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
|
||||||
if(childTransmittedAlleleIndex > -1){
|
if(childTransmittedAlleleIndex > -1){
|
||||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||||
childPhasedAlleles.add(childAlleles.get(0));
|
childPhasedAlleles.add(childAlleles.get(0));
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||||
}
|
}
|
||||||
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
|
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
|
||||||
parentPhasedAlleles.add(parentAlleles.get(1));
|
parentPhasedAlleles.add(parentAlleles.get(1));
|
||||||
parentPhasedAlleles.add(parentAlleles.get(0));
|
parentPhasedAlleles.add(parentAlleles.get(0));
|
||||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||||
childPhasedAlleles.add(childAlleles.get(0));
|
childPhasedAlleles.add(childAlleles.get(0));
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||||
}
|
}
|
||||||
//This is a Mendelian Violation => Do not phase
|
//This is a Mendelian Violation => Do not phase
|
||||||
else{
|
else{
|
||||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -239,7 +236,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
motherPhasedAlleles.add(motherAlleles.get(0));
|
motherPhasedAlleles.add(motherAlleles.get(0));
|
||||||
else
|
else
|
||||||
motherPhasedAlleles.add(motherAlleles.get(1));
|
motherPhasedAlleles.add(motherAlleles.get(1));
|
||||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||||
|
|
||||||
//Create father's genotype
|
//Create father's genotype
|
||||||
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
|
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
|
||||||
|
|
@ -248,10 +245,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
fatherPhasedAlleles.add(fatherAlleles.get(0));
|
fatherPhasedAlleles.add(fatherAlleles.get(0));
|
||||||
else
|
else
|
||||||
fatherPhasedAlleles.add(fatherAlleles.get(1));
|
fatherPhasedAlleles.add(fatherAlleles.get(1));
|
||||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||||
|
|
||||||
//Create child's genotype
|
//Create child's genotype
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||||
|
|
||||||
//Once a phased combination is found; exit
|
//Once a phased combination is found; exit
|
||||||
return;
|
return;
|
||||||
|
|
@ -259,9 +256,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
}
|
}
|
||||||
|
|
||||||
//If this is reached then no phasing could be found
|
//If this is reached then no phasing could be found
|
||||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
|
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
|
||||||
|
|
@ -301,62 +298,62 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Applies the trio genotype combination to the given trio.
|
* Applies the trio genotype combination to the given trio.
|
||||||
* @param ref: Reference allele
|
* @param ref: Reference allele
|
||||||
* @param alt: Alternate allele
|
* @param alt: Alternate allele
|
||||||
* @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
|
* @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
|
||||||
* @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
|
* @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
|
||||||
* @param childGenotype: Genotype of the child to phase using this trio genotype combination
|
* @param childGenotype: Genotype of the child to phase using this trio genotype combination
|
||||||
* @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
|
* @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
|
||||||
* @param phasedGenotypes: An ArrayList<Genotype> to which the newly phased genotypes are added in the following order: Mother, Father, Child
|
* @param phasedGenotypes: An ArrayList<Genotype> to which the newly phased genotypes are added in the following order: Mother, Father, Child
|
||||||
*/
|
*/
|
||||||
public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList<Genotype> phasedGenotypes){
|
public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList<Genotype> phasedGenotypes){
|
||||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
|
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
|
||||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
|
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
|
||||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
|
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
|
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
|
||||||
|
|
||||||
int phredScoreTransmission = -1;
|
int phredScoreTransmission = -1;
|
||||||
if(transmissionProb != NO_TRANSMISSION_PROB)
|
if(transmissionProb != NO_TRANSMISSION_PROB)
|
||||||
phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
|
phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
|
||||||
|
|
||||||
//Handle null, missing and unavailable genotypes
|
//Handle null, missing and unavailable genotypes
|
||||||
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
|
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
|
||||||
//genotype so it is safe to return the original genotype in this case.
|
//genotype so it is safe to return the original genotype in this case.
|
||||||
//In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
|
//In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
|
||||||
if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType()))
|
if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType()))
|
||||||
return genotype;
|
return genotype;
|
||||||
|
|
||||||
//Add the transmission probability
|
//Add the transmission probability
|
||||||
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
|
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
|
||||||
genotypeAttributes.putAll(genotype.getAttributes());
|
genotypeAttributes.putAll(genotype.getAttributes());
|
||||||
if(transmissionProb>NO_TRANSMISSION_PROB)
|
if(transmissionProb>NO_TRANSMISSION_PROB)
|
||||||
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
|
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
|
||||||
|
|
||||||
ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);
|
ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);
|
||||||
for(Allele allele : phasedGenotype.getAlleles()){
|
for(Allele allele : phasedGenotype.getAlleles()){
|
||||||
if(allele.isReference())
|
if(allele.isReference())
|
||||||
phasedAlleles.add(refAllele);
|
phasedAlleles.add(refAllele);
|
||||||
else if(allele.isNonReference())
|
else if(allele.isNonReference())
|
||||||
phasedAlleles.add(altAllele);
|
phasedAlleles.add(altAllele);
|
||||||
//At this point there should not be any other alleles left
|
//At this point there should not be any other alleles left
|
||||||
else
|
else
|
||||||
throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
|
throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Compute the new Log10Error if the genotype is different from the original genotype
|
//Compute the new Log10Error if the genotype is different from the original genotype
|
||||||
double negLog10Error;
|
double log10Error;
|
||||||
if(genotype.getType() == phasedGenotype.getType())
|
if(genotype.getType() == phasedGenotype.getType())
|
||||||
negLog10Error = genotype.getNegLog10PError();
|
log10Error = genotype.getLog10PError();
|
||||||
else
|
else
|
||||||
negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
|
log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
|
||||||
|
|
||||||
return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
|
return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -404,14 +401,14 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
for(Sample familyMember : family){
|
for(Sample familyMember : family){
|
||||||
parents = familyMember.getParents();
|
parents = familyMember.getParents();
|
||||||
if(parents.size()>0){
|
if(parents.size()>0){
|
||||||
if(family.containsAll(parents))
|
if(family.containsAll(parents))
|
||||||
this.trios.add(familyMember);
|
this.trios.add(familyMember);
|
||||||
else
|
else
|
||||||
logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
|
logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -426,11 +423,11 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
|
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
|
||||||
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class);
|
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class);
|
||||||
for(Genotype.Type mother : Genotype.Type.values()){
|
for(Genotype.Type mother : Genotype.Type.values()){
|
||||||
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
|
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
|
||||||
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
|
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
|
||||||
for(Genotype.Type father : Genotype.Type.values()){
|
for(Genotype.Type father : Genotype.Type.values()){
|
||||||
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
|
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
|
||||||
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
|
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
|
||||||
for(Genotype.Type child : Genotype.Type.values()){
|
for(Genotype.Type child : Genotype.Type.values()){
|
||||||
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
|
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
|
||||||
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
|
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
|
||||||
|
|
@ -671,9 +668,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
else
|
else
|
||||||
phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
|
phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
|
||||||
|
|
||||||
//Return the phased genotypes
|
//Return the phased genotypes
|
||||||
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
|
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
|
||||||
return bestMVCount.get(configuration_index);
|
return bestMVCount.get(configuration_index);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -682,14 +679,14 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
|
|
||||||
//Increment metrics counters
|
//Increment metrics counters
|
||||||
if(parent.isCalled() && child.isCalled()){
|
if(parent.isCalled() && child.isCalled()){
|
||||||
counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
|
counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
|
||||||
if(parent.isPhased())
|
if(parent.isPhased())
|
||||||
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
|
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
|
||||||
else{
|
else{
|
||||||
counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
|
counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
|
||||||
if(parent.isHet() && child.isHet())
|
if(parent.isHet() && child.isHet())
|
||||||
counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
|
counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
|
||||||
}
|
}
|
||||||
}else{
|
}else{
|
||||||
counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
|
counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
|
||||||
}
|
}
|
||||||
|
|
@ -700,21 +697,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
|
|
||||||
//Increment metrics counters
|
//Increment metrics counters
|
||||||
if(mother.isCalled() && father.isCalled() && child.isCalled()){
|
if(mother.isCalled() && father.isCalled() && child.isCalled()){
|
||||||
counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
|
counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
|
||||||
if(mother.isPhased())
|
if(mother.isPhased())
|
||||||
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
|
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
|
||||||
|
|
||||||
else{
|
else{
|
||||||
if(mvCount > 0){
|
if(mvCount > 0){
|
||||||
if(mvCount >1)
|
if(mvCount >1)
|
||||||
counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
|
counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
|
||||||
else
|
else
|
||||||
counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
|
counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
|
||||||
}
|
}
|
||||||
else if(mother.isHet() && father.isHet() && child.isHet())
|
else if(mother.isHet() && father.isHet() && child.isHet())
|
||||||
counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
|
counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
|
||||||
|
|
||||||
}
|
}
|
||||||
}else{
|
}else{
|
||||||
counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
|
counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
|
||||||
}
|
}
|
||||||
|
|
@ -749,11 +746,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
|
|
||||||
if (tracker != null) {
|
if (tracker != null) {
|
||||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
|
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
|
||||||
|
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||||
|
|
||||||
Map<String, Genotype> genotypeMap = vc.getGenotypes();
|
GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes());
|
||||||
|
|
||||||
int mvCount;
|
|
||||||
|
|
||||||
for (Sample sample : trios) {
|
for (Sample sample : trios) {
|
||||||
Genotype mother = vc.getGenotype(sample.getMaternalID());
|
Genotype mother = vc.getGenotype(sample.getMaternalID());
|
||||||
Genotype father = vc.getGenotype(sample.getPaternalID());
|
Genotype father = vc.getGenotype(sample.getPaternalID());
|
||||||
|
|
@ -764,18 +759,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
ArrayList<Genotype> trioGenotypes = new ArrayList<Genotype>(3);
|
ArrayList<Genotype> trioGenotypes = new ArrayList<Genotype>(3);
|
||||||
mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
||||||
|
|
||||||
Genotype phasedMother = trioGenotypes.get(0);
|
Genotype phasedMother = trioGenotypes.get(0);
|
||||||
Genotype phasedFather = trioGenotypes.get(1);
|
Genotype phasedFather = trioGenotypes.get(1);
|
||||||
Genotype phasedChild = trioGenotypes.get(2);
|
Genotype phasedChild = trioGenotypes.get(2);
|
||||||
|
|
||||||
//Fill the genotype map with the new genotypes and increment metrics counters
|
//Fill the genotype map with the new genotypes and increment metrics counters
|
||||||
genotypeMap.put(phasedChild.getSampleName(),phasedChild);
|
genotypesContext.replace(phasedChild);
|
||||||
if(mother != null){
|
if(mother != null){
|
||||||
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
|
genotypesContext.replace(phasedMother);
|
||||||
if(father != null){
|
if(father != null){
|
||||||
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
|
genotypesContext.replace(phasedFather);
|
||||||
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
|
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
|
||||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
||||||
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||||
|
|
@ -789,24 +784,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
|
genotypesContext.replace(phasedFather);
|
||||||
updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
|
updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
|
||||||
if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
//Report violation if set so
|
//Report violation if set so
|
||||||
//TODO: ADAPT FOR PAIRS TOO!!
|
//TODO: ADAPT FOR PAIRS TOO!!
|
||||||
if(mvCount>0 && mvFile != null)
|
if(mvCount>0 && mvFile != null)
|
||||||
mvFile.println(mvfLine);
|
mvFile.println(mvfLine);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
builder.genotypes(genotypesContext);
|
||||||
VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap);
|
vcfWriter.add(builder.make());
|
||||||
|
|
||||||
vcfWriter.add(newvc);
|
|
||||||
}
|
}
|
||||||
return metricsCounters;
|
return metricsCounters;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,12 +23,10 @@
|
||||||
*/
|
*/
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.DisjointSet;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
// Represents an undirected graph with no self-edges:
|
// Represents an undirected graph with no self-edges:
|
||||||
public class PhasingGraph implements Iterable<PhasingGraphEdge> {
|
class PhasingGraph implements Iterable<PhasingGraphEdge> {
|
||||||
private Neighbors[] adj;
|
private Neighbors[] adj;
|
||||||
|
|
||||||
public PhasingGraph(int numVertices) {
|
public PhasingGraph(int numVertices) {
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
/*
|
/*
|
||||||
Edge class for PhasingGraph
|
Edge class for PhasingGraph
|
||||||
*/
|
*/
|
||||||
public class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
|
class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
|
||||||
protected int v1;
|
protected int v1;
|
||||||
protected int v2;
|
protected int v2;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class PhasingRead extends BaseArray {
|
class PhasingRead extends BaseArray {
|
||||||
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
|
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
|
||||||
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
|
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
|
||||||
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
|
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,382 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import net.sf.samtools.util.StringUtil;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [Short one sentence description of this walker]
|
||||||
|
* <p/>
|
||||||
|
* <p>
|
||||||
|
* [Functionality of this walker]
|
||||||
|
* </p>
|
||||||
|
* <p/>
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* [Input description]
|
||||||
|
* </p>
|
||||||
|
* <p/>
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* [Output description]
|
||||||
|
* </p>
|
||||||
|
* <p/>
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java
|
||||||
|
* -jar GenomeAnalysisTK.jar
|
||||||
|
* -T $WalkerName
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Your Name
|
||||||
|
* @since Date created
|
||||||
|
*/
|
||||||
|
class PhasingUtils {
|
||||||
|
static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
|
||||||
|
if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// Check that it's logically possible to merge the VCs:
|
||||||
|
if (!allSamplesAreMergeable(vc1, vc2))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// Check if there's a "point" in merging the VCs (e.g., annotations could be changed)
|
||||||
|
if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return reallyMergeIntoMNP(vc1, vc2, referenceFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
|
||||||
|
int startInter = vc1.getEnd() + 1;
|
||||||
|
int endInter = vc2.getStart() - 1;
|
||||||
|
byte[] intermediateBases = null;
|
||||||
|
if (startInter <= endInter) {
|
||||||
|
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
|
||||||
|
StringUtil.toUpperCase(intermediateBases);
|
||||||
|
}
|
||||||
|
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
|
||||||
|
|
||||||
|
GenotypesContext mergedGenotypes = GenotypesContext.create();
|
||||||
|
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||||
|
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||||
|
|
||||||
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
|
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
|
||||||
|
|
||||||
|
/* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
|
||||||
|
we preserve phase information (if any) relative to whatever precedes vc1:
|
||||||
|
*/
|
||||||
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
|
for (Allele all1 : site1Alleles) {
|
||||||
|
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||||
|
|
||||||
|
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
|
||||||
|
mergedAllelesForSample.add(mergedAllele);
|
||||||
|
}
|
||||||
|
|
||||||
|
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
|
||||||
|
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
|
||||||
|
|
||||||
|
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
|
||||||
|
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
|
||||||
|
if (phaseQual.PQ != null)
|
||||||
|
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
|
||||||
|
|
||||||
|
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
|
||||||
|
mergedGenotypes.add(mergedGt);
|
||||||
|
}
|
||||||
|
|
||||||
|
String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
|
||||||
|
double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
|
||||||
|
Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
|
||||||
|
Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);
|
||||||
|
|
||||||
|
// ids
|
||||||
|
List<String> mergedIDs = new ArrayList<String>();
|
||||||
|
if ( vc1.hasID() ) mergedIDs.add(vc1.getID());
|
||||||
|
if ( vc2.hasID() ) mergedIDs.add(vc2.getID());
|
||||||
|
String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
|
||||||
|
|
||||||
|
VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs);
|
||||||
|
VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true);
|
||||||
|
return mergedBuilder.make();
|
||||||
|
}
|
||||||
|
|
||||||
|
static String mergeVariantContextNames(String name1, String name2) {
|
||||||
|
return name1 + "_" + name2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
|
||||||
|
Map<String, Object> mergedAttribs = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
List<VariantContext> vcList = new LinkedList<VariantContext>();
|
||||||
|
vcList.add(vc1);
|
||||||
|
vcList.add(vc2);
|
||||||
|
|
||||||
|
String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
|
||||||
|
for (String orAttrib : MERGE_OR_ATTRIBS) {
|
||||||
|
boolean attribVal = false;
|
||||||
|
for (VariantContext vc : vcList) {
|
||||||
|
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
|
||||||
|
if (attribVal) // already true, so no reason to continue:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
mergedAttribs.put(orAttrib, attribVal);
|
||||||
|
}
|
||||||
|
|
||||||
|
return mergedAttribs;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
|
||||||
|
GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1);
|
||||||
|
GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2);
|
||||||
|
|
||||||
|
if (!loc1.onSameContig(loc2))
|
||||||
|
throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
|
||||||
|
|
||||||
|
if (!loc1.isBefore(loc2))
|
||||||
|
throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2");
|
||||||
|
|
||||||
|
if (vc1.isFiltered() || vc2.isFiltered())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) {
|
||||||
|
for (final Genotype gt : vc.getGenotypes()) {
|
||||||
|
if (gt.isNoCall() || gt.isFiltered())
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
|
||||||
|
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
|
||||||
|
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||||
|
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||||
|
|
||||||
|
if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
|
||||||
|
if (gt1.getPloidy() != gt2.getPloidy())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard].
|
||||||
|
|
||||||
|
HOWEVER, EVEN if this is not the case, but gt1.isHom(),
|
||||||
|
it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1.
|
||||||
|
*/
|
||||||
|
return (gt2.isPhased() || gt2.isHom() || gt1.isHom());
|
||||||
|
}
|
||||||
|
|
||||||
|
static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
|
||||||
|
if (gt2.isPhased() || gt2.isHom())
|
||||||
|
return new PhaseAndQuality(gt1); // maintain the phase of gt1
|
||||||
|
|
||||||
|
if (!gt1.isHom())
|
||||||
|
throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");
|
||||||
|
|
||||||
|
/* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype
|
||||||
|
|
||||||
|
For example, if we're merging the third Genotype with the second one:
|
||||||
|
0/1
|
||||||
|
1|1
|
||||||
|
0/1
|
||||||
|
|
||||||
|
Then, we want to output:
|
||||||
|
0/1
|
||||||
|
1/2
|
||||||
|
*/
|
||||||
|
return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
|
||||||
|
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||||
|
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||||
|
|
||||||
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
|
for (Allele all1 : site1Alleles) {
|
||||||
|
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||||
|
|
||||||
|
if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
|
||||||
|
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
|
||||||
|
Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
|
||||||
|
Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
|
||||||
|
|
||||||
|
// Note the segregation of the alleles for the reference genome:
|
||||||
|
allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
|
||||||
|
allele2ToAllele1.put(vc2.getReference(), vc1.getReference());
|
||||||
|
|
||||||
|
// Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
|
||||||
|
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||||
|
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||||
|
|
||||||
|
List<Allele> site1Alleles = gt1.getAlleles();
|
||||||
|
List<Allele> site2Alleles = gt2.getAlleles();
|
||||||
|
|
||||||
|
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||||
|
for (Allele all1 : site1Alleles) {
|
||||||
|
Allele all2 = all2It.next();
|
||||||
|
|
||||||
|
Allele all1To2 = allele1ToAllele2.get(all1);
|
||||||
|
if (all1To2 == null)
|
||||||
|
allele1ToAllele2.put(all1, all2);
|
||||||
|
else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Allele all2To1 = allele2ToAllele1.get(all2);
|
||||||
|
if (all2To1 == null)
|
||||||
|
allele2ToAllele1.put(all2, all1);
|
||||||
|
else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
abstract static class AlleleMergeRule {
|
||||||
|
// vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
|
||||||
|
abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "all samples are mergeable";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class AlleleOneAndTwo {
|
||||||
|
private Allele all1;
|
||||||
|
private Allele all2;
|
||||||
|
|
||||||
|
public AlleleOneAndTwo(Allele all1, Allele all2) {
|
||||||
|
this.all1 = all1;
|
||||||
|
this.all2 = all2;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return all1.hashCode() + all2.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (!(other instanceof AlleleOneAndTwo))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
|
||||||
|
return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class MergedAllelesData {
|
||||||
|
private Map<AlleleOneAndTwo, Allele> mergedAlleles;
|
||||||
|
private byte[] intermediateBases;
|
||||||
|
private int intermediateLength;
|
||||||
|
|
||||||
|
public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
|
||||||
|
this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
|
||||||
|
this.intermediateBases = intermediateBases;
|
||||||
|
this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0;
|
||||||
|
|
||||||
|
this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Allele ensureMergedAllele(Allele all1, Allele all2) {
|
||||||
|
return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
|
||||||
|
}
|
||||||
|
|
||||||
|
private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
|
||||||
|
AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
|
||||||
|
Allele mergedAllele = mergedAlleles.get(all12);
|
||||||
|
|
||||||
|
if (mergedAllele == null) {
|
||||||
|
byte[] bases1 = all1.getBases();
|
||||||
|
byte[] bases2 = all2.getBases();
|
||||||
|
|
||||||
|
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
|
||||||
|
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
|
||||||
|
if (intermediateBases != null)
|
||||||
|
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
|
||||||
|
System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);
|
||||||
|
|
||||||
|
mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
|
||||||
|
mergedAlleles.put(all12, mergedAllele);
|
||||||
|
}
|
||||||
|
|
||||||
|
return mergedAllele;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<Allele> getAllMergedAlleles() {
|
||||||
|
return new HashSet<Allele>(mergedAlleles.values());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class PhaseAndQuality {
|
||||||
|
public boolean isPhased;
|
||||||
|
public Double PQ = null;
|
||||||
|
|
||||||
|
public PhaseAndQuality(Genotype gt) {
|
||||||
|
this.isPhased = gt.isPhased();
|
||||||
|
if (this.isPhased) {
|
||||||
|
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
|
||||||
|
if ( this.PQ == -1 ) this.PQ = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
/* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values
|
/* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values
|
||||||
with precision (prevents underflow by representing in log10 space).
|
with precision (prevents underflow by representing in log10 space).
|
||||||
*/
|
*/
|
||||||
public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
|
class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
|
||||||
private static final double EQUALS_THRESH = 1e-6;
|
private static final double EQUALS_THRESH = 1e-6;
|
||||||
private static final double INFINITY = Double.POSITIVE_INFINITY;
|
private static final double INFINITY = Double.POSITIVE_INFINITY;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,17 +34,13 @@ import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.DisjointSet;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -125,7 +121,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
public int MIN_MAPPING_QUALITY_SCORE = 20;
|
public int MIN_MAPPING_QUALITY_SCORE = 20;
|
||||||
|
|
||||||
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
|
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
|
||||||
protected List<String> samplesToPhase = null;
|
protected Set
|
||||||
|
<String> samplesToPhase = null;
|
||||||
|
|
||||||
private GenomeLoc mostDownstreamLocusReached = null;
|
private GenomeLoc mostDownstreamLocusReached = null;
|
||||||
|
|
||||||
|
|
@ -275,10 +272,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
|
|
||||||
private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList(PQ_KEY));
|
private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList(PQ_KEY));
|
||||||
|
|
||||||
private VariantContext reduceVCToSamples(VariantContext vc, List<String> samplesToPhase) {
|
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
|
||||||
// for ( String sample : samplesToPhase )
|
// for ( String sample : samplesToPhase )
|
||||||
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
|
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
|
||||||
VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values());
|
VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
|
||||||
// logger.debug("original VC = " + vc);
|
// logger.debug("original VC = " + vc);
|
||||||
// logger.debug("sub VC = " + subvc);
|
// logger.debug("sub VC = " + subvc);
|
||||||
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
|
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
|
||||||
|
|
@ -355,17 +352,16 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
UnfinishedVariantContext uvc = uvr.unfinishedVariant;
|
UnfinishedVariantContext uvc = uvr.unfinishedVariant;
|
||||||
|
|
||||||
// Perform per-sample phasing:
|
// Perform per-sample phasing:
|
||||||
Map<String, Genotype> sampGenotypes = vc.getGenotypes();
|
GenotypesContext sampGenotypes = vc.getGenotypes();
|
||||||
Map<String, PhaseCounts> samplePhaseStats = new TreeMap<String, PhaseCounts>();
|
Map<String, PhaseCounts> samplePhaseStats = new TreeMap<String, PhaseCounts>();
|
||||||
for (Map.Entry<String, Genotype> sampGtEntry : sampGenotypes.entrySet()) {
|
for (final Genotype gt : sampGenotypes) {
|
||||||
String samp = sampGtEntry.getKey();
|
String samp = gt.getSampleName();
|
||||||
Genotype gt = sampGtEntry.getValue();
|
|
||||||
|
|
||||||
if (DEBUG) logger.debug("sample = " + samp);
|
if (DEBUG) logger.debug("sample = " + samp);
|
||||||
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
||||||
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
||||||
// true <-> can trivially phase a hom site relative to ANY previous site:
|
// true <-> can trivially phase a hom site relative to ANY previous site:
|
||||||
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
||||||
uvc.setGenotype(samp, phasedGt);
|
uvc.setGenotype(samp, phasedGt);
|
||||||
}
|
}
|
||||||
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
|
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
|
||||||
|
|
@ -401,7 +397,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
|
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
|
||||||
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
|
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
|
||||||
gtAttribs.put(PQ_KEY, pr.phaseQuality);
|
gtAttribs.put(PQ_KEY, pr.phaseQuality);
|
||||||
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
|
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
|
||||||
uvc.setGenotype(samp, phasedGt);
|
uvc.setGenotype(samp, phasedGt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -421,7 +417,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
if (genotypesArePhased) {
|
if (genotypesArePhased) {
|
||||||
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes());
|
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes());
|
||||||
handledGtAttribs.put(PQ_KEY, pr.phaseQuality);
|
handledGtAttribs.put(PQ_KEY, pr.phaseQuality);
|
||||||
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
|
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
|
||||||
interiorUvc.setGenotype(samp, phasedHomGt);
|
interiorUvc.setGenotype(samp, phasedHomGt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1055,7 +1051,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
private void writeVCF(VariantContext vc) {
|
private void writeVCF(VariantContext vc) {
|
||||||
if (samplesToPhase == null || vc.isNotFiltered())
|
if (samplesToPhase == null || vc.isNotFiltered())
|
||||||
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
|
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
|
||||||
WriteVCF.writeVCF(vc, writer, logger);
|
writer.add(vc);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean processVariantInPhasing(VariantContext vc) {
|
public static boolean processVariantInPhasing(VariantContext vc) {
|
||||||
|
|
@ -1126,25 +1122,34 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
private int start;
|
private int start;
|
||||||
private int stop;
|
private int stop;
|
||||||
private Collection<Allele> alleles;
|
private Collection<Allele> alleles;
|
||||||
private Map<String, Genotype> genotypes;
|
private Map<String,Genotype> genotypes;
|
||||||
private double negLog10PError;
|
private double log10PError;
|
||||||
private Set<String> filters;
|
private Set<String> filters;
|
||||||
private Map<String, Object> attributes;
|
private Map<String, Object> attributes;
|
||||||
|
private String id;
|
||||||
|
|
||||||
public UnfinishedVariantContext(VariantContext vc) {
|
public UnfinishedVariantContext(VariantContext vc) {
|
||||||
this.name = vc.getSource();
|
this.name = vc.getSource();
|
||||||
|
this.id = vc.getID();
|
||||||
this.contig = vc.getChr();
|
this.contig = vc.getChr();
|
||||||
this.start = vc.getStart();
|
this.start = vc.getStart();
|
||||||
this.stop = vc.getEnd();
|
this.stop = vc.getEnd();
|
||||||
this.alleles = vc.getAlleles();
|
this.alleles = vc.getAlleles();
|
||||||
this.genotypes = new HashMap<String, Genotype>(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable
|
|
||||||
this.negLog10PError = vc.getNegLog10PError();
|
this.genotypes = new HashMap<String, Genotype>();
|
||||||
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
this.genotypes.put(g.getSampleName(), g);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.log10PError = vc.getLog10PError();
|
||||||
this.filters = vc.filtersWereApplied() ? vc.getFilters() : null;
|
this.filters = vc.filtersWereApplied() ? vc.getFilters() : null;
|
||||||
this.attributes = new HashMap<String, Object>(vc.getAttributes());
|
this.attributes = new HashMap<String, Object>(vc.getAttributes());
|
||||||
}
|
}
|
||||||
|
|
||||||
public VariantContext toVariantContext() {
|
public VariantContext toVariantContext() {
|
||||||
return new VariantContext(name, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
|
GenotypesContext gc = GenotypesContext.copy(this.genotypes.values());
|
||||||
|
return new VariantContextBuilder(name, contig, start, stop, alleles).id(id)
|
||||||
|
.genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenomeLoc getLocation() {
|
public GenomeLoc getLocation() {
|
||||||
|
|
@ -1156,7 +1161,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setGenotype(String sample, Genotype newGt) {
|
public void setGenotype(String sample, Genotype newGt) {
|
||||||
genotypes.put(sample, newGt);
|
this.genotypes.put(sample, newGt);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setPhasingInconsistent() {
|
public void setPhasingInconsistent() {
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@
|
||||||
*/
|
*/
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||||
|
|
||||||
public class ReadBase {
|
class ReadBase {
|
||||||
public String readName;
|
public String readName;
|
||||||
public byte base;
|
public byte base;
|
||||||
public int mappingQual;
|
public int mappingQual;
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
|
||||||
public class ReadBasesAtPosition implements Iterable<ReadBase> {
|
class ReadBasesAtPosition implements Iterable<ReadBase> {
|
||||||
// list of: <read name, base>
|
// list of: <read name, base>
|
||||||
private LinkedList<ReadBase> bases;
|
private LinkedList<ReadBase> bases;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,189 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
|
|
||||||
*/
|
|
||||||
public class RefSeqDataParser {
|
|
||||||
private static String REFSEQ_PREFIX = "refseq.";
|
|
||||||
|
|
||||||
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
|
|
||||||
private static String NAME_KEY = REFSEQ_PREFIX + "name";
|
|
||||||
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
|
|
||||||
|
|
||||||
private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY};
|
|
||||||
|
|
||||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
|
|
||||||
String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
|
|
||||||
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
|
|
||||||
|
|
||||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
|
||||||
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
|
|
||||||
if (numRecords != -1) {
|
|
||||||
boolean done = false;
|
|
||||||
|
|
||||||
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
|
|
||||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
|
||||||
if (name != null) {
|
|
||||||
entriesToNames.put(nameKeyToUse, name);
|
|
||||||
done = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!done) {
|
|
||||||
for (int i = 1; i <= numRecords; i++) {
|
|
||||||
String key = nameKeyToUseMultiplePrefix + i;
|
|
||||||
String name = vc.getAttributeAsString(key, null);
|
|
||||||
if (name != null)
|
|
||||||
entriesToNames.put(key, name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else { // no entry with the # of records:
|
|
||||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
|
||||||
if (name != null) {
|
|
||||||
entriesToNames.put(nameKeyToUse, name);
|
|
||||||
}
|
|
||||||
else { // Check all INFO fields for a match (if there are multiple entries):
|
|
||||||
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
|
|
||||||
String key = entry.getKey();
|
|
||||||
if (key.startsWith(nameKeyToUseMultiplePrefix))
|
|
||||||
entriesToNames.put(key, entry.getValue().toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return entriesToNames;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
|
|
||||||
return getRefSeqEntriesToNames(vc, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
|
|
||||||
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Set<String> getRefSeqNames(VariantContext vc) {
|
|
||||||
return getRefSeqNames(vc, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Map<String, Object> getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) {
|
|
||||||
Map<String, Object> refSeqNameAttribs = new HashMap<String, Object>();
|
|
||||||
|
|
||||||
Map<String, RefSeqEntry> entriesMap1 = getAllRefSeqEntriesByName(vc1);
|
|
||||||
Map<String, RefSeqEntry> entriesMap2 = getAllRefSeqEntriesByName(vc2);
|
|
||||||
|
|
||||||
Set<String> commonNames = entriesMap1.keySet();
|
|
||||||
commonNames.retainAll(entriesMap2.keySet());
|
|
||||||
boolean addSuffix = commonNames.size() > 1;
|
|
||||||
int nextCount = 1;
|
|
||||||
|
|
||||||
for (String name : commonNames) {
|
|
||||||
RefSeqEntry refseq1 = entriesMap1.get(name);
|
|
||||||
RefSeqEntry refseq2 = entriesMap2.get(name);
|
|
||||||
|
|
||||||
String keySuffix = "";
|
|
||||||
if (addSuffix)
|
|
||||||
keySuffix = "_" + nextCount;
|
|
||||||
|
|
||||||
boolean added = false;
|
|
||||||
for (String key : NAME_KEYS) {
|
|
||||||
Object obj1 = refseq1.info.get(key);
|
|
||||||
Object obj2 = refseq2.info.get(key);
|
|
||||||
if (obj1 != null && obj2 != null && obj1.equals(obj2)) {
|
|
||||||
added = true;
|
|
||||||
String useKey = key + keySuffix;
|
|
||||||
refSeqNameAttribs.put(useKey, obj1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (added)
|
|
||||||
nextCount++;
|
|
||||||
}
|
|
||||||
int totalCount = nextCount - 1; // since incremented count one extra time
|
|
||||||
if (totalCount > 1)
|
|
||||||
refSeqNameAttribs.put(NUM_RECORDS_KEY, totalCount);
|
|
||||||
|
|
||||||
return refSeqNameAttribs;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Map<String, Object> removeRefSeqAttributes(Map<String, Object> attributes) {
|
|
||||||
Map<String, Object> removedRefSeqAttributes = new HashMap<String, Object>(attributes);
|
|
||||||
|
|
||||||
Iterator<Map.Entry<String, Object>> attrIt = removedRefSeqAttributes.entrySet().iterator();
|
|
||||||
while (attrIt.hasNext()) {
|
|
||||||
String key = attrIt.next().getKey();
|
|
||||||
if (key.startsWith(REFSEQ_PREFIX))
|
|
||||||
attrIt.remove();
|
|
||||||
}
|
|
||||||
|
|
||||||
return removedRefSeqAttributes;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Map<String, RefSeqEntry> getAllRefSeqEntriesByName(VariantContext vc) {
|
|
||||||
Map<String, RefSeqEntry> nameToEntries = new TreeMap<String, RefSeqEntry>();
|
|
||||||
|
|
||||||
List<RefSeqEntry> allEntries = getAllRefSeqEntries(vc);
|
|
||||||
for (RefSeqEntry entry : allEntries) {
|
|
||||||
Object name = entry.info.get(NAME_KEY);
|
|
||||||
if (name != null)
|
|
||||||
nameToEntries.put(name.toString(), entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
return nameToEntries;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
|
|
||||||
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
|
|
||||||
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
|
|
||||||
|
|
||||||
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
|
|
||||||
String entry = entryToName.getKey();
|
|
||||||
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
|
|
||||||
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
|
|
||||||
}
|
|
||||||
|
|
||||||
return allRefSeq;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class RefSeqEntry {
|
|
||||||
public Map<String, Object> info;
|
|
||||||
|
|
||||||
public RefSeqEntry(VariantContext vc, String entrySuffix) {
|
|
||||||
this.info = new HashMap<String, Object>();
|
|
||||||
|
|
||||||
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
|
|
||||||
String key = attribEntry.getKey();
|
|
||||||
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
|
|
||||||
String genericKey = key.replaceAll(entrySuffix, "");
|
|
||||||
this.info.put(genericKey, attribEntry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
|
||||||
public class SNPallelePair extends AllelePair {
|
class SNPallelePair extends AllelePair {
|
||||||
|
|
||||||
public SNPallelePair(Genotype gt) {
|
public SNPallelePair(Genotype gt) {
|
||||||
super(gt);
|
super(gt);
|
||||||
|
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
public class WriteVCF {
|
|
||||||
public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) {
|
|
||||||
writer.add(vc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -39,8 +39,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
@ -466,9 +466,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
|
|
||||||
if (vcfWriter != null && writeVariant) {
|
if (vcfWriter != null && writeVariant) {
|
||||||
if (!vcComp.hasAttribute("callStatus")) {
|
if (!vcComp.hasAttribute("callStatus")) {
|
||||||
MutableVariantContext mvc = new MutableVariantContext(vcComp);
|
vcfWriter.add(new VariantContextBuilder(vcComp).attribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF").make());
|
||||||
mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
|
|
||||||
vcfWriter.add(mvc);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
vcfWriter.add(vcComp);
|
vcfWriter.add(vcComp);
|
||||||
|
|
|
||||||
|
|
@ -260,7 +260,7 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else /* (mask != null && validate == null ) */ {
|
} else /* (mask != null && validate == null ) */ {
|
||||||
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
|
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )) {
|
||||||
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
||||||
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
|
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
|
||||||
sequenceInvalid = true;
|
sequenceInvalid = true;
|
||||||
|
|
@ -281,7 +281,7 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
sequence.append('N');
|
sequence.append('N');
|
||||||
indelCounter--;
|
indelCounter--;
|
||||||
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||||
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){
|
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )){
|
||||||
logger.debug("SNP in mask found at " + ref.getLocus().toString());
|
logger.debug("SNP in mask found at " + ref.getLocus().toString());
|
||||||
|
|
||||||
if ( lowerCaseSNPs ) {
|
if ( lowerCaseSNPs ) {
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
@ -186,7 +187,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
||||||
* File containing tribble-readable features for the IntervalStratificiation
|
* File containing tribble-readable features for the IntervalStratificiation
|
||||||
*/
|
*/
|
||||||
@Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false)
|
@Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false)
|
||||||
protected IntervalBinding<Feature> intervalsFile = null;
|
public IntervalBinding<Feature> intervalsFile = null;
|
||||||
|
|
||||||
// Variables
|
// Variables
|
||||||
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
|
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
|
||||||
|
|
@ -330,9 +331,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
||||||
for ( VariantContext eval : evalSetBySample ) {
|
for ( VariantContext eval : evalSetBySample ) {
|
||||||
// deal with ancestral alleles if requested
|
// deal with ancestral alleles if requested
|
||||||
if ( eval != null && aastr != null ) {
|
if ( eval != null && aastr != null ) {
|
||||||
HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
|
eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
|
||||||
newAts.put("ANCESTRALALLELE", aastr);
|
|
||||||
eval = VariantContext.modifyAttributes(eval, newAts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// for each comp track
|
// for each comp track
|
||||||
|
|
|
||||||
|
|
@ -72,7 +72,7 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
boolean evalIsGood = eval != null && eval.isPolymorphic();
|
boolean evalIsGood = eval != null && eval.isPolymorphicInSamples();
|
||||||
boolean compIsGood = comp != null && comp.isNotFiltered();
|
boolean compIsGood = comp != null && comp.isNotFiltered();
|
||||||
|
|
||||||
if (evalIsGood) nEvalVariants++; // count the number of eval events
|
if (evalIsGood) nEvalVariants++; // count the number of eval events
|
||||||
|
|
|
||||||
|
|
@ -103,7 +103,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
||||||
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
|
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
|
||||||
// added in a proxy check for monomorphic status here.
|
// added in a proxy check for monomorphic status here.
|
||||||
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
|
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
|
||||||
if ( vc1.isMonomorphic() ) {
|
if ( vc1.isMonomorphicInSamples() ) {
|
||||||
nRefLoci++;
|
nRefLoci++;
|
||||||
} else {
|
} else {
|
||||||
switch (vc1.getType()) {
|
switch (vc1.getType()) {
|
||||||
|
|
@ -157,8 +157,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
||||||
// A C A
|
// A C A
|
||||||
// A C C
|
// A C C
|
||||||
|
|
||||||
for (Genotype g : vc1.getGenotypes().values()) {
|
for (final Genotype g : vc1.getGenotypes()) {
|
||||||
String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null;
|
final String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null;
|
||||||
|
|
||||||
switch (g.getType()) {
|
switch (g.getType()) {
|
||||||
case NO_CALL:
|
case NO_CALL:
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
||||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
@ -51,21 +50,21 @@ public class G1KPhaseITable extends VariantEvaluator {
|
||||||
@DataPoint(description = "Number of SNPs")
|
@DataPoint(description = "Number of SNPs")
|
||||||
public long nSNPs = 0;
|
public long nSNPs = 0;
|
||||||
@DataPoint(description = "SNP Novelty Rate")
|
@DataPoint(description = "SNP Novelty Rate")
|
||||||
public double SNPNoveltyRate = 0;
|
public String SNPNoveltyRate = "NA";
|
||||||
@DataPoint(description = "Mean number of SNPs per individual")
|
@DataPoint(description = "Mean number of SNPs per individual")
|
||||||
public long nSNPsPerSample = 0;
|
public long nSNPsPerSample = 0;
|
||||||
|
|
||||||
@DataPoint(description = "Number of Indels")
|
@DataPoint(description = "Number of Indels")
|
||||||
public long nIndels = 0;
|
public long nIndels = 0;
|
||||||
@DataPoint(description = "Indel Novelty Rate")
|
@DataPoint(description = "Indel Novelty Rate")
|
||||||
public double IndelNoveltyRate = 0;
|
public String IndelNoveltyRate = "NA";
|
||||||
@DataPoint(description = "Mean number of Indels per individual")
|
@DataPoint(description = "Mean number of Indels per individual")
|
||||||
public long nIndelsPerSample = 0;
|
public long nIndelsPerSample = 0;
|
||||||
|
|
||||||
@DataPoint(description = "Number of SVs")
|
@DataPoint(description = "Number of SVs")
|
||||||
public long nSVs = 0;
|
public long nSVs = 0;
|
||||||
@DataPoint(description = "SV Novelty Rate")
|
@DataPoint(description = "SV Novelty Rate")
|
||||||
public double SVNoveltyRate = 0;
|
public String SVNoveltyRate = "NA";
|
||||||
@DataPoint(description = "Mean number of SVs per individual")
|
@DataPoint(description = "Mean number of SVs per individual")
|
||||||
public long nSVsPerSample = 0;
|
public long nSVsPerSample = 0;
|
||||||
|
|
||||||
|
|
@ -103,12 +102,9 @@ public class G1KPhaseITable extends VariantEvaluator {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
if ( eval == null || eval.isMonomorphic() ) return null;
|
if ( eval == null || eval.isMonomorphicInSamples() ) return null;
|
||||||
|
|
||||||
switch (eval.getType()) {
|
switch (eval.getType()) {
|
||||||
// case NO_VARIATION:
|
|
||||||
// // shouldn't get here
|
|
||||||
// break;
|
|
||||||
case SNP:
|
case SNP:
|
||||||
case INDEL:
|
case INDEL:
|
||||||
case SYMBOLIC:
|
case SYMBOLIC:
|
||||||
|
|
@ -121,7 +117,7 @@ public class G1KPhaseITable extends VariantEvaluator {
|
||||||
}
|
}
|
||||||
|
|
||||||
// count variants per sample
|
// count variants per sample
|
||||||
for (final Genotype g : eval.getGenotypes().values()) {
|
for (final Genotype g : eval.getGenotypes()) {
|
||||||
if ( ! g.isNoCall() && ! g.isHomRef() ) {
|
if ( ! g.isNoCall() && ! g.isHomRef() ) {
|
||||||
int count = countsPerSample.get(g.getSampleName()).get(eval.getType());
|
int count = countsPerSample.get(g.getSampleName()).get(eval.getType());
|
||||||
countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1);
|
countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1);
|
||||||
|
|
@ -139,11 +135,12 @@ public class G1KPhaseITable extends VariantEvaluator {
|
||||||
return (int)(Math.round(sum / (1.0 * countsPerSample.size())));
|
return (int)(Math.round(sum / (1.0 * countsPerSample.size())));
|
||||||
}
|
}
|
||||||
|
|
||||||
private final double noveltyRate(VariantContext.Type type) {
|
private final String noveltyRate(VariantContext.Type type) {
|
||||||
int all = allVariantCounts.get(type);
|
int all = allVariantCounts.get(type);
|
||||||
int known = knownVariantCounts.get(type);
|
int known = knownVariantCounts.get(type);
|
||||||
int novel = all - known;
|
int novel = all - known;
|
||||||
return (novel / (1.0 * all));
|
double rate = (novel / (1.0 * all));
|
||||||
|
return all == 0 ? "NA" : String.format("%.2f", rate);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void finalizeEvaluation() {
|
public void finalizeEvaluation() {
|
||||||
|
|
|
||||||
|
|
@ -209,7 +209,7 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
|
|
||||||
//public GenotypeConcordance(VariantEvalWalker parent) {
|
//public GenotypeConcordance(VariantEvalWalker parent) {
|
||||||
// super(parent);
|
// super(parent);
|
||||||
// discordantInteresting = parent.DISCORDANT_INTERESTING;
|
// discordantInteresting = parent.DISCORDANT_INTERESTING;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
public String getName() {
|
public String getName() {
|
||||||
|
|
@ -277,8 +277,9 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
|
|
||||||
// determine concordance for eval data
|
// determine concordance for eval data
|
||||||
if (eval != null) {
|
if (eval != null) {
|
||||||
for (final String sample : eval.getGenotypes().keySet()) {
|
for (final Genotype g : eval.getGenotypes() ) {
|
||||||
final Genotype.Type called = eval.getGenotype(sample).getType();
|
final String sample = g.getSampleName();
|
||||||
|
final Genotype.Type called = g.getType();
|
||||||
final Genotype.Type truth;
|
final Genotype.Type truth;
|
||||||
|
|
||||||
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
|
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
|
||||||
|
|
@ -299,9 +300,9 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
else {
|
else {
|
||||||
final Genotype.Type called = Genotype.Type.NO_CALL;
|
final Genotype.Type called = Genotype.Type.NO_CALL;
|
||||||
|
|
||||||
for (final String sample : validation.getGenotypes().keySet()) {
|
for (final Genotype g : validation.getGenotypes()) {
|
||||||
final Genotype.Type truth = validation.getGenotype(sample).getType();
|
final Genotype.Type truth = g.getType();
|
||||||
detailedStats.incrValue(sample, truth, called);
|
detailedStats.incrValue(g.getSampleName(), truth, called);
|
||||||
|
|
||||||
// print out interesting sites
|
// print out interesting sites
|
||||||
/*
|
/*
|
||||||
|
|
@ -410,8 +411,8 @@ class SampleStats implements TableType {
|
||||||
|
|
||||||
public SampleStats(VariantContext vc, int nGenotypeTypes) {
|
public SampleStats(VariantContext vc, int nGenotypeTypes) {
|
||||||
this.nGenotypeTypes = nGenotypeTypes;
|
this.nGenotypeTypes = nGenotypeTypes;
|
||||||
for (String sample : vc.getGenotypes().keySet())
|
for (final Genotype g : vc.getGenotypes())
|
||||||
concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]);
|
concordanceStats.put(g.getSampleName(), new long[nGenotypeTypes][nGenotypeTypes]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SampleStats(int genotypeTypes) {
|
public SampleStats(int genotypeTypes) {
|
||||||
|
|
@ -511,8 +512,8 @@ class SampleSummaryStats implements TableType {
|
||||||
|
|
||||||
public SampleSummaryStats(final VariantContext vc) {
|
public SampleSummaryStats(final VariantContext vc) {
|
||||||
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
|
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
|
||||||
for( final String sample : vc.getGenotypes().keySet() ) {
|
for( final Genotype g : vc.getGenotypes() ) {
|
||||||
concordanceSummary.put(sample, new double[COLUMN_KEYS.length]);
|
concordanceSummary.put(g.getSampleName(), new double[COLUMN_KEYS.length]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
@ -91,13 +92,13 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
||||||
|
|
||||||
Set<String> allSamples = new HashSet<String>();
|
Set<String> allSamples = new HashSet<String>();
|
||||||
|
|
||||||
Map<String, Genotype> compSampGenotypes = null;
|
GenotypesContext compSampGenotypes = null;
|
||||||
if (isRelevantToPhasing(comp)) {
|
if (isRelevantToPhasing(comp)) {
|
||||||
allSamples.addAll(comp.getSampleNames());
|
allSamples.addAll(comp.getSampleNames());
|
||||||
compSampGenotypes = comp.getGenotypes();
|
compSampGenotypes = comp.getGenotypes();
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Genotype> evalSampGenotypes = null;
|
GenotypesContext evalSampGenotypes = null;
|
||||||
if (isRelevantToPhasing(eval)) {
|
if (isRelevantToPhasing(eval)) {
|
||||||
allSamples.addAll(eval.getSampleNames());
|
allSamples.addAll(eval.getSampleNames());
|
||||||
evalSampGenotypes = eval.getGenotypes();
|
evalSampGenotypes = eval.getGenotypes();
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,7 @@ public class IndelLengthHistogram extends VariantEvaluator {
|
||||||
|
|
||||||
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
if ( vc1.isIndel() && vc1.isPolymorphic() ) {
|
if ( vc1.isIndel() && vc1.isPolymorphicInSamples() ) {
|
||||||
|
|
||||||
if ( ! vc1.isBiallelic() ) {
|
if ( ! vc1.isBiallelic() ) {
|
||||||
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
//veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored.");
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,9 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
||||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
||||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
|
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
|
||||||
import org.broadinstitute.sting.utils.IndelUtils;
|
import org.broadinstitute.sting.utils.IndelUtils;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
|
@ -270,7 +268,7 @@ public class IndelStatistics extends VariantEvaluator {
|
||||||
|
|
||||||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
if (eval != null && eval.isPolymorphic()) {
|
if (eval != null && eval.isPolymorphicInSamples()) {
|
||||||
if ( indelStats == null ) {
|
if ( indelStats == null ) {
|
||||||
indelStats = new IndelStats(eval);
|
indelStats = new IndelStats(eval);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -147,7 +147,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean includeGenotype(Genotype g) {
|
private boolean includeGenotype(Genotype g) {
|
||||||
return g.getNegLog10PError() > getQThreshold() && g.isCalled();
|
return g.getLog10PError() > getQThreshold() && g.isCalled();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) {
|
public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) {
|
||||||
|
|
|
||||||
|
|
@ -118,7 +118,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
||||||
int ac = -1;
|
int ac = -1;
|
||||||
|
|
||||||
if ( eval.hasGenotypes() )
|
if ( eval.hasGenotypes() )
|
||||||
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
|
ac = eval.getCalledChrCount(eval.getAlternateAllele(0));
|
||||||
else if ( eval.hasAttribute("AC") ) {
|
else if ( eval.hasAttribute("AC") ) {
|
||||||
ac = eval.getAttributeAsInt("AC", -1);
|
ac = eval.getAttributeAsInt("AC", -1);
|
||||||
}
|
}
|
||||||
|
|
@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) {
|
if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() && metrics != null ) {
|
||||||
metrics.incrValue(eval);
|
metrics.incrValue(eval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) {
|
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphicInSamples()) {
|
||||||
return null; //no interesting sites
|
return null; //no interesting sites
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
|
||||||
float numGenosHere = 0;
|
float numGenosHere = 0;
|
||||||
int numIndsHere = 0;
|
int numIndsHere = 0;
|
||||||
|
|
||||||
for (Genotype genotype : vc.getGenotypes().values()) {
|
for (final Genotype genotype : vc.getGenotypes()) {
|
||||||
numIndsHere++;
|
numIndsHere++;
|
||||||
if (!genotype.isNoCall()) {
|
if (!genotype.isNoCall()) {
|
||||||
//increment stats for heterozygosity
|
//increment stats for heterozygosity
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateTiTv(VariantContext vc, boolean updateStandard) {
|
public void updateTiTv(VariantContext vc, boolean updateStandard) {
|
||||||
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) {
|
if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphicInSamples()) {
|
||||||
if (VariantContextUtils.isTransition(vc)) {
|
if (VariantContextUtils.isTransition(vc)) {
|
||||||
if (updateStandard) nTiInComp++;
|
if (updateStandard) nTiInComp++;
|
||||||
else nTi++;
|
else nTi++;
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Broad Institute
|
* The Broad Institute
|
||||||
|
|
@ -118,8 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
|
||||||
public SiteStatus calcSiteStatus(VariantContext vc) {
|
public SiteStatus calcSiteStatus(VariantContext vc) {
|
||||||
if ( vc == null ) return SiteStatus.NO_CALL;
|
if ( vc == null ) return SiteStatus.NO_CALL;
|
||||||
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
|
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
|
||||||
if ( vc.isMonomorphic() ) return SiteStatus.MONO;
|
if ( vc.isMonomorphicInSamples() ) return SiteStatus.MONO;
|
||||||
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes
|
if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphicInSamples was false and there are genotypes
|
||||||
|
|
||||||
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
|
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||||
int ac = 0;
|
int ac = 0;
|
||||||
|
|
|
||||||
|
|
@ -232,14 +232,14 @@ public class VariantQualityScore extends VariantEvaluator {
|
||||||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
final String interesting = null;
|
final String interesting = null;
|
||||||
|
|
||||||
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||||
if( titvStats == null ) { titvStats = new TiTvStats(); }
|
if( titvStats == null ) { titvStats = new TiTvStats(); }
|
||||||
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
|
titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
|
||||||
|
|
||||||
if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
|
if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
|
||||||
int alternateAlleleCount = 0;
|
int alternateAlleleCount = 0;
|
||||||
for (final Allele a : eval.getAlternateAlleles()) {
|
for (final Allele a : eval.getAlternateAlleles()) {
|
||||||
alternateAlleleCount += eval.getChromosomeCount(a);
|
alternateAlleleCount += eval.getCalledChrCount(a);
|
||||||
}
|
}
|
||||||
alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
|
alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue