Merge branch 'master' of ssh://copper.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
Conflicts: public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
This commit is contained in:
commit
b1dd632d5d
2
ivy.xml
2
ivy.xml
|
|
@ -76,7 +76,7 @@
|
|||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
||||
|
||||
<!-- snpEff annotator for pipelines -->
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.2" />
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
|
||||
|
||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||
<exclude org="javax.servlet" />
|
||||
|
|
|
|||
|
|
@ -0,0 +1,762 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.*;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Internal class for reading and querying BAM files.
|
||||
*/
|
||||
class BAMFileReader extends SAMFileReader.ReaderImplementation {
|
||||
// True if reading from a File rather than an InputStream
|
||||
private boolean mIsSeekable = false;
|
||||
|
||||
// For converting bytes into other primitive types
|
||||
private BinaryCodec mStream = null;
|
||||
|
||||
// Underlying compressed data stream.
|
||||
private final BAMInputStream mInputStream;
|
||||
private SAMFileHeader mFileHeader = null;
|
||||
|
||||
// Populated if the file is seekable and an index exists
|
||||
private File mIndexFile;
|
||||
private BAMIndex mIndex = null;
|
||||
private long mFirstRecordPointer = 0;
|
||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||
|
||||
// If true, all SAMRecords are fully decoded as they are read.
|
||||
private final boolean eagerDecode;
|
||||
|
||||
// For error-checking.
|
||||
private ValidationStringency mValidationStringency;
|
||||
|
||||
// For creating BAMRecords
|
||||
private SAMRecordFactory samRecordFactory;
|
||||
|
||||
/**
|
||||
* Use the caching index reader implementation rather than the disk-hit-per-file model.
|
||||
*/
|
||||
private boolean mEnableIndexCaching = false;
|
||||
|
||||
/**
|
||||
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
|
||||
*/
|
||||
private boolean mEnableIndexMemoryMapping = true;
|
||||
|
||||
/**
|
||||
* Add information about the origin (reader and position) to SAM records.
|
||||
*/
|
||||
private SAMFileReader mFileReader = null;
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a stream (not seekable)
|
||||
* @param stream source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
||||
*/
|
||||
BAMFileReader(final InputStream stream,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
mIndexFile = indexFile;
|
||||
mIsSeekable = false;
|
||||
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
|
||||
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
this.samRecordFactory = factory;
|
||||
readHeader(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a file (seekable)
|
||||
* @param file source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||
* @param validationStringency Controls how to handle invalidate reads or header lines.
|
||||
*/
|
||||
BAMFileReader(final File file,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
|
||||
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
|
||||
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
|
||||
" is older than BAM " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
BAMFileReader(final SeekableStream strm,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
|
||||
indexFile,
|
||||
eagerDecode,
|
||||
strm.getSource(),
|
||||
validationStringency,
|
||||
factory);
|
||||
}
|
||||
|
||||
private BAMFileReader(final BAMInputStream inputStream,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final String source,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
mIndexFile = indexFile;
|
||||
mIsSeekable = true;
|
||||
mInputStream = inputStream;
|
||||
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
this.samRecordFactory = factory;
|
||||
readHeader(source);
|
||||
mFirstRecordPointer = inputStream.getFilePointer();
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, writes the source of every read into the source SAMRecords.
|
||||
* @param enabled true to write source information into each SAMRecord.
|
||||
*/
|
||||
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
|
||||
this.mFileReader = enabled ? reader : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, uses the caching version of the index reader.
|
||||
* @param enabled true to write source information into each SAMRecord.
|
||||
*/
|
||||
public void enableIndexCaching(final boolean enabled) {
|
||||
if(mIndex != null)
|
||||
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
|
||||
this.mEnableIndexCaching = enabled;
|
||||
}
|
||||
|
||||
/**
|
||||
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
|
||||
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
|
||||
* @param enabled True to use memory mapping, false to use regular I/O.
|
||||
*/
|
||||
public void enableIndexMemoryMapping(final boolean enabled) {
|
||||
if (mIndex != null) {
|
||||
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
|
||||
}
|
||||
this.mEnableIndexMemoryMapping = enabled;
|
||||
}
|
||||
|
||||
@Override void enableCrcChecking(final boolean enabled) {
|
||||
this.mInputStream.setCheckCrcs(enabled);
|
||||
}
|
||||
|
||||
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
|
||||
|
||||
/**
|
||||
* @return true if ths is a BAM file, and has an index
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
return (mIndexFile != null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
|
||||
* @return An index of the given type.
|
||||
*/
|
||||
public BAMIndex getIndex() {
|
||||
if(mIndexFile == null)
|
||||
throw new SAMException("No index is available for this BAM file.");
|
||||
if(mIndex == null)
|
||||
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
|
||||
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
|
||||
return mIndex;
|
||||
}
|
||||
|
||||
void close() {
|
||||
if (mStream != null) {
|
||||
mStream.close();
|
||||
}
|
||||
if (mIndex != null) {
|
||||
mIndex.close();
|
||||
}
|
||||
mStream = null;
|
||||
mFileHeader = null;
|
||||
mIndex = null;
|
||||
}
|
||||
|
||||
SAMFileHeader getFileHeader() {
|
||||
return mFileHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set error-checking level for subsequent SAMRecord reads.
|
||||
*/
|
||||
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
||||
this.mValidationStringency = validationStringency;
|
||||
}
|
||||
|
||||
SAMFileReader.ValidationStringency getValidationStringency() {
|
||||
return this.mValidationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords in file order.
|
||||
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
||||
* that iterator must be closed before getIterator() can be called again.
|
||||
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
||||
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
||||
* done in that situation.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> getIterator() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (mIsSeekable) {
|
||||
try {
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
mCurrentIterator = new BAMFileIterator();
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!(chunks instanceof BAMFileSpan)) {
|
||||
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
|
||||
}
|
||||
|
||||
// Create an iterator over the given chunk boundaries.
|
||||
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
|
||||
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
|
||||
* the rightmost bound is guaranteed to be after the last read in the file.
|
||||
* @return An unbounded pointer to the first record in the BAM file.
|
||||
*/
|
||||
@Override
|
||||
SAMFileSpan getFilePointerSpanningReads() {
|
||||
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords that match the given interval.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
||||
* A value of zero implies the start of the reference sequence.
|
||||
* @param end A value of zero implies the end of the reference sequence.
|
||||
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
||||
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
||||
* @return Iterator for the matching SAMRecords
|
||||
*/
|
||||
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords with the given alignment start.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Alignment start sought.
|
||||
* @return Iterator for the matching SAMRecords.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
try {
|
||||
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
|
||||
if (startOfLastLinearBin != -1) {
|
||||
mInputStream.seek(startOfLastLinearBin);
|
||||
} else {
|
||||
// No mapped reads in file, just start at the first read in file.
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
}
|
||||
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
||||
return mCurrentIterator;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the header from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private void readHeader(final String source)
|
||||
throws IOException {
|
||||
|
||||
final byte[] buffer = new byte[4];
|
||||
mStream.readBytes(buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||
throw new IOException("Invalid BAM file header");
|
||||
}
|
||||
|
||||
final int headerTextLength = mStream.readInt();
|
||||
final String textHeader = mStream.readString(headerTextLength);
|
||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||
headerCodec.setValidationStringency(mValidationStringency);
|
||||
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
||||
source);
|
||||
|
||||
final int sequenceCount = mStream.readInt();
|
||||
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
||||
throw new SAMFormatException("Number of sequences in text header (" +
|
||||
mFileHeader.getSequenceDictionary().size() +
|
||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
||||
}
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
||||
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||
source);
|
||||
}
|
||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||
source);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If only binary sequences are present, copy them into mFileHeader
|
||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
sequences.add(readSequenceRecord(source));
|
||||
}
|
||||
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single binary sequence record from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private SAMSequenceRecord readSequenceRecord(final String source) {
|
||||
final int nameLength = mStream.readInt();
|
||||
if (nameLength <= 1) {
|
||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
||||
}
|
||||
final String sequenceName = mStream.readString(nameLength - 1);
|
||||
// Skip the null terminator
|
||||
mStream.readByte();
|
||||
final int sequenceLength = mStream.readInt();
|
||||
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
||||
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
||||
*/
|
||||
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
||||
private SAMRecord mNextRecord = null;
|
||||
private final BAMRecordCodec bamRecordCodec;
|
||||
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
||||
|
||||
BAMFileIterator() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param advance Trick to enable subclass to do more setup before advancing
|
||||
*/
|
||||
BAMFileIterator(final boolean advance) {
|
||||
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
|
||||
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
|
||||
|
||||
if (advance) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (mCurrentIterator != null && this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
final SAMRecord result = mNextRecord;
|
||||
advance();
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
void advance() {
|
||||
try {
|
||||
mNextRecord = getNextRecord();
|
||||
|
||||
if (mNextRecord != null) {
|
||||
++this.samRecordIndex;
|
||||
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
||||
mNextRecord.setValidationStringency(mValidationStringency);
|
||||
|
||||
if (mValidationStringency != ValidationStringency.SILENT) {
|
||||
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
||||
SAMUtils.processValidationErrors(validationErrors,
|
||||
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
|
||||
}
|
||||
}
|
||||
if (eagerDecode && mNextRecord != null) {
|
||||
mNextRecord.eagerDecode();
|
||||
}
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next record from the input stream.
|
||||
*/
|
||||
SAMRecord getNextRecord() throws IOException {
|
||||
final long startCoordinate = mInputStream.getFilePointer();
|
||||
final SAMRecord next = bamRecordCodec.decode();
|
||||
final long stopCoordinate = mInputStream.getFilePointer();
|
||||
|
||||
if(mFileReader != null && next != null)
|
||||
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The record that will be return by the next call to next()
|
||||
*/
|
||||
protected SAMRecord peek() {
|
||||
return mNextRecord;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords matching the target interval.
|
||||
* @param sequence Desired reference sequence.
|
||||
* @param start 1-based start of target interval, inclusive.
|
||||
* @param end 1-based end of target interval, inclusive.
|
||||
* @param queryType contained, overlapping, or starting-at query.
|
||||
*/
|
||||
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
||||
final int start,
|
||||
final int end,
|
||||
final QueryType queryType) {
|
||||
long[] filePointers = null;
|
||||
|
||||
// Hit the index to determine the chunk boundaries for the required data.
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMIndex fileIndex = getIndex();
|
||||
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
|
||||
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
|
||||
}
|
||||
|
||||
// Create an iterator over the above chunk boundaries.
|
||||
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
||||
|
||||
// Add some preprocessing filters for edge-case reads that don't fit into this
|
||||
// query type.
|
||||
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
||||
}
|
||||
|
||||
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
|
||||
|
||||
/**
|
||||
* Look for BAM index file according to standard naming convention.
|
||||
*
|
||||
* @param dataFile BAM file name.
|
||||
* @return Index file name, or null if not found.
|
||||
*/
|
||||
private static File findIndexFile(final File dataFile) {
|
||||
// If input is foo.bam, look for foo.bai
|
||||
final String bamExtension = ".bam";
|
||||
File indexFile;
|
||||
final String fileName = dataFile.getName();
|
||||
if (fileName.endsWith(bamExtension)) {
|
||||
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
|
||||
indexFile = new File(dataFile.getParent(), bai);
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
}
|
||||
}
|
||||
|
||||
// If foo.bai doesn't exist look for foo.bam.bai
|
||||
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexIterator extends BAMFileIterator {
|
||||
|
||||
private long[] mFilePointers = null;
|
||||
private int mFilePointerIndex = 0;
|
||||
private long mFilePointerLimit = -1;
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
|
||||
* @param filePointers the block / offset combination, stored in chunk format.
|
||||
*/
|
||||
BAMFileIndexIterator(final long[] filePointers) {
|
||||
super(false); // delay advance() until after construction
|
||||
mFilePointers = filePointers;
|
||||
advance();
|
||||
}
|
||||
|
||||
SAMRecord getNextRecord()
|
||||
throws IOException {
|
||||
// Advance to next file block if necessary
|
||||
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||
if (mFilePointers == null ||
|
||||
mFilePointerIndex >= mFilePointers.length) {
|
||||
return null;
|
||||
}
|
||||
final long startOffset = mFilePointers[mFilePointerIndex++];
|
||||
final long endOffset = mFilePointers[mFilePointerIndex++];
|
||||
mInputStream.seek(startOffset);
|
||||
mFilePointerLimit = endOffset;
|
||||
}
|
||||
// Pull next record from stream
|
||||
return super.getNextRecord();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A decorating iterator that filters out records that are outside the bounds of the
|
||||
* given query parameters.
|
||||
*/
|
||||
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
* The wrapped iterator.
|
||||
*/
|
||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The next record to be returned. Will be null if no such record exists.
|
||||
*/
|
||||
private SAMRecord mNextRecord;
|
||||
|
||||
private final int mReferenceIndex;
|
||||
private final int mRegionStart;
|
||||
private final int mRegionEnd;
|
||||
private final QueryType mQueryType;
|
||||
|
||||
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
||||
this.wrappedIterator = iterator;
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
mRegionStart = start;
|
||||
if (queryType == QueryType.STARTING_AT) {
|
||||
mRegionEnd = mRegionStart;
|
||||
} else {
|
||||
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
||||
}
|
||||
mQueryType = queryType;
|
||||
mNextRecord = advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a next element exists; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return mNextRecord != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the given iterator.
|
||||
* @return The next SAM record in the iterator.
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
||||
final SAMRecord currentRead = mNextRecord;
|
||||
mNextRecord = advance();
|
||||
return currentRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes down the existing iterator.
|
||||
*/
|
||||
public void close() {
|
||||
if (this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
SAMRecord advance() {
|
||||
while (true) {
|
||||
// Pull next record from stream
|
||||
if(!wrappedIterator.hasNext())
|
||||
return null;
|
||||
|
||||
final SAMRecord record = wrappedIterator.next();
|
||||
// If beyond the end of this reference sequence, end iteration
|
||||
final int referenceIndex = record.getReferenceIndex();
|
||||
if (referenceIndex != mReferenceIndex) {
|
||||
if (referenceIndex < 0 ||
|
||||
referenceIndex > mReferenceIndex) {
|
||||
return null;
|
||||
}
|
||||
// If before this reference sequence, continue
|
||||
continue;
|
||||
}
|
||||
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
||||
// Quick exit to avoid expensive alignment end calculation
|
||||
return record;
|
||||
}
|
||||
final int alignmentStart = record.getAlignmentStart();
|
||||
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
||||
// the query region, regardless of whether the mapped mate will be returned.
|
||||
final int alignmentEnd;
|
||||
if (mQueryType == QueryType.STARTING_AT) {
|
||||
alignmentEnd = -1;
|
||||
} else {
|
||||
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
||||
record.getAlignmentEnd(): alignmentStart);
|
||||
}
|
||||
|
||||
if (alignmentStart > mRegionEnd) {
|
||||
// If scanned beyond target region, end iteration
|
||||
return null;
|
||||
}
|
||||
// Filter for overlap with region
|
||||
if (mQueryType == QueryType.CONTAINED) {
|
||||
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else if (mQueryType == QueryType.OVERLAPPING) {
|
||||
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else {
|
||||
if (alignmentStart == mRegionStart) {
|
||||
return record;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
||||
private BAMFileIndexUnmappedIterator() {
|
||||
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,6 +25,7 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GATKBAMFileSpan from an existing BAMFileSpan.
|
||||
* @param sourceFileSpan
|
||||
*/
|
||||
public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
|
||||
if(!(sourceFileSpan instanceof BAMFileSpan))
|
||||
throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
|
||||
BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
|
||||
for(Chunk chunk: sourceBAMFileSpan.getChunks())
|
||||
add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience constructor to construct a BAM file span from
|
||||
* a single chunk.
|
||||
|
|
|
|||
|
|
@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
|
|||
super.setChunkEnd(value);
|
||||
}
|
||||
|
||||
public long getBlockStart() {
|
||||
return getChunkStart() >>> 16;
|
||||
}
|
||||
|
||||
public int getBlockOffsetStart() {
|
||||
return (int)(getChunkStart() & 0xFFFF);
|
||||
}
|
||||
|
||||
public long getBlockEnd() {
|
||||
return getChunkEnd() >>> 16;
|
||||
}
|
||||
|
||||
public int getBlockOffsetEnd() {
|
||||
return ((int)getChunkEnd() & 0xFFFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes an approximation of the uncompressed size of the
|
||||
* chunk, in bytes. Can be used to determine relative weights
|
||||
|
|
|
|||
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package net.sf.samtools.util;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* An input stream formulated for use reading BAM files. Supports
|
||||
*/
|
||||
public interface BAMInputStream {
|
||||
/**
|
||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||
* not an actual byte offset.
|
||||
*
|
||||
* @param pos virtual file pointer
|
||||
*/
|
||||
public void seek(final long pos) throws IOException;
|
||||
|
||||
/**
|
||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||
* the two.
|
||||
*/
|
||||
public long getFilePointer();
|
||||
|
||||
/**
|
||||
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
|
||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||
* operation and should be used accordingly.
|
||||
*/
|
||||
public void setCheckCrcs(final boolean check);
|
||||
|
||||
public int read() throws java.io.IOException;
|
||||
|
||||
public int read(byte[] bytes) throws java.io.IOException;
|
||||
|
||||
public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
|
||||
|
||||
public long skip(long l) throws java.io.IOException;
|
||||
|
||||
public int available() throws java.io.IOException;
|
||||
|
||||
public void close() throws java.io.IOException;
|
||||
|
||||
public void mark(int i);
|
||||
|
||||
public void reset() throws java.io.IOException;
|
||||
|
||||
public boolean markSupported();
|
||||
}
|
||||
|
|
@ -0,0 +1,483 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools.util;
|
||||
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.net.URL;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
|
||||
import net.sf.samtools.FileTruncatedException;
|
||||
|
||||
/*
|
||||
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
|
||||
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
|
||||
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
|
||||
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
|
||||
*
|
||||
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
|
||||
*/
|
||||
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
|
||||
private InputStream mStream = null;
|
||||
private SeekableStream mFile = null;
|
||||
private byte[] mFileBuffer = null;
|
||||
private byte[] mCurrentBlock = null;
|
||||
private int mCurrentOffset = 0;
|
||||
private long mBlockAddress = 0;
|
||||
private int mLastBlockLength = 0;
|
||||
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
|
||||
|
||||
|
||||
/**
|
||||
* Note that seek() is not supported if this ctor is used.
|
||||
*/
|
||||
public BlockCompressedInputStream(final InputStream stream) {
|
||||
mStream = IOUtil.toBufferedStream(stream);
|
||||
mFile = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this ctor if you wish to call seek()
|
||||
*/
|
||||
public BlockCompressedInputStream(final File file)
|
||||
throws IOException {
|
||||
mFile = new SeekableFileStream(file);
|
||||
mStream = null;
|
||||
|
||||
}
|
||||
|
||||
public BlockCompressedInputStream(final URL url) {
|
||||
mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
|
||||
mStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* For providing some arbitrary data source. No additional buffering is
|
||||
* provided, so if the underlying source is not buffered, wrap it in a
|
||||
* SeekableBufferedStream before passing to this ctor.
|
||||
*/
|
||||
public BlockCompressedInputStream(final SeekableStream strm) {
|
||||
mFile = strm;
|
||||
mStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
|
||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||
* operation and should be used accordingly.
|
||||
*/
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
this.blockGunzipper.setCheckCrcs(check);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
|
||||
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
|
||||
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
|
||||
* may block in order to fill an internal buffer if it has been exhausted.
|
||||
*/
|
||||
public int available()
|
||||
throws IOException {
|
||||
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
|
||||
readBlock();
|
||||
}
|
||||
if (mCurrentBlock == null) {
|
||||
return 0;
|
||||
}
|
||||
return mCurrentBlock.length - mCurrentOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the underlying InputStream or RandomAccessFile
|
||||
*/
|
||||
public void close()
|
||||
throws IOException {
|
||||
if (mFile != null) {
|
||||
mFile.close();
|
||||
mFile = null;
|
||||
} else if (mStream != null) {
|
||||
mStream.close();
|
||||
mStream = null;
|
||||
}
|
||||
// Encourage garbage collection
|
||||
mFileBuffer = null;
|
||||
mCurrentBlock = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
|
||||
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
|
||||
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
|
||||
|
||||
* @return the next byte of data, or -1 if the end of the stream is reached.
|
||||
*/
|
||||
public int read()
|
||||
throws IOException {
|
||||
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
|
||||
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
|
||||
* or an exception is thrown.
|
||||
*
|
||||
* read(buf) has the same effect as read(buf, 0, buf.length).
|
||||
*
|
||||
* @param buffer the buffer into which the data is read.
|
||||
* @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer)
|
||||
throws IOException {
|
||||
return read(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
private volatile ByteArrayOutputStream buf = null;
|
||||
private static final byte eol = '\n';
|
||||
private static final byte eolCr = '\r';
|
||||
|
||||
/**
|
||||
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
|
||||
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
|
||||
*
|
||||
* @return A String containing the contents of the line, excluding the line terminating
|
||||
* character, or null if the end of the stream has been reached
|
||||
*
|
||||
* @exception IOException If an I/O error occurs
|
||||
* @
|
||||
*/
|
||||
public String readLine() throws IOException {
|
||||
int available = available();
|
||||
if (available == 0) {
|
||||
return null;
|
||||
}
|
||||
if(null == buf){ // lazy initialisation
|
||||
buf = new ByteArrayOutputStream(8192);
|
||||
}
|
||||
buf.reset();
|
||||
boolean done = false;
|
||||
boolean foundCr = false; // \r found flag
|
||||
while (!done) {
|
||||
int linetmpPos = mCurrentOffset;
|
||||
int bCnt = 0;
|
||||
while((available-- > 0)){
|
||||
final byte c = mCurrentBlock[linetmpPos++];
|
||||
if(c == eol){ // found \n
|
||||
done = true;
|
||||
break;
|
||||
} else if(foundCr){ // previous char was \r
|
||||
--linetmpPos; // current char is not \n so put it back
|
||||
done = true;
|
||||
break;
|
||||
} else if(c == eolCr){ // found \r
|
||||
foundCr = true;
|
||||
continue; // no ++bCnt
|
||||
}
|
||||
++bCnt;
|
||||
}
|
||||
if(mCurrentOffset < linetmpPos){
|
||||
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
|
||||
mCurrentOffset = linetmpPos;
|
||||
}
|
||||
available = available();
|
||||
if(available == 0){
|
||||
// EOF
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
|
||||
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
|
||||
*
|
||||
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
|
||||
*
|
||||
* @param buffer buffer into which data is read.
|
||||
* @param offset the start offset in array b at which the data is written.
|
||||
* @param length the maximum number of bytes to read.
|
||||
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer, int offset, int length)
|
||||
throws IOException {
|
||||
final int originalLength = length;
|
||||
while (length > 0) {
|
||||
final int available = available();
|
||||
if (available == 0) {
|
||||
// Signal EOF to caller
|
||||
if (originalLength == length) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
final int copyLength = Math.min(length, available);
|
||||
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
|
||||
mCurrentOffset += copyLength;
|
||||
offset += copyLength;
|
||||
length -= copyLength;
|
||||
}
|
||||
return originalLength - length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||
* not an actual byte offset.
|
||||
*
|
||||
* @param pos virtual file pointer
|
||||
*/
|
||||
public void seek(final long pos)
|
||||
throws IOException {
|
||||
if (mFile == null) {
|
||||
throw new IOException("Cannot seek on stream based file");
|
||||
}
|
||||
// Decode virtual file pointer
|
||||
// Upper 48 bits is the byte offset into the compressed stream of a block.
|
||||
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
|
||||
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
|
||||
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
|
||||
final int available;
|
||||
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
|
||||
available = mCurrentBlock.length;
|
||||
} else {
|
||||
mFile.seek(compressedOffset);
|
||||
mBlockAddress = compressedOffset;
|
||||
mLastBlockLength = 0;
|
||||
readBlock();
|
||||
available = available();
|
||||
}
|
||||
if (uncompressedOffset > available ||
|
||||
(uncompressedOffset == available && !eof())) {
|
||||
throw new IOException("Invalid file pointer: " + pos);
|
||||
}
|
||||
mCurrentOffset = uncompressedOffset;
|
||||
}
|
||||
|
||||
private boolean eof() throws IOException {
|
||||
if (mFile.eof()) {
|
||||
return true;
|
||||
}
|
||||
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
|
||||
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||
* the two.
|
||||
*/
|
||||
public long getFilePointer() {
|
||||
if (mCurrentOffset == mCurrentBlock.length) {
|
||||
// If current offset is at the end of the current block, file pointer should point
|
||||
// to the beginning of the next block.
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
|
||||
}
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
|
||||
}
|
||||
|
||||
public static long getFileBlock(final long bgzfOffset) {
|
||||
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
|
||||
* @return true if the given file looks like a valid BGZF file.
|
||||
*/
|
||||
public static boolean isValidFile(final InputStream stream)
|
||||
throws IOException {
|
||||
if (!stream.markSupported()) {
|
||||
throw new RuntimeException("Cannot test non-buffered stream");
|
||||
}
|
||||
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
||||
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
stream.reset();
|
||||
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
|
||||
}
|
||||
|
||||
private static boolean isValidBlockHeader(final byte[] buffer) {
|
||||
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
|
||||
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
|
||||
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
|
||||
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
|
||||
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
|
||||
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
|
||||
}
|
||||
|
||||
private void readBlock()
|
||||
throws IOException {
|
||||
|
||||
if (mFileBuffer == null) {
|
||||
mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
|
||||
}
|
||||
int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
if (count == 0) {
|
||||
// Handle case where there is no empty gzip block at end.
|
||||
mCurrentOffset = 0;
|
||||
mBlockAddress += mLastBlockLength;
|
||||
mCurrentBlock = new byte[0];
|
||||
return;
|
||||
}
|
||||
if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
|
||||
throw new IOException("Premature end of file");
|
||||
}
|
||||
final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
|
||||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
|
||||
throw new IOException("Unexpected compressed block length: " + blockLength);
|
||||
}
|
||||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
|
||||
count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
|
||||
if (count != remaining) {
|
||||
throw new FileTruncatedException("Premature end of file");
|
||||
}
|
||||
inflateBlock(mFileBuffer, blockLength);
|
||||
mCurrentOffset = 0;
|
||||
mBlockAddress += mLastBlockLength;
|
||||
mLastBlockLength = blockLength;
|
||||
}
|
||||
|
||||
private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
|
||||
throws IOException {
|
||||
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
|
||||
byte[] buffer = mCurrentBlock;
|
||||
mCurrentBlock = null;
|
||||
if (buffer == null || buffer.length != uncompressedLength) {
|
||||
try {
|
||||
buffer = new byte[uncompressedLength];
|
||||
} catch (NegativeArraySizeException e) {
|
||||
throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
|
||||
}
|
||||
}
|
||||
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
|
||||
mCurrentBlock = buffer;
|
||||
}
|
||||
|
||||
private int readBytes(final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
if (mFile != null) {
|
||||
return readBytes(mFile, buffer, offset, length);
|
||||
} else if (mStream != null) {
|
||||
return readBytes(mStream, buffer, offset, length);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
int bytesRead = 0;
|
||||
while (bytesRead < length) {
|
||||
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
|
||||
if (count <= 0) {
|
||||
break;
|
||||
}
|
||||
bytesRead += count;
|
||||
}
|
||||
return bytesRead;
|
||||
}
|
||||
|
||||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
int bytesRead = 0;
|
||||
while (bytesRead < length) {
|
||||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
|
||||
if (count <= 0) {
|
||||
break;
|
||||
}
|
||||
bytesRead += count;
|
||||
}
|
||||
return bytesRead;
|
||||
}
|
||||
|
||||
private int unpackInt16(final byte[] buffer, final int offset) {
|
||||
return ((buffer[offset] & 0xFF) |
|
||||
((buffer[offset+1] & 0xFF) << 8));
|
||||
}
|
||||
|
||||
private int unpackInt32(final byte[] buffer, final int offset) {
|
||||
return ((buffer[offset] & 0xFF) |
|
||||
((buffer[offset+1] & 0xFF) << 8) |
|
||||
((buffer[offset+2] & 0xFF) << 16) |
|
||||
((buffer[offset+3] & 0xFF) << 24));
|
||||
}
|
||||
|
||||
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
|
||||
|
||||
public static FileTermination checkTermination(final File file)
|
||||
throws IOException {
|
||||
final long fileSize = file.length();
|
||||
if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
|
||||
return FileTermination.DEFECTIVE;
|
||||
}
|
||||
final RandomAccessFile raFile = new RandomAccessFile(file, "r");
|
||||
try {
|
||||
raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
|
||||
raFile.readFully(buf);
|
||||
if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
|
||||
return FileTermination.HAS_TERMINATOR_BLOCK;
|
||||
}
|
||||
final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
|
||||
buf = new byte[bufsize];
|
||||
raFile.seek(fileSize - bufsize);
|
||||
raFile.read(buf);
|
||||
for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
|
||||
i >= 0; --i) {
|
||||
if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
|
||||
buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
|
||||
continue;
|
||||
}
|
||||
final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
|
||||
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
|
||||
if (buf.length - i == totalBlockSizeMinusOne + 1) {
|
||||
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
|
||||
} else {
|
||||
return FileTermination.DEFECTIVE;
|
||||
}
|
||||
}
|
||||
return FileTermination.DEFECTIVE;
|
||||
} finally {
|
||||
raFile.close();
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (preamble[i] != buf[i + startOffset]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
|
|||
* used to indicate an error occured
|
||||
*
|
||||
* @param msg the message
|
||||
* @param e the error
|
||||
* @param t the error
|
||||
*/
|
||||
public static void exitSystemWithError(String msg, final Exception e) {
|
||||
public static void exitSystemWithError(String msg, final Throwable t) {
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
errorPrintf("stack trace %n");
|
||||
e.printStackTrace();
|
||||
t.printStackTrace();
|
||||
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
|
||||
|
|
@ -392,10 +392,10 @@ public abstract class CommandLineProgram {
|
|||
/**
|
||||
* used to indicate an error occured
|
||||
*
|
||||
* @param e the exception occured
|
||||
* @param t the exception that occurred
|
||||
*/
|
||||
public static void exitSystemWithError(Exception e) {
|
||||
exitSystemWithError(e.getMessage(), e);
|
||||
public static void exitSystemWithError(Throwable t) {
|
||||
exitSystemWithError(t.getMessage(), t);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ import java.util.*;
|
|||
*
|
||||
* The IntervalBinding<T> is a formal GATK argument that bridges between a walker and
|
||||
* the engine to construct intervals for traversal at runtime. The IntervalBinding can
|
||||
* either be a RodBinding<T>, a string of one or more intervals, or a file with interval strings.
|
||||
* either be a RodBinding<T>, a string of one interval, or a file with interval strings.
|
||||
* The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
|
||||
*
|
||||
* Note that this class is immutable.
|
||||
|
|
@ -108,4 +108,8 @@ public final class IntervalBinding<T extends Feature> {
|
|||
|
||||
return intervals;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getSource();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.Attribution;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
|
@ -97,13 +96,20 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
|
||||
exitSystemWithUserError(e);
|
||||
} catch (net.sf.samtools.SAMException e) {
|
||||
// Let's try this out and see how it is received by our users
|
||||
checkForTooManyOpenFilesProblem(e.getMessage());
|
||||
exitSystemWithSamError(e);
|
||||
} catch (Exception e) {
|
||||
exitSystemWithError(e);
|
||||
} catch (Throwable t) {
|
||||
checkForTooManyOpenFilesProblem(t.getMessage());
|
||||
exitSystemWithError(t);
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkForTooManyOpenFilesProblem(String message) {
|
||||
// Special case the "Too many open files" error because it's a common User Error for which we know what to do
|
||||
if ( message != null && message.indexOf("Too many open files") != -1 )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the a short blurb about the GATK, copyright info, and where to get documentation.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
|
|
@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<ReadFilter> filters;
|
||||
|
||||
/**
|
||||
* Controls the allocation of threads between CPU vs IO.
|
||||
*/
|
||||
private ThreadAllocation threadAllocation;
|
||||
|
||||
/**
|
||||
* A currently hacky unique name for this GATK instance
|
||||
*/
|
||||
|
|
@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
|
|||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
||||
// Prepare the data for traversal.
|
||||
initializeDataSources();
|
||||
|
||||
|
|
@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
|
|||
// create the output streams "
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
|
||||
// execute the microscheduler, storing the results
|
||||
return microScheduler.execute(this.walker, shardStrategy);
|
||||
|
|
@ -266,6 +275,16 @@ public class GenomeAnalysisEngine {
|
|||
return Collections.unmodifiableList(filters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse out the thread allocation from the given command-line argument.
|
||||
*/
|
||||
private void determineThreadAllocation() {
|
||||
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
|
||||
Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null;
|
||||
Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null;
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow subclasses and others within this package direct access to the walker manager.
|
||||
* @return The walker manager used by this package.
|
||||
|
|
@ -286,7 +305,7 @@ public class GenomeAnalysisEngine {
|
|||
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
|
||||
}
|
||||
|
||||
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
|
||||
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
|
||||
}
|
||||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
|
|
@ -397,103 +416,49 @@ public class GenomeAnalysisEngine {
|
|||
* @param intervals intervals
|
||||
* @return the sharding strategy
|
||||
*/
|
||||
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
||||
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
|
||||
// sharding system; it's required with the new sharding system only for locus walkers.
|
||||
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
|
||||
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
||||
|
||||
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
|
||||
if(!readsDataSource.isEmpty()) {
|
||||
if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
||||
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
|
||||
if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
||||
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
||||
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
|
||||
|
||||
Shard.ShardType shardType;
|
||||
if(walker instanceof LocusWalker) {
|
||||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
shardType = Shard.ShardType.LOCUS;
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
|
||||
}
|
||||
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
|
||||
// Apply special validation to read pair walkers.
|
||||
if(walker instanceof ReadPairWalker) {
|
||||
if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
|
||||
if(intervals != null && !intervals.isEmpty())
|
||||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
}
|
||||
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
|
||||
}
|
||||
else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
|
||||
shardType = Shard.ShardType.READ;
|
||||
else
|
||||
throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
|
||||
|
||||
List<GenomeLoc> region;
|
||||
if(intervals != null)
|
||||
region = intervals.toList();
|
||||
else {
|
||||
region = new ArrayList<GenomeLoc>();
|
||||
for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
|
||||
region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
|
||||
}
|
||||
|
||||
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
|
||||
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
|
||||
}
|
||||
else {
|
||||
final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000;
|
||||
if(intervals == null)
|
||||
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
|
||||
else
|
||||
return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
|
||||
}
|
||||
|
||||
ShardStrategy shardStrategy;
|
||||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
||||
|
||||
long SHARD_SIZE = 100000L;
|
||||
|
||||
if (walker instanceof LocusWalker) {
|
||||
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
|
||||
|
||||
if (intervals != null && !intervals.isEmpty()) {
|
||||
if (readsDataSource == null)
|
||||
throw new IllegalArgumentException("readsDataSource is null");
|
||||
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser(),
|
||||
intervals);
|
||||
} else
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,getGenomeLocParser());
|
||||
} else if (walker instanceof ReadWalker ||
|
||||
walker instanceof DuplicateWalker) {
|
||||
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
|
||||
|
||||
if (intervals != null && !intervals.isEmpty()) {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser(),
|
||||
intervals);
|
||||
} else {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser());
|
||||
}
|
||||
} else if (walker instanceof ReadPairWalker) {
|
||||
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
|
||||
if(intervals != null && !intervals.isEmpty())
|
||||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser());
|
||||
} else
|
||||
throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
|
||||
|
||||
return shardStrategy;
|
||||
}
|
||||
|
||||
protected boolean flashbackData() {
|
||||
|
|
@ -751,6 +716,8 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
return new SAMDataSource(
|
||||
samReaderIDs,
|
||||
threadAllocation,
|
||||
argCollection.numberOfBAMFileHandles,
|
||||
genomeLocParser,
|
||||
argCollection.useOriginalBaseQualities,
|
||||
argCollection.strictnessLevel,
|
||||
|
|
@ -763,8 +730,7 @@ public class GenomeAnalysisEngine {
|
|||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||
getWalkerBAQQualityMode(),
|
||||
refReader,
|
||||
argCollection.defaultBaseQualities,
|
||||
!argCollection.disableLowMemorySharding);
|
||||
argCollection.defaultBaseQualities);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -194,10 +194,14 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||
public ValidationExclusion.TYPE unsafe;
|
||||
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
|
||||
public int numberOfThreads = 1;
|
||||
/** How many threads should be allocated to this analysis. */
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfThreads = 1;
|
||||
|
||||
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false)
|
||||
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
|
||||
public Integer numberOfBAMFileHandles = null;
|
||||
|
||||
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
|
||||
public List<String> readGroupBlackList = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -292,9 +296,6 @@ public class GATKArgumentCollection {
|
|||
@Hidden
|
||||
public boolean allowIntervalsWithUnindexedBAM = false;
|
||||
|
||||
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
|
||||
public boolean disableLowMemorySharding = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods
|
||||
|
|
@ -365,7 +366,11 @@ public class GATKArgumentCollection {
|
|||
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
|
||||
return false;
|
||||
}
|
||||
if (other.numberOfThreads != this.numberOfThreads) {
|
||||
if (!other.numberOfThreads.equals(this.numberOfThreads)) {
|
||||
return false;
|
||||
}
|
||||
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
|
||||
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
|
||||
return false;
|
||||
}
|
||||
if (other.intervalMerging != this.intervalMerging) {
|
||||
|
|
@ -389,9 +394,6 @@ public class GATKArgumentCollection {
|
|||
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
|
||||
return false;
|
||||
|
||||
if (disableLowMemorySharding != other.disableLowMemorySharding)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,128 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 7, 2011
|
||||
* Time: 2:46:34 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class BAMBlockStartIterator implements Iterator<Long> {
|
||||
/**
|
||||
* How large is a BGZF header?
|
||||
*/
|
||||
private static int BGZF_HEADER_SIZE = 18;
|
||||
|
||||
/**
|
||||
* Where within the header does the BLOCKSIZE actually live?
|
||||
*/
|
||||
private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
|
||||
|
||||
private FileChannel bamInputChannel;
|
||||
private ByteBuffer headerByteBuffer;
|
||||
|
||||
private long nextLocation = 0;
|
||||
|
||||
public BAMBlockStartIterator(File bamFile) {
|
||||
try {
|
||||
FileInputStream bamInputStream = new FileInputStream(bamFile);
|
||||
bamInputChannel = bamInputStream.getChannel();
|
||||
|
||||
headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
|
||||
headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not open file",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextLocation != -1;
|
||||
}
|
||||
|
||||
public Long next() {
|
||||
long currentLocation = nextLocation;
|
||||
advance();
|
||||
return currentLocation;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
int readStatus;
|
||||
|
||||
headerByteBuffer.clear();
|
||||
try {
|
||||
readStatus = bamInputChannel.read(headerByteBuffer);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not read header data",ex);
|
||||
}
|
||||
|
||||
if(readStatus == -1) {
|
||||
nextLocation = -1;
|
||||
try {
|
||||
bamInputChannel.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not close input file",ex);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
|
||||
int blockSize = headerByteBuffer.getShort();
|
||||
|
||||
try {
|
||||
bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
|
||||
nextLocation = bamInputChannel.position();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not reposition input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String argv[]) throws IOException {
|
||||
BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
|
||||
int i = 0;
|
||||
while(blockStartIterator.hasNext())
|
||||
System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBin;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.LinearIndex;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Represents the contents of a bam index file for one reference.
|
||||
* A BAM index (.bai) file contains information for all references in the bam file.
|
||||
* This class describes the data present in the index file for one of these references;
|
||||
* including the bins, chunks, and linear index.
|
||||
*/
|
||||
class BAMIndexContent {
|
||||
/**
|
||||
* The reference sequence for the data currently loaded.
|
||||
*/
|
||||
private final int mReferenceSequence;
|
||||
|
||||
/**
|
||||
* A list of all bins in the above reference sequence.
|
||||
*/
|
||||
private final BinList mBinList;
|
||||
|
||||
/**
|
||||
* The linear index for the reference sequence above.
|
||||
*/
|
||||
private final LinearIndex mLinearIndex;
|
||||
|
||||
|
||||
/**
|
||||
* @param referenceSequence Content corresponds to this reference.
|
||||
* @param bins Array of bins represented by this content, possibly sparse
|
||||
* @param numberOfBins Number of non-null bins
|
||||
* @param linearIndex Additional index used to optimize queries
|
||||
*/
|
||||
BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
|
||||
this.mReferenceSequence = referenceSequence;
|
||||
this.mBinList = new BinList(bins, numberOfBins);
|
||||
this.mLinearIndex = linearIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reference for this Content
|
||||
*/
|
||||
public int getReferenceSequence() {
|
||||
return mReferenceSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this content have anything in this bin?
|
||||
*/
|
||||
public boolean containsBin(final GATKBin bin) {
|
||||
return mBinList.getBin(bin.getBinNumber()) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return iterable list of bins represented by this content
|
||||
*/
|
||||
public BinList getBins() {
|
||||
return mBinList;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of non-null bins represented by this content
|
||||
*/
|
||||
int getNumberOfNonNullBins() {
|
||||
return mBinList.getNumberOfNonNullBins();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all chunks associated with all bins in this content
|
||||
*/
|
||||
public List<GATKChunk> getAllChunks() {
|
||||
List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
|
||||
for (GATKBin b : mBinList)
|
||||
if (b.getChunkList() != null) {
|
||||
allChunks.addAll(Arrays.asList(b.getChunkList()));
|
||||
}
|
||||
return Collections.unmodifiableList(allChunks);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the linear index represented by this content
|
||||
*/
|
||||
public LinearIndex getLinearIndex() {
|
||||
return mLinearIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is used to encapsulate the list of Bins store in the BAMIndexContent
|
||||
* While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
|
||||
*/
|
||||
class BinList implements Iterable<GATKBin> {
|
||||
|
||||
private final GATKBin[] mBinArray;
|
||||
public final int numberOfNonNullBins;
|
||||
public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based
|
||||
|
||||
/**
|
||||
* @param binArray a sparse array representation of the bins. The index into the array is the bin number.
|
||||
* @param numberOfNonNullBins
|
||||
*/
|
||||
BinList(GATKBin[] binArray, int numberOfNonNullBins) {
|
||||
this.mBinArray = binArray;
|
||||
this.numberOfNonNullBins = numberOfNonNullBins;
|
||||
this.maxBinNumber = mBinArray.length - 1;
|
||||
}
|
||||
|
||||
GATKBin getBin(int binNumber) {
|
||||
if (binNumber > maxBinNumber) return null;
|
||||
return mBinArray[binNumber];
|
||||
}
|
||||
|
||||
int getNumberOfNonNullBins() {
|
||||
return numberOfNonNullBins;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an iterator over all non-null bins.
|
||||
*
|
||||
* @return An iterator over all bins.
|
||||
*/
|
||||
public Iterator<GATKBin> iterator() {
|
||||
return new BinIterator();
|
||||
}
|
||||
|
||||
private class BinIterator implements Iterator<GATKBin> {
|
||||
/**
|
||||
* Stores the bin # of the Bin currently in use.
|
||||
*/
|
||||
private int nextBin;
|
||||
|
||||
public BinIterator() {
|
||||
nextBin = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are there more bins in this set, waiting to be returned?
|
||||
*
|
||||
* @return True if more bins are remaining.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
while (nextBin <= maxBinNumber) {
|
||||
if (getBin(nextBin) != null) return true;
|
||||
nextBin++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next bin in the provided BinList.
|
||||
*
|
||||
* @return the next available bin in the BinList.
|
||||
*/
|
||||
public GATKBin next() {
|
||||
if (!hasNext())
|
||||
throw new NoSuchElementException("This BinIterator is currently empty");
|
||||
GATKBin result = getBin(nextBin);
|
||||
nextBin++;
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from a bin iterator");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Models a bin at which all BAM files in the merged input stream overlap.
|
||||
*/
|
||||
class BAMOverlap {
|
||||
public final int start;
|
||||
public final int stop;
|
||||
|
||||
private final Map<SAMReaderID,Bin> bins = new HashMap<SAMReaderID,Bin>();
|
||||
|
||||
public BAMOverlap(final int start, final int stop) {
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
}
|
||||
|
||||
public void addBin(final SAMReaderID id, final Bin bin) {
|
||||
bins.put(id,bin);
|
||||
}
|
||||
|
||||
public Bin getBin(final SAMReaderID id) {
|
||||
return bins.get(id);
|
||||
}
|
||||
}
|
||||
|
|
@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
|
||||
/**
|
||||
* Create a new BAM schedule based on the given index.
|
||||
* @param indexFiles Index files.
|
||||
* @param dataSource The SAM data source to use.
|
||||
* @param intervals List of
|
||||
*/
|
||||
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
|
||||
public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
|
||||
if(intervals.isEmpty())
|
||||
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
||||
|
||||
referenceSequence = intervals.get(0).getContigIndex();
|
||||
referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
|
||||
|
||||
createScheduleFile();
|
||||
|
||||
readerIDs.addAll(indexFiles.keySet());
|
||||
readerIDs.addAll(dataSource.getReaderIDs());
|
||||
|
||||
for(final SAMReaderID reader: readerIDs) {
|
||||
final GATKBAMIndex index = indexFiles.get(reader);
|
||||
final GATKBAMIndex index = dataSource.getIndex(reader);
|
||||
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
|
||||
|
||||
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
||||
|
|
@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
if(selectedIterators.isEmpty())
|
||||
return;
|
||||
|
||||
// Create the target schedule entry
|
||||
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
|
||||
|
||||
// For each schedule entry with data, load the data into the merged schedule.
|
||||
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
|
||||
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
|
||||
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
|
||||
|
|
@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
scheduleIterator.next();
|
||||
}
|
||||
|
||||
// For each schedule entry without data, add a blank entry.
|
||||
for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
|
||||
mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
|
||||
}
|
||||
|
||||
nextScheduleEntry = mergedScheduleEntry;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
private final GenomeLocSortedSet loci;
|
||||
private GenomeLocSortedSet loci;
|
||||
private PeekableIterator<GenomeLoc> locusIterator;
|
||||
private GenomeLoc currentLocus;
|
||||
|
||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
||||
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
|
||||
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
|
||||
// Match only on sequence name; trust startup validation to make sure all the sequences match.
|
||||
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
|
||||
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||
}
|
||||
scheduler.populateFilteredIntervalList(intervals);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
private GenomeLoc currentLocus;
|
||||
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
scheduler.populateUnfilteredIntervalList(parser);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
scheduler.populateFilteredIntervalList(loci);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
|
||||
private BAMScheduler(final SAMDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs()) {
|
||||
GATKBAMIndex index = dataSource.getIndex(reader);
|
||||
if(index != null)
|
||||
indexFiles.put(reader,dataSource.getIndex(reader));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
|
||||
* @param loci The list of locations to search and iterate over.
|
||||
*/
|
||||
private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
if(!indexFiles.isEmpty()) {
|
||||
// If index data is available, start up the iterator.
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
}
|
||||
else {
|
||||
// Otherwise, seed the iterator with a single file pointer over the entire region.
|
||||
nextFilePointer = generatePointerOverEntireFileset();
|
||||
for(GenomeLoc locus: loci)
|
||||
nextFilePointer.addLocation(locus);
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
|
||||
* from just before the start of the region to the end of the region.
|
||||
*/
|
||||
private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
|
||||
this.loci = new GenomeLocSortedSet(parser);
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||
nextFilePointer = generatePointerOverEntireFileset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a span that runs from the end of the BAM header to the end of the fle.
|
||||
* @return A file pointer over the specified region.
|
||||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next element available in interval sharder");
|
||||
FilePointer currentFilePointer = nextFilePointer;
|
||||
nextFilePointer = null;
|
||||
advance();
|
||||
|
||||
return currentFilePointer;
|
||||
}
|
||||
|
||||
|
|
@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(loci.isEmpty())
|
||||
return;
|
||||
|
||||
nextFilePointer = null;
|
||||
while(nextFilePointer == null && currentLocus != null) {
|
||||
// special case handling of the unmapped shard.
|
||||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
|
||||
nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
|
@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
int coveredRegionStop = Integer.MAX_VALUE;
|
||||
GenomeLoc coveredRegion = null;
|
||||
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
|
||||
|
||||
// No overlapping data at all.
|
||||
if(scheduleEntry != null) {
|
||||
|
|
@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
}
|
||||
else {
|
||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
||||
for(SAMReaderID reader: indexFiles.keySet())
|
||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||
}
|
||||
|
|
@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
// Early exit if no bins were found.
|
||||
if(coveredRegion == null) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
if(currentLocus.size() > 16384) {
|
||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
}
|
||||
else {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
}
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Early exit if only part of the first interval was found.
|
||||
if(currentLocus.startsBefore(coveredRegion)) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
|
|
@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param indices BAM indices.
|
||||
* @param currentLocus The actual locus for which to check overlap.
|
||||
* @return The next schedule entry overlapping with the given list of loci.
|
||||
*/
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
|
||||
// Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
|
||||
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
|
||||
// we'll be using the correct contig index for the BAMs.
|
||||
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
|
||||
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
|
||||
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
|
||||
if(bamScheduleIterator != null)
|
||||
bamScheduleIterator.close();
|
||||
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
|
||||
lastReferenceSequenceLoaded = currentContigIndex;
|
||||
|
||||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
|
||||
if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
|
||||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
|
||||
}
|
||||
|
||||
if(!bamScheduleIterator.hasNext())
|
||||
|
|
@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a span from the given start point to the end of the file.
|
||||
* @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset).
|
||||
* @return A file span from the given point to the end of the file.
|
||||
*/
|
||||
private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
|
||||
return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* Preloads BGZF blocks in preparation for unzipping and data processing.
|
||||
* TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
|
||||
* TODO: be replaced with a central thread management strategy.
|
||||
*/
|
||||
public class BGZFBlockLoadingDispatcher {
|
||||
/**
|
||||
* The file handle cache, used when allocating blocks from the dispatcher.
|
||||
*/
|
||||
private final FileHandleCache fileHandleCache;
|
||||
|
||||
private final ExecutorService threadPool;
|
||||
|
||||
private final Queue<SAMReaderPosition> inputQueue;
|
||||
|
||||
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
||||
threadPool = Executors.newFixedThreadPool(numThreads);
|
||||
fileHandleCache = new FileHandleCache(numFileHandles);
|
||||
inputQueue = new LinkedList<SAMReaderPosition>();
|
||||
|
||||
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates a request for a new block load.
|
||||
* @param readerPosition Position at which to load.
|
||||
*/
|
||||
void queueBlockLoad(final SAMReaderPosition readerPosition) {
|
||||
synchronized(inputQueue) {
|
||||
inputQueue.add(readerPosition);
|
||||
inputQueue.notify();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Claims the next work request from the queue.
|
||||
* @return The next work request, or null if none is available.
|
||||
*/
|
||||
SAMReaderPosition claimNextWorkRequest() {
|
||||
synchronized(inputQueue) {
|
||||
while(inputQueue.isEmpty()) {
|
||||
try {
|
||||
inputQueue.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item");
|
||||
}
|
||||
}
|
||||
return inputQueue.poll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,436 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.util.BAMInputStream;
|
||||
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.RuntimeEOFException;
|
||||
import net.sf.samtools.util.SeekableStream;
|
||||
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* Presents decompressed blocks to the SAMFileReader.
|
||||
*/
|
||||
public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||
/**
|
||||
* Mechanism for triggering block loads.
|
||||
*/
|
||||
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* The reader whose data is supplied by this input stream.
|
||||
*/
|
||||
private final SAMReaderID reader;
|
||||
|
||||
/**
|
||||
* Length of the input stream.
|
||||
*/
|
||||
private final long length;
|
||||
|
||||
/**
|
||||
* The latest error reported by an asynchronous block load.
|
||||
*/
|
||||
private Throwable error;
|
||||
|
||||
/**
|
||||
* Current position.
|
||||
*/
|
||||
private SAMReaderPosition position;
|
||||
|
||||
/**
|
||||
* A stream of compressed data blocks.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
/**
|
||||
* Offsets of the given blocks in the buffer.
|
||||
*/
|
||||
private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();
|
||||
|
||||
/**
|
||||
* Source positions of the given blocks in the buffer.
|
||||
*/
|
||||
private LinkedList<Long> blockPositions = new LinkedList<Long>();
|
||||
|
||||
/**
|
||||
* Provides a lock to wait for more data to arrive.
|
||||
*/
|
||||
private final Object lock = new Object();
|
||||
|
||||
/**
|
||||
* An input stream to use when comparing data back to what it should look like.
|
||||
*/
|
||||
private final BlockCompressedInputStream validatingInputStream;
|
||||
|
||||
/**
|
||||
* Has the buffer been filled since last request?
|
||||
*/
|
||||
private boolean bufferFilled = false;
|
||||
|
||||
/**
|
||||
* Create a new block presenting input stream with a dedicated buffer.
|
||||
* @param dispatcher the block loading messenger.
|
||||
* @param reader the reader for which to load data.
|
||||
* @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
|
||||
*/
|
||||
BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
|
||||
this.reader = reader;
|
||||
this.length = reader.samFile.length();
|
||||
|
||||
buffer = ByteBuffer.wrap(new byte[64*1024]);
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
// The state of the buffer assumes that the range of data written into the buffer appears in the range
|
||||
// [position,limit), while extra capacity exists in the range [limit,capacity)
|
||||
buffer.limit(0);
|
||||
|
||||
this.dispatcher = dispatcher;
|
||||
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
|
||||
this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
||||
|
||||
try {
|
||||
if(validate) {
|
||||
System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
|
||||
validatingInputStream = new BlockCompressedInputStream(reader.samFile);
|
||||
// A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
|
||||
// Poke the stream to start reading data.
|
||||
validatingInputStream.available();
|
||||
}
|
||||
else
|
||||
validatingInputStream = null;
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public long length() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public long getFilePointer() {
|
||||
long filePointer;
|
||||
synchronized(lock) {
|
||||
if(buffer.remaining() > 0) {
|
||||
// If there's data in the buffer, figure out from whence it came.
|
||||
final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
|
||||
final int blockOffset = buffer.position();
|
||||
filePointer = blockAddress << 16 | blockOffset;
|
||||
}
|
||||
else {
|
||||
// Otherwise, find the next position to load.
|
||||
filePointer = position.getBlockAddress() << 16;
|
||||
}
|
||||
}
|
||||
|
||||
if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
|
||||
throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
|
||||
BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
|
||||
BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));
|
||||
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public void seek(long target) {
|
||||
// TODO: Validate the seek point.
|
||||
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||
synchronized(lock) {
|
||||
clearBuffers();
|
||||
position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
|
||||
waitForBufferFill();
|
||||
buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.seek(target);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void clearBuffers() {
|
||||
this.position.reset();
|
||||
|
||||
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
||||
// Indicate no data to be read.
|
||||
buffer.clear();
|
||||
buffer.limit(0);
|
||||
|
||||
blockOffsets.clear();
|
||||
blockPositions.clear();
|
||||
}
|
||||
|
||||
public boolean eof() {
|
||||
synchronized(lock) {
|
||||
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
||||
return position != null && position.getBlockAddress() >= length;
|
||||
}
|
||||
}
|
||||
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
// TODO: Implement
|
||||
}
|
||||
|
||||
/**
|
||||
* Submits a new access plan for the given dataset.
|
||||
* @param position The next seek point for BAM data in this reader.
|
||||
*/
|
||||
public void submitAccessPlan(final SAMReaderPosition position) {
|
||||
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
||||
synchronized(lock) {
|
||||
// Assume that the access plan is going to tell us to start where we are and move forward.
|
||||
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
|
||||
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
|
||||
position.advancePosition(this.position.getBlockAddress());
|
||||
}
|
||||
this.position = position;
|
||||
}
|
||||
|
||||
private void compactBuffer() {
|
||||
// Compact buffer to maximize storage space.
|
||||
int bytesToRemove = 0;
|
||||
|
||||
// Look ahead to see if we can compact away the first block in the series.
|
||||
while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
|
||||
bytesToRemove += blockOffsets.remove();
|
||||
blockPositions.remove();
|
||||
}
|
||||
|
||||
// If we end up with an empty block at the end of the series, compact this as well.
|
||||
if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
|
||||
bytesToRemove += buffer.position();
|
||||
blockOffsets.remove();
|
||||
blockPositions.remove();
|
||||
}
|
||||
|
||||
int finalBufferStart = buffer.position() - bytesToRemove;
|
||||
int finalBufferSize = buffer.remaining();
|
||||
|
||||
buffer.position(bytesToRemove);
|
||||
buffer.compact();
|
||||
|
||||
buffer.position(finalBufferStart);
|
||||
buffer.limit(finalBufferStart+finalBufferSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Push contents of incomingBuffer into the end of this buffer.
|
||||
* MUST be called from a thread that is NOT the reader thread.
|
||||
* @param incomingBuffer The data being pushed into this input stream.
|
||||
* @param position target position for the data.
|
||||
*/
|
||||
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
|
||||
synchronized(lock) {
|
||||
try {
|
||||
compactBuffer();
|
||||
// Open up the buffer for more reading.
|
||||
buffer.limit(buffer.capacity());
|
||||
|
||||
// Advance the position to take the most recent read into account.
|
||||
long lastReadPosition = position.getBlockAddress();
|
||||
|
||||
byte[] validBytes = null;
|
||||
if(validatingInputStream != null) {
|
||||
validBytes = new byte[incomingBuffer.remaining()];
|
||||
|
||||
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
||||
int pos = incomingBuffer.position();
|
||||
int lim = incomingBuffer.limit();
|
||||
incomingBuffer.get(currentBytes);
|
||||
|
||||
incomingBuffer.limit(lim);
|
||||
incomingBuffer.position(pos);
|
||||
|
||||
long currentFilePointer = validatingInputStream.getFilePointer();
|
||||
validatingInputStream.seek(lastReadPosition << 16);
|
||||
validatingInputStream.read(validBytes);
|
||||
validatingInputStream.seek(currentFilePointer);
|
||||
|
||||
if(!Arrays.equals(validBytes,currentBytes))
|
||||
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
||||
}
|
||||
|
||||
this.position = position;
|
||||
position.advancePosition(filePosition);
|
||||
|
||||
if(buffer.remaining() < incomingBuffer.remaining()) {
|
||||
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
|
||||
lock.wait();
|
||||
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
|
||||
}
|
||||
|
||||
// Queue list of block offsets / block positions.
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.add(lastReadPosition);
|
||||
|
||||
buffer.put(incomingBuffer);
|
||||
|
||||
// Set up the buffer for reading.
|
||||
buffer.flip();
|
||||
bufferFilled = true;
|
||||
|
||||
lock.notify();
|
||||
}
|
||||
catch(Exception ex) {
|
||||
reportException(ex);
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reportException(Throwable t) {
|
||||
synchronized(lock) {
|
||||
this.error = t;
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForErrors() {
|
||||
synchronized(lock) {
|
||||
if(error != null) {
|
||||
ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
|
||||
toThrow.setStackTrace(error.getStackTrace());
|
||||
throw toThrow;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next byte of data from the input stream.
|
||||
* @return Next byte of data, from 0->255, as an int.
|
||||
*/
|
||||
@Override
|
||||
public int read() {
|
||||
byte[] singleByte = new byte[1];
|
||||
read(singleByte);
|
||||
return singleByte[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the given byte array to the extent possible.
|
||||
* @param bytes byte array to be filled.
|
||||
* @return The number of bytes actually read.
|
||||
*/
|
||||
@Override
|
||||
public int read(byte[] bytes) {
|
||||
return read(bytes,0,bytes.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, final int offset, final int length) {
|
||||
int remaining = length;
|
||||
synchronized(lock) {
|
||||
while(remaining > 0) {
|
||||
// Check for error conditions during last read.
|
||||
checkForErrors();
|
||||
|
||||
// If completely out of space, queue up another buffer fill.
|
||||
waitForBufferFill();
|
||||
|
||||
// Couldn't manage to load any data at all; abort and return what's available.
|
||||
if(buffer.remaining() == 0)
|
||||
break;
|
||||
|
||||
int numBytesToCopy = Math.min(buffer.remaining(),remaining);
|
||||
buffer.get(bytes,length-remaining+offset,numBytesToCopy);
|
||||
remaining -= numBytesToCopy;
|
||||
|
||||
//if(remaining > 0)
|
||||
// System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
|
||||
// TODO: Assert that we don't copy across a block boundary
|
||||
}
|
||||
|
||||
// Notify any waiting threads that some of the contents of the buffer were removed.
|
||||
if(length-remaining > 0)
|
||||
lock.notify();
|
||||
}
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
byte[] validBytes = new byte[length];
|
||||
try {
|
||||
validatingInputStream.read(validBytes,offset,length);
|
||||
for(int i = offset; i < offset+length; i++) {
|
||||
if(bytes[i] != validBytes[i]) {
|
||||
System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
|
||||
throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
return length - remaining;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return reader.getSamFilePath();
|
||||
}
|
||||
|
||||
private void waitForBufferFill() {
|
||||
synchronized(lock) {
|
||||
bufferFilled = false;
|
||||
if(buffer.remaining() == 0 && !eof()) {
|
||||
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
||||
dispatcher.queueBlockLoad(position);
|
||||
try {
|
||||
lock.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
// TODO: handle me.
|
||||
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
||||
}
|
||||
|
||||
if(bufferFilled && buffer.remaining() == 0)
|
||||
throw new RuntimeEOFException("No more data left in InputStream");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.zip.DataFormatException;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
/**
|
||||
* An engine for loading blocks.
|
||||
*/
|
||||
class BlockLoader implements Runnable {
|
||||
/**
|
||||
* Coordinates the input queue.
|
||||
*/
|
||||
private BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* A cache from which to retrieve open file handles.
|
||||
*/
|
||||
private final FileHandleCache fileHandleCache;
|
||||
|
||||
/**
|
||||
* Whether asynchronous decompression should happen.
|
||||
*/
|
||||
private final boolean decompress;
|
||||
|
||||
/**
|
||||
* An direct input buffer for incoming data from disk.
|
||||
*/
|
||||
private final ByteBuffer inputBuffer;
|
||||
|
||||
public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
|
||||
this.dispatcher = dispatcher;
|
||||
this.fileHandleCache = fileHandleCache;
|
||||
this.decompress = decompress;
|
||||
|
||||
this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
}
|
||||
|
||||
public void run() {
|
||||
for(;;) {
|
||||
SAMReaderPosition readerPosition = null;
|
||||
try {
|
||||
readerPosition = dispatcher.claimNextWorkRequest();
|
||||
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
|
||||
|
||||
long blockAddress = readerPosition.getBlockAddress();
|
||||
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
|
||||
|
||||
ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
|
||||
long nextBlockAddress = position(inputStream);
|
||||
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
|
||||
|
||||
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
|
||||
int bytesCopied = block.remaining();
|
||||
|
||||
BlockInputStream bamInputStream = readerPosition.getInputStream();
|
||||
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
|
||||
|
||||
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
|
||||
}
|
||||
catch(Throwable error) {
|
||||
if(readerPosition != null && readerPosition.getInputStream() != null)
|
||||
readerPosition.getInputStream().reportException(error);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
|
||||
FileChannel channel = inputStream.getChannel();
|
||||
|
||||
// Read the block header
|
||||
channel.position(blockAddress);
|
||||
|
||||
int uncompressedDataSize = 0;
|
||||
int bufferSize = 0;
|
||||
|
||||
do {
|
||||
inputBuffer.clear();
|
||||
inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
channel.read(inputBuffer);
|
||||
|
||||
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||
// value into an int buffer to transfer the bitwise contents into an int.
|
||||
inputBuffer.flip();
|
||||
if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
|
||||
throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass.");
|
||||
|
||||
// Verify that the file was read at a valid point.
|
||||
if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
|
||||
unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
|
||||
unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
|
||||
unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
|
||||
unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
|
||||
unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
|
||||
throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
|
||||
}
|
||||
|
||||
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
|
||||
bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;
|
||||
|
||||
// Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block.
|
||||
inputBuffer.limit(bufferSize);
|
||||
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
channel.read(inputBuffer);
|
||||
|
||||
// Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block.
|
||||
uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
|
||||
//System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize);
|
||||
}
|
||||
while(uncompressedDataSize == 0 && channel.position() < channel.size());
|
||||
|
||||
// Prepare the buffer for reading.
|
||||
inputBuffer.flip();
|
||||
|
||||
return inputBuffer;
|
||||
}
|
||||
|
||||
private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
|
||||
final int compressedBufferSize = bgzfBlock.remaining();
|
||||
|
||||
// Determine the uncompressed buffer size (
|
||||
bgzfBlock.position(bgzfBlock.limit()-4);
|
||||
int uncompressedBufferSize = bgzfBlock.getInt();
|
||||
byte[] uncompressedContent = new byte[uncompressedBufferSize];
|
||||
|
||||
// Bound the CDATA section of the buffer.
|
||||
bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
|
||||
bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
byte[] compressedContent = new byte[bgzfBlock.remaining()];
|
||||
ByteBuffer.wrap(compressedContent).put(bgzfBlock);
|
||||
|
||||
// Decompress the buffer.
|
||||
final Inflater inflater = new Inflater(true);
|
||||
inflater.setInput(compressedContent);
|
||||
int bytesUncompressed = inflater.inflate(uncompressedContent);
|
||||
if(bytesUncompressed != uncompressedBufferSize)
|
||||
throw new ReviewedStingException("Error decompressing block");
|
||||
|
||||
return ByteBuffer.wrap(uncompressedContent);
|
||||
}
|
||||
|
||||
private long position(final FileInputStream inputStream) throws IOException {
|
||||
return inputStream.getChannel().position();
|
||||
}
|
||||
|
||||
private int unpackUByte8(final ByteBuffer buffer,final int position) {
|
||||
return buffer.get(position) & 0xFF;
|
||||
}
|
||||
|
||||
private int unpackUInt16(final ByteBuffer buffer,final int position) {
|
||||
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||
// value into an int buffer to transfer the bitwise contents into an int.
|
||||
return buffer.getShort(position) & 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
|
||||
/**
|
||||
* Caches frequently used file handles. Right now, caches only a single file handle.
|
||||
* TODO: Generalize to support arbitrary file handle caches.
|
||||
*/
|
||||
public class FileHandleCache {
|
||||
/**
|
||||
* The underlying data structure storing file handles.
|
||||
*/
|
||||
private final FileHandleStorage fileHandleStorage;
|
||||
|
||||
/**
|
||||
* How many file handles should be kept open at once.
|
||||
*/
|
||||
private final int cacheSize;
|
||||
|
||||
/**
|
||||
* A uniquifier: assign a unique ID to every instance of a file handle.
|
||||
*/
|
||||
private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();
|
||||
|
||||
/**
|
||||
* A shared lock, private so that outside users cannot notify it.
|
||||
*/
|
||||
private final Object lock = new Object();
|
||||
|
||||
/**
|
||||
* Indicates how many file handles are outstanding at this point.
|
||||
*/
|
||||
private int numOutstandingFileHandles = 0;
|
||||
|
||||
/**
|
||||
* Create a new file handle cache of the given cache size.
|
||||
* @param cacheSize how many readers to hold open at once.
|
||||
*/
|
||||
public FileHandleCache(final int cacheSize) {
|
||||
this.cacheSize = cacheSize;
|
||||
fileHandleStorage = new FileHandleStorage();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves or opens a file handle for the given reader ID.
|
||||
* @param key The ke
|
||||
* @return A file input stream from the cache, if available, or otherwise newly opened.
|
||||
*/
|
||||
public FileInputStream claimFileInputStream(final SAMReaderID key) {
|
||||
synchronized(lock) {
|
||||
FileInputStream inputStream = findExistingEntry(key);
|
||||
if(inputStream == null) {
|
||||
try {
|
||||
// If the cache is maxed out, wait for another file handle to emerge.
|
||||
if(numOutstandingFileHandles >= cacheSize)
|
||||
lock.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
throw new ReviewedStingException("Interrupted while waiting for a file handle");
|
||||
}
|
||||
inputStream = openInputStream(key);
|
||||
}
|
||||
numOutstandingFileHandles++;
|
||||
|
||||
//System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId());
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Releases the current reader and returns it to the cache.
|
||||
* @param key The reader.
|
||||
* @param inputStream The stream being used.
|
||||
*/
|
||||
public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
|
||||
synchronized(lock) {
|
||||
numOutstandingFileHandles--;
|
||||
UniqueKey newID = allocateKey(key);
|
||||
fileHandleStorage.put(newID,inputStream);
|
||||
// Let any listeners know that another file handle has become available.
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds an existing entry in the storage mechanism.
|
||||
* @param key Reader.
|
||||
* @return a cached stream, if available. Otherwise,
|
||||
*/
|
||||
private FileInputStream findExistingEntry(final SAMReaderID key) {
|
||||
int existingHandles = getMostRecentUniquifier(key);
|
||||
|
||||
// See if any of the keys currently exist in the repository.
|
||||
for(int i = 0; i <= existingHandles; i++) {
|
||||
UniqueKey uniqueKey = new UniqueKey(key,i);
|
||||
if(fileHandleStorage.containsKey(uniqueKey))
|
||||
return fileHandleStorage.remove(uniqueKey);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the most recent uniquifier used for the given reader.
|
||||
* @param reader Reader for which to determine uniqueness.
|
||||
* @return
|
||||
*/
|
||||
private int getMostRecentUniquifier(final SAMReaderID reader) {
|
||||
if(keyCounter.containsKey(reader))
|
||||
return keyCounter.get(reader);
|
||||
else return -1;
|
||||
}
|
||||
|
||||
private UniqueKey allocateKey(final SAMReaderID reader) {
|
||||
int uniquifier = getMostRecentUniquifier(reader)+1;
|
||||
keyCounter.put(reader,uniquifier);
|
||||
return new UniqueKey(reader,uniquifier);
|
||||
}
|
||||
|
||||
private FileInputStream openInputStream(final SAMReaderID reader) {
|
||||
try {
|
||||
return new FileInputStream(reader.getSamFilePath());
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open input file");
|
||||
}
|
||||
}
|
||||
|
||||
private void closeInputStream(final FileInputStream inputStream) {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open input file");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually contains the file handles, purging them as they get too old.
|
||||
*/
|
||||
private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
|
||||
/**
|
||||
* Remove the oldest entry
|
||||
* @param entry Entry to consider removing.
|
||||
* @return True if the cache size has been exceeded. False otherwise.
|
||||
*/
|
||||
@Override
|
||||
protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
|
||||
synchronized (lock) {
|
||||
if(size() > cacheSize) {
|
||||
keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
|
||||
closeInputStream(entry.getValue());
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Uniquifies a key by adding a numerical uniquifier.
|
||||
*/
|
||||
private class UniqueKey {
|
||||
/**
|
||||
* The file handle's key.
|
||||
*/
|
||||
private final SAMReaderID key;
|
||||
|
||||
/**
|
||||
* A uniquifier, so that multiple of the same reader can exist in the cache.
|
||||
*/
|
||||
private final int uniqueID;
|
||||
|
||||
public UniqueKey(final SAMReaderID reader, final int uniqueID) {
|
||||
this.key = reader;
|
||||
this.uniqueID = uniqueID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(!(other instanceof UniqueKey))
|
||||
return false;
|
||||
UniqueKey otherUniqueKey = (UniqueKey)other;
|
||||
return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return key.hashCode();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
|
|||
import net.sf.samtools.SAMFileSpan;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
|
@ -40,28 +41,25 @@ import java.util.*;
|
|||
*/
|
||||
public class FilePointer {
|
||||
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
|
||||
protected final BAMOverlap overlap;
|
||||
protected final List<GenomeLoc> locations;
|
||||
protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
|
||||
/**
|
||||
* Does this file pointer point into an unmapped region?
|
||||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
public FilePointer() {
|
||||
this((BAMOverlap)null);
|
||||
}
|
||||
|
||||
public FilePointer(final GenomeLoc location) {
|
||||
this.overlap = null;
|
||||
this.locations = Collections.singletonList(location);
|
||||
this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
|
||||
}
|
||||
|
||||
public FilePointer(final BAMOverlap overlap) {
|
||||
this.overlap = overlap;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
this.isRegionUnmapped = false;
|
||||
public FilePointer(final GenomeLoc... locations) {
|
||||
this.locations.addAll(Arrays.asList(locations));
|
||||
boolean foundMapped = false, foundUnmapped = false;
|
||||
for(GenomeLoc location: locations) {
|
||||
if(GenomeLoc.isUnmapped(location))
|
||||
foundUnmapped = true;
|
||||
else
|
||||
foundMapped = true;
|
||||
}
|
||||
if(foundMapped && foundUnmapped)
|
||||
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||
this.isRegionUnmapped = foundUnmapped;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -217,4 +215,20 @@ public class FilePointer {
|
|||
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
|
||||
combined.addFileSpans(initialElement.getKey(),fileSpan);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("FilePointer:%n");
|
||||
builder.append("\tlocations = {");
|
||||
builder.append(Utils.join(";",locations));
|
||||
builder.append("}%n\tregions = %n");
|
||||
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
|
||||
builder.append(entry.getKey());
|
||||
builder.append("= {");
|
||||
builder.append(entry.getValue());
|
||||
builder.append("}");
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,419 +25,58 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.AbstractBAMFileIndex;
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.BrowseableBAMIndex;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Shard intervals based on position within the BAM file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
* Handles the process of aggregating BAM intervals into individual shards.
|
||||
* TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
|
||||
*/
|
||||
public class IntervalSharder {
|
||||
private static Logger logger = Logger.getLogger(IntervalSharder.class);
|
||||
public class IntervalSharder implements Iterator<FilePointer> {
|
||||
/**
|
||||
* The iterator actually laying out the data for BAM scheduling.
|
||||
*/
|
||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||
|
||||
public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
return new IntervalSharder.FilePointerIterator(dataSource,loci);
|
||||
/**
|
||||
* The parser, for interval manipulation.
|
||||
*/
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
|
||||
}
|
||||
|
||||
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
|
||||
wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
|
||||
this.parser = parser;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* A lazy-loading iterator over file pointers.
|
||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||
* @return The next file pointer to process.
|
||||
*/
|
||||
private static class FilePointerIterator implements Iterator<FilePointer> {
|
||||
final SAMDataSource dataSource;
|
||||
final GenomeLocSortedSet loci;
|
||||
final PeekableIterator<GenomeLoc> locusIterator;
|
||||
final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>();
|
||||
|
||||
public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return !cachedFilePointers.isEmpty();
|
||||
}
|
||||
|
||||
public FilePointer next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("FilePointerIterator iteration is complete");
|
||||
FilePointer filePointer = cachedFilePointers.remove();
|
||||
if(cachedFilePointers.isEmpty())
|
||||
advance();
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
|
||||
String contig = null;
|
||||
|
||||
// If the next section of the BAM to be processed is unmapped, handle this region separately.
|
||||
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
|
||||
contig = null;
|
||||
while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
|
||||
GenomeLoc nextLocus = locusIterator.next();
|
||||
contig = nextLocus.getContig();
|
||||
nextBatch.add(nextLocus);
|
||||
}
|
||||
}
|
||||
|
||||
if(nextBatch.size() > 0) {
|
||||
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
|
||||
}
|
||||
}
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge / split intervals based on an awareness of the structure of the BAM file.
|
||||
* @param dataSource
|
||||
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
|
||||
* @param loci
|
||||
* @return
|
||||
*/
|
||||
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
|
||||
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
|
||||
if(contig == null) {
|
||||
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(id,null);
|
||||
return Collections.singletonList(filePointer);
|
||||
}
|
||||
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
FilePointer lastFilePointer = null;
|
||||
BAMOverlap lastBAMOverlap = null;
|
||||
|
||||
Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
|
||||
IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
|
||||
for(SAMReaderID id: dataSource.getReaderIDs()) {
|
||||
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
|
||||
// If this contig can't be found in the reference, skip over it.
|
||||
if(referenceSequence == null && contig != null)
|
||||
continue;
|
||||
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
|
||||
binMerger.addReader(id,
|
||||
index,
|
||||
referenceSequence.getSequenceIndex(),
|
||||
index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
|
||||
// Cache the reader for later data lookup.
|
||||
readerToIndexMap.put(id,index);
|
||||
}
|
||||
|
||||
PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
|
||||
|
||||
for(GenomeLoc location: loci) {
|
||||
if(!location.getContig().equals(contig))
|
||||
throw new ReviewedStingException("Location outside bounds of contig");
|
||||
|
||||
if(!binIterator.hasNext())
|
||||
break;
|
||||
|
||||
int locationStart = location.getStart();
|
||||
final int locationStop = location.getStop();
|
||||
|
||||
// Advance to first bin.
|
||||
while(binIterator.peek().stop < locationStart)
|
||||
binIterator.next();
|
||||
|
||||
// Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
|
||||
// sure the extending bin is not pruned from the list.
|
||||
List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
|
||||
while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
|
||||
bamOverlaps.add(binIterator.next());
|
||||
if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
|
||||
bamOverlaps.add(binIterator.peek());
|
||||
|
||||
// Bins found; try to match bins with locations.
|
||||
Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
|
||||
|
||||
while(locationStop >= locationStart) {
|
||||
int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
|
||||
int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
|
||||
|
||||
while(binStop < locationStart && bamOverlapIterator.hasNext()) {
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
||||
filePointers.add(lastFilePointer);
|
||||
|
||||
lastBAMOverlap = bamOverlapIterator.next();
|
||||
lastFilePointer = new FilePointer(lastBAMOverlap);
|
||||
binStart = lastFilePointer.overlap.start;
|
||||
binStop = lastFilePointer.overlap.stop;
|
||||
}
|
||||
|
||||
if(locationStart < binStart) {
|
||||
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
||||
filePointers.add(lastFilePointer);
|
||||
lastFilePointer = null;
|
||||
lastBAMOverlap = null;
|
||||
}
|
||||
|
||||
final int regionStop = Math.min(locationStop,binStart-1);
|
||||
|
||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
|
||||
lastFilePointer = new FilePointer(subset);
|
||||
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
else if(locationStart > binStop) {
|
||||
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
||||
filePointers.add(lastFilePointer);
|
||||
lastFilePointer = null;
|
||||
lastBAMOverlap = null;
|
||||
}
|
||||
|
||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
|
||||
filePointers.add(new FilePointer(subset));
|
||||
|
||||
locationStart = locationStop + 1;
|
||||
}
|
||||
else {
|
||||
if(lastFilePointer == null)
|
||||
throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
|
||||
|
||||
// The start of the region overlaps the bin. Add the overlapping subset.
|
||||
final int regionStop = Math.min(locationStop,binStop);
|
||||
lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
||||
filePointers.add(lastFilePointer);
|
||||
|
||||
// Lookup the locations for every file pointer in the index.
|
||||
for(SAMReaderID id: readerToIndexMap.keySet()) {
|
||||
BrowseableBAMIndex index = readerToIndexMap.get(id);
|
||||
for(FilePointer filePointer: filePointers)
|
||||
filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
|
||||
}
|
||||
|
||||
return filePointers;
|
||||
}
|
||||
|
||||
private static class BinMergingIterator implements Iterator<BAMOverlap> {
|
||||
private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
|
||||
private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
|
||||
|
||||
public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
|
||||
binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
|
||||
}
|
||||
|
||||
public BAMOverlap next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No elements left in merging iterator");
|
||||
if(pendingOverlaps.isEmpty())
|
||||
advance();
|
||||
return pendingOverlaps.remove();
|
||||
}
|
||||
|
||||
public void advance() {
|
||||
List<ReaderBin> bins = new ArrayList<ReaderBin>();
|
||||
int boundsStart, boundsStop;
|
||||
|
||||
// Prime the pump
|
||||
if(binQueue.isEmpty())
|
||||
return;
|
||||
bins.add(getNextBin());
|
||||
boundsStart = bins.get(0).getStart();
|
||||
boundsStop = bins.get(0).getStop();
|
||||
|
||||
// Accumulate all the bins that overlap the current bin, in sorted order.
|
||||
while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
|
||||
ReaderBin bin = getNextBin();
|
||||
bins.add(bin);
|
||||
boundsStart = Math.min(boundsStart,bin.getStart());
|
||||
boundsStop = Math.max(boundsStop,bin.getStop());
|
||||
}
|
||||
|
||||
List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
|
||||
int start = bins.get(0).getStart();
|
||||
int stop = bins.get(0).getStop();
|
||||
while(start <= boundsStop) {
|
||||
// Find the next stopping point.
|
||||
for(ReaderBin bin: bins) {
|
||||
stop = Math.min(stop,bin.getStop());
|
||||
if(start < bin.getStart())
|
||||
stop = Math.min(stop,bin.getStart()-1);
|
||||
}
|
||||
|
||||
range.add(new Pair<Integer,Integer>(start,stop));
|
||||
// If the last entry added included the last element, stop.
|
||||
if(stop >= boundsStop)
|
||||
break;
|
||||
|
||||
// Find the next start.
|
||||
start = stop + 1;
|
||||
for(ReaderBin bin: bins) {
|
||||
if(start >= bin.getStart() && start <= bin.getStop())
|
||||
break;
|
||||
else if(start < bin.getStart()) {
|
||||
start = bin.getStart();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add the next series of BAM overlaps to the window.
|
||||
for(Pair<Integer,Integer> window: range) {
|
||||
BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
|
||||
for(ReaderBin bin: bins)
|
||||
bamOverlap.addBin(bin.id,bin.bin);
|
||||
pendingOverlaps.add(bamOverlap);
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
|
||||
|
||||
private ReaderBin peekNextBin() {
|
||||
if(binQueue.isEmpty())
|
||||
throw new NoSuchElementException("No more bins are available");
|
||||
BinQueueState current = binQueue.peek();
|
||||
return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
|
||||
}
|
||||
|
||||
private ReaderBin getNextBin() {
|
||||
if(binQueue.isEmpty())
|
||||
throw new NoSuchElementException("No more bins are available");
|
||||
BinQueueState current = binQueue.remove();
|
||||
ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
|
||||
if(current.hasNextBin())
|
||||
binQueue.add(current);
|
||||
return readerBin;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters out bins not at the lowest level in the tree.
|
||||
*/
|
||||
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
|
||||
private BrowseableBAMIndex index;
|
||||
private Iterator<Bin> wrappedIterator;
|
||||
|
||||
private Bin nextBin;
|
||||
|
||||
public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
|
||||
this.index = index;
|
||||
this.wrappedIterator = iterator;
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextBin != null;
|
||||
}
|
||||
|
||||
public Bin next() {
|
||||
Bin bin = nextBin;
|
||||
advance();
|
||||
return bin;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
|
||||
|
||||
private void advance() {
|
||||
nextBin = null;
|
||||
while(wrappedIterator.hasNext() && nextBin == null) {
|
||||
Bin bin = wrappedIterator.next();
|
||||
if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
|
||||
nextBin = bin;
|
||||
}
|
||||
}
|
||||
}
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||
}
|
||||
|
||||
class BinQueueState implements Comparable<org.broadinstitute.sting.gatk.datasources.reads.BinQueueState> {
|
||||
private final SAMReaderID id;
|
||||
private final BrowseableBAMIndex index;
|
||||
private final int referenceSequence;
|
||||
private final PeekableIterator<Bin> bins;
|
||||
|
||||
private int firstLocusInCurrentBin;
|
||||
private int lastLocusInCurrentBin;
|
||||
|
||||
public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
|
||||
this.id = id;
|
||||
this.index = index;
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.bins = new PeekableIterator<Bin>(bins);
|
||||
refreshLocusInBinCache();
|
||||
}
|
||||
|
||||
public SAMReaderID getReaderID() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public BrowseableBAMIndex getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public int getReferenceSequence() {
|
||||
return referenceSequence;
|
||||
}
|
||||
|
||||
public boolean hasNextBin() {
|
||||
return bins.hasNext();
|
||||
}
|
||||
|
||||
public Bin peekNextBin() {
|
||||
return bins.peek();
|
||||
}
|
||||
|
||||
public Bin nextBin() {
|
||||
Bin nextBin = bins.next();
|
||||
refreshLocusInBinCache();
|
||||
return nextBin;
|
||||
}
|
||||
|
||||
public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
|
||||
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
|
||||
if(!this.bins.hasNext()) return -1;
|
||||
if(!this.bins.hasNext()) return 1;
|
||||
|
||||
// Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
|
||||
if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
|
||||
other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
|
||||
throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
|
||||
}
|
||||
|
||||
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
|
||||
if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
|
||||
return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
|
||||
|
||||
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
|
||||
return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
|
||||
}
|
||||
|
||||
private void refreshLocusInBinCache() {
|
||||
firstLocusInCurrentBin = -1;
|
||||
lastLocusInCurrentBin = -1;
|
||||
if(bins.hasNext()) {
|
||||
Bin bin = bins.peek();
|
||||
firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
|
||||
lastLocusInCurrentBin = index.getLastLocusInBin(bin);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Batch granular file pointers into potentially larger shards.
|
||||
*/
|
||||
public class LocusShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
public boolean hasNext() {
|
||||
return filePointers.hasNext();
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
FilePointer current = filePointers.next();
|
||||
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
|
||||
current = current.combine(parser,filePointers.next());
|
||||
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -1,178 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A sharding strategy for loci based on reading of the index.
|
||||
*/
|
||||
public class LocusShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* The data source to use when performing this sharding.
|
||||
*/
|
||||
private final SAMDataSource reads;
|
||||
|
||||
/**
|
||||
* the parser for creating shards
|
||||
*/
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* An iterator through the available file pointers.
|
||||
*/
|
||||
private final Iterator<FilePointer> filePointerIterator;
|
||||
|
||||
/**
|
||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
||||
* @param reads Data source from which to load index data.
|
||||
* @param locations List of locations for which to load data.
|
||||
*/
|
||||
public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
|
||||
this.reads = reads;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
|
||||
if(!reads.isEmpty()) {
|
||||
GenomeLocSortedSet intervals;
|
||||
if(locations == null) {
|
||||
// If no locations were passed in, shard the entire BAM file.
|
||||
SAMFileHeader header = reads.getHeader();
|
||||
intervals = new GenomeLocSortedSet(genomeLocParser);
|
||||
|
||||
for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
|
||||
// Check this sequence against the reference sequence dictionary.
|
||||
// TODO: Do a better job of merging reads + reference.
|
||||
SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
|
||||
if(refSequenceRecord != null) {
|
||||
final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
|
||||
intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
intervals = locations;
|
||||
|
||||
if(reads.isLowMemoryShardingEnabled()) {
|
||||
/*
|
||||
Iterator<FilePointer> filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
while(filePointerIterator.hasNext())
|
||||
filePointers.add(filePointerIterator.next());
|
||||
this.filePointerIterator = filePointers.iterator();
|
||||
*/
|
||||
this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
||||
}
|
||||
else
|
||||
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
|
||||
}
|
||||
else {
|
||||
final int maxShardSize = 100000;
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
if(locations == null) {
|
||||
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
||||
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
||||
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
||||
filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for(GenomeLoc interval: locations) {
|
||||
while(interval.size() > maxShardSize) {
|
||||
filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
|
||||
interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
||||
}
|
||||
filePointers.add(new FilePointer(interval));
|
||||
}
|
||||
}
|
||||
filePointerIterator = filePointers.iterator();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if there are additional shards
|
||||
*
|
||||
* @return false if we're done processing shards
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return filePointerIterator.hasNext();
|
||||
}
|
||||
|
||||
public long shardNumber = 0;
|
||||
|
||||
/**
|
||||
* gets the next Shard
|
||||
*
|
||||
* @return the next shard
|
||||
*/
|
||||
public LocusShard next() {
|
||||
FilePointer nextFilePointer = filePointerIterator.next();
|
||||
Map<SAMReaderID,SAMFileSpan> fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
|
||||
|
||||
/*
|
||||
System.out.printf("Shard %d: interval = {",++shardNumber);
|
||||
for(GenomeLoc locus: nextFilePointer.locations)
|
||||
System.out.printf("%s;",locus);
|
||||
System.out.printf("}; ");
|
||||
|
||||
if(fileSpansBounding == null)
|
||||
System.out.printf("no shard data%n");
|
||||
else {
|
||||
SortedMap<SAMReaderID,SAMFileSpan> sortedSpans = new TreeMap<SAMReaderID,SAMFileSpan>(fileSpansBounding);
|
||||
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: sortedSpans.entrySet()) {
|
||||
System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue());
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding);
|
||||
}
|
||||
|
||||
/** we don't support the remove command */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* makes the IntervalShard iterable, i.e. usable in a for loop.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Handles the process of aggregating BAM intervals into individual shards.
|
||||
*/
|
||||
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
||||
/**
|
||||
* The iterator actually laying out the data for BAM scheduling.
|
||||
*/
|
||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The parser, for interval manipulation.
|
||||
*/
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
||||
parser = loci.getGenomeLocParser();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||
* @return The next file pointer to process.
|
||||
*/
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||
}
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A single, monolithic shard bridging all available data.
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class MonolithicShard extends Shard {
|
||||
/**
|
||||
* Creates a new monolithic shard of the given type.
|
||||
* @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
|
||||
* @param locs Intervals that this monolithic shard should process.
|
||||
*/
|
||||
public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
|
||||
super(parser, shardType, locs, readsDataSource, null, false);
|
||||
if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
|
||||
throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
|
||||
}
|
||||
|
||||
/**
|
||||
* String representation of this shard.
|
||||
* @return "entire genome".
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return "entire genome";
|
||||
}
|
||||
}
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Create a giant shard representing all the data in the input BAM(s).
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class MonolithicShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* The single shard associated with this sharding strategy.
|
||||
*/
|
||||
private MonolithicShard shard;
|
||||
|
||||
/**
|
||||
* Create a new shard strategy for shards of the given type.
|
||||
* @param shardType The shard type.
|
||||
*/
|
||||
public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
|
||||
shard = new MonolithicShard(parser,readsDataSource,shardType,region);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator;
|
||||
* will only return another copy of the active iterator.
|
||||
* @return A copy of this.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the monolithic shard has not yet been consumed, or false otherwise.
|
||||
* @return True if shard has been consumed, false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return shard != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the monolithic shard if it has not already been retrieved.
|
||||
* @return The monolithic shard.
|
||||
* @throws NoSuchElementException if no such data exists.
|
||||
*/
|
||||
public Shard next() {
|
||||
if(shard == null)
|
||||
throw new NoSuchElementException("Monolithic shard has already been retrived.");
|
||||
|
||||
Shard working = shard;
|
||||
shard = null;
|
||||
return working;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a shard strategy");
|
||||
}
|
||||
|
||||
/**
|
||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
||||
* @param size adjust the next size to this
|
||||
*/
|
||||
public void adjustNextShardSize( long size ) {
|
||||
throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -35,10 +35,15 @@ import java.util.Map;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class ReadShard extends Shard {
|
||||
/**
|
||||
* What is the maximum number of reads which should go into a read shard.
|
||||
*/
|
||||
public static final int MAX_READS = 10000;
|
||||
|
||||
/**
|
||||
* The reads making up this shard.
|
||||
*/
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
|
||||
|
||||
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
|
||||
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
|
||||
|
|
@ -66,7 +71,7 @@ public class ReadShard extends Shard {
|
|||
* @return True if this shard's buffer is full (and the shard can buffer reads).
|
||||
*/
|
||||
public boolean isBufferFull() {
|
||||
return reads.size() > ReadShardStrategy.MAX_READS;
|
||||
return reads.size() > ReadShard.MAX_READS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
*/
|
||||
public class ReadShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
||||
* has a specific number of reads (default to 10K) which is configured in the constructor.
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 14, 2009
|
||||
*/
|
||||
public class ReadShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* What is the maximum number of reads which should go into a read shard.
|
||||
*/
|
||||
protected static final int MAX_READS = 10000;
|
||||
|
||||
/**
|
||||
* The data source used to shard.
|
||||
*/
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
/**
|
||||
* The intervals to be processed.
|
||||
*/
|
||||
private final GenomeLocSortedSet locations;
|
||||
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/** our storage of the genomic locations they'd like to shard over */
|
||||
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
|
||||
/**
|
||||
* Iterator over the list of file pointers.
|
||||
*/
|
||||
private final Iterator<FilePointer> filePointerIterator;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,SAMFileSpan> position;
|
||||
|
||||
/**
|
||||
* An indicator whether the strategy has sharded into the unmapped region.
|
||||
*/
|
||||
private boolean isIntoUnmappedRegion = false;
|
||||
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
/**
|
||||
* Create a new read shard strategy, loading read shards from the given BAM file.
|
||||
* @param dataSource Data source from which to load shards.
|
||||
* @param locations intervals to use for sharding.
|
||||
*/
|
||||
public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
||||
this.dataSource = dataSource;
|
||||
this.parser = parser;
|
||||
this.position = this.dataSource.getCurrentPosition();
|
||||
this.locations = locations;
|
||||
|
||||
if(locations != null)
|
||||
filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
|
||||
else
|
||||
filePointerIterator = filePointers.iterator();
|
||||
|
||||
if(filePointerIterator.hasNext())
|
||||
currentFilePointer = filePointerIterator.next();
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have another read shard?
|
||||
* @return True if any more data is available. False otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the next shard, if available.
|
||||
* @return The next shard, if available.
|
||||
* @throws java.util.NoSuchElementException if no such shard is available.
|
||||
*/
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
nextShard = null;
|
||||
|
||||
if(locations != null) {
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
dataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// todo -- this nulling of intervals is a bit annoying since readwalkers without
|
||||
// todo -- any -L values need to be special cased throughout the code.
|
||||
Shard shard = new ReadShard(parser,dataSource,position,null,false);
|
||||
dataSource.fillShard(shard);
|
||||
nextShard = !shard.isBufferEmpty() ? shard : null;
|
||||
}
|
||||
|
||||
this.position = dataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Remove not supported");
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for using ShardStrategy in an foreach loop.
|
||||
* @return A iterator over shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.BrowseableBAMIndex;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 2, 2011
|
||||
* Time: 4:36:40 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class ReaderBin {
|
||||
public final SAMReaderID id;
|
||||
public final BrowseableBAMIndex index;
|
||||
public final int referenceSequence;
|
||||
public final Bin bin;
|
||||
|
||||
public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
|
||||
this.id = id;
|
||||
this.index = index;
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.bin = bin;
|
||||
}
|
||||
|
||||
public int getStart() {
|
||||
return index.getFirstLocusInBin(bin);
|
||||
}
|
||||
|
||||
public int getStop() {
|
||||
return index.getLastLocusInBin(bin);
|
||||
}
|
||||
}
|
||||
|
|
@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -71,7 +73,7 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
|
||||
*/
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
protected final GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* Identifiers for the readers driving this data source.
|
||||
|
|
@ -91,13 +93,18 @@ public class SAMDataSource {
|
|||
/**
|
||||
* How far along is each reader?
|
||||
*/
|
||||
private final Map<SAMReaderID, SAMFileSpan> readerPositions = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
|
||||
|
||||
/**
|
||||
* The merged header.
|
||||
*/
|
||||
private final SAMFileHeader mergedHeader;
|
||||
|
||||
/**
|
||||
* The constituent headers of the unmerged files.
|
||||
*/
|
||||
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
|
||||
|
||||
/**
|
||||
* The sort order of the BAM files. Files without a sort order tag are assumed to be
|
||||
* in coordinate order.
|
||||
|
|
@ -131,17 +138,24 @@ public class SAMDataSource {
|
|||
private final SAMResourcePool resourcePool;
|
||||
|
||||
/**
|
||||
* Whether to enable the new low-memory sharding mechanism.
|
||||
* Asynchronously loads BGZF blocks.
|
||||
*/
|
||||
private boolean enableLowMemorySharding = false;
|
||||
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* How are threads allocated.
|
||||
*/
|
||||
private final ThreadAllocation threadAllocation;
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
* @param samFiles list of reads files.
|
||||
*/
|
||||
public SAMDataSource(Collection<SAMReaderID> samFiles,GenomeLocParser genomeLocParser) {
|
||||
public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
|
||||
this(
|
||||
samFiles,
|
||||
threadAllocation,
|
||||
numFileHandles,
|
||||
genomeLocParser,
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.STRICT,
|
||||
|
|
@ -150,8 +164,7 @@ public class SAMDataSource {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
false,
|
||||
false,
|
||||
true);
|
||||
false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -159,6 +172,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
public SAMDataSource(
|
||||
Collection<SAMReaderID> samFiles,
|
||||
ThreadAllocation threadAllocation,
|
||||
Integer numFileHandles,
|
||||
GenomeLocParser genomeLocParser,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
|
|
@ -167,9 +182,10 @@ public class SAMDataSource {
|
|||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
boolean generateExtendedEvents,
|
||||
boolean enableLowMemorySharding) {
|
||||
boolean generateExtendedEvents) {
|
||||
this( samFiles,
|
||||
threadAllocation,
|
||||
numFileHandles,
|
||||
genomeLocParser,
|
||||
useOriginalBaseQualities,
|
||||
strictness,
|
||||
|
|
@ -182,8 +198,7 @@ public class SAMDataSource {
|
|||
BAQ.CalculationMode.OFF,
|
||||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null, // no BAQ
|
||||
(byte) -1,
|
||||
enableLowMemorySharding);
|
||||
(byte) -1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -205,6 +220,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
public SAMDataSource(
|
||||
Collection<SAMReaderID> samFiles,
|
||||
ThreadAllocation threadAllocation,
|
||||
Integer numFileHandles,
|
||||
GenomeLocParser genomeLocParser,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
|
|
@ -217,13 +234,19 @@ public class SAMDataSource {
|
|||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
byte defaultBaseQualities,
|
||||
boolean enableLowMemorySharding) {
|
||||
this.enableLowMemorySharding(enableLowMemorySharding);
|
||||
byte defaultBaseQualities) {
|
||||
this.readMetrics = new ReadMetrics();
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
|
||||
readerIDs = samFiles;
|
||||
|
||||
this.threadAllocation = threadAllocation;
|
||||
// TODO: Consider a borrowed-thread dispatcher implementation.
|
||||
if(this.threadAllocation.getNumIOThreads() > 0)
|
||||
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
|
||||
else
|
||||
dispatcher = null;
|
||||
|
||||
validationStringency = strictness;
|
||||
for (SAMReaderID readerID : samFiles) {
|
||||
if (!readerID.samFile.canRead())
|
||||
|
|
@ -235,10 +258,13 @@ public class SAMDataSource {
|
|||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
// Determine the sort order.
|
||||
for(SAMFileReader reader: readers.values()) {
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
// Get the sort order, forcing it to coordinate if unsorted.
|
||||
SAMFileReader reader = readers.getReader(readerID);
|
||||
SAMFileHeader header = reader.getFileHeader();
|
||||
|
||||
headers.put(readerID,header);
|
||||
|
||||
if ( header.getReadGroups().isEmpty() ) {
|
||||
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
|
||||
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
|
||||
|
|
@ -275,7 +301,7 @@ public class SAMDataSource {
|
|||
qmode,
|
||||
refReader,
|
||||
defaultBaseQualities);
|
||||
|
||||
|
||||
// cache the read group id (original) -> read group id (merged)
|
||||
// and read group id (merged) -> read group id (original) mappings.
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
|
|
@ -296,12 +322,10 @@ public class SAMDataSource {
|
|||
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
||||
}
|
||||
|
||||
if(enableLowMemorySharding) {
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null)
|
||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||
}
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null)
|
||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||
}
|
||||
|
||||
resourcePool.releaseReaders(readers);
|
||||
|
|
@ -314,22 +338,6 @@ public class SAMDataSource {
|
|||
*/
|
||||
public ReadProperties getReadsInfo() { return readProperties; }
|
||||
|
||||
/**
|
||||
* Enable experimental low-memory sharding.
|
||||
* @param enable True to enable sharding. False otherwise.
|
||||
*/
|
||||
public void enableLowMemorySharding(final boolean enable) {
|
||||
enableLowMemorySharding = enable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether low-memory sharding is enabled.
|
||||
* @return True if enabled, false otherwise.
|
||||
*/
|
||||
public boolean isLowMemoryShardingEnabled() {
|
||||
return enableLowMemorySharding;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see whether any reads files are supplying data.
|
||||
* @return True if no reads files are supplying data to the traversal; false otherwise.
|
||||
|
|
@ -368,7 +376,7 @@ public class SAMDataSource {
|
|||
* Retrieves the current position within the BAM file.
|
||||
* @return A mapping of reader to current position.
|
||||
*/
|
||||
public Map<SAMReaderID,SAMFileSpan> getCurrentPosition() {
|
||||
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
|
||||
return readerPositions;
|
||||
}
|
||||
|
||||
|
|
@ -381,7 +389,7 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
public SAMFileHeader getHeader(SAMReaderID id) {
|
||||
return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader();
|
||||
return headers.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -404,45 +412,21 @@ public class SAMDataSource {
|
|||
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* No read group collisions at this time because only one SAM file is currently supported.
|
||||
* @return False always.
|
||||
*/
|
||||
public boolean hasReadGroupCollisions() {
|
||||
return hasReadGroupCollisions;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if all readers have an index.
|
||||
* @return True if all readers have an index.
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
if(enableLowMemorySharding)
|
||||
return readerIDs.size() == bamIndices.size();
|
||||
else {
|
||||
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
|
||||
if(!reader.hasIndex())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return readerIDs.size() == bamIndices.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the index for a particular reader. Always preloaded.
|
||||
* TODO: Should return object of type GATKBAMIndex, but cannot because there
|
||||
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
|
||||
* TODO: sharding system goes live.
|
||||
* @param id Id of the reader.
|
||||
* @return The index. Will preload the index if necessary.
|
||||
*/
|
||||
public Object getIndex(final SAMReaderID id) {
|
||||
if(enableLowMemorySharding)
|
||||
return bamIndices.get(id);
|
||||
else {
|
||||
SAMReaders readers = resourcePool.getReadersWithoutLocking();
|
||||
return readers.getReader(id).getBrowseableIndex();
|
||||
}
|
||||
public GATKBAMIndex getIndex(final SAMReaderID id) {
|
||||
return bamIndices.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -454,7 +438,7 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets the cumulative read metrics for shards already processed.
|
||||
* Gets the cumulative read metrics for shards already processed.
|
||||
* @return Cumulative read metrics.
|
||||
*/
|
||||
public ReadMetrics getCumulativeReadMetrics() {
|
||||
|
|
@ -507,10 +491,6 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
public StingSAMIterator seek(Shard shard) {
|
||||
// todo: refresh monolithic sharding implementation
|
||||
if(shard instanceof MonolithicShard)
|
||||
return seekMonolithic(shard);
|
||||
|
||||
if(shard.buffersReads()) {
|
||||
return shard.iterator();
|
||||
}
|
||||
|
|
@ -540,7 +520,7 @@ public class SAMDataSource {
|
|||
*/
|
||||
private void initializeReaderPositions(SAMReaders readers) {
|
||||
for(SAMReaderID id: getReaderIDs())
|
||||
readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads());
|
||||
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -548,7 +528,6 @@ public class SAMDataSource {
|
|||
* @param readers Readers from which to load data.
|
||||
* @param shard The shard specifying the data limits.
|
||||
* @param enableVerification True to verify. For compatibility with old sharding strategy.
|
||||
* TODO: Collapse this flag when the two sharding systems are merged.
|
||||
* @return An iterator over the selected data.
|
||||
*/
|
||||
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
|
||||
|
|
@ -559,14 +538,20 @@ public class SAMDataSource {
|
|||
|
||||
for(SAMReaderID id: getReaderIDs()) {
|
||||
CloseableIterator<SAMRecord> iterator = null;
|
||||
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
|
||||
continue;
|
||||
iterator = shard.getFileSpans().get(id) != null ?
|
||||
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
|
||||
readers.getReader(id).queryUnmapped();
|
||||
|
||||
// TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
|
||||
// TODO: Kill this check once we've proven that the design elements are gone.
|
||||
if(shard.getFileSpans().get(id) == null)
|
||||
throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
|
||||
|
||||
if(threadAllocation.getNumIOThreads() > 0) {
|
||||
BlockInputStream inputStream = readers.getInputStream(id);
|
||||
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
|
||||
}
|
||||
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
if(readProperties.getReadBufferSize() != null)
|
||||
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
||||
if(shard.getGenomeLocs() != null)
|
||||
if(shard.getGenomeLocs().size() > 0)
|
||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||
mergingIterator.addIterator(readers.getReader(id),iterator);
|
||||
}
|
||||
|
|
@ -584,33 +569,6 @@ public class SAMDataSource {
|
|||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
/**
|
||||
* A stopgap measure to handle monolithic sharding
|
||||
* @param shard the (monolithic) shard.
|
||||
* @return An iterator over the monolithic shard.
|
||||
*/
|
||||
private StingSAMIterator seekMonolithic(Shard shard) {
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
|
||||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
for(SAMReaderID id: getReaderIDs())
|
||||
mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
|
||||
|
||||
return applyDecoratingIterators(shard.getReadMetrics(),
|
||||
shard instanceof ReadShard,
|
||||
readProperties.useOriginalBaseQualities(),
|
||||
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
||||
readProperties.getDownsamplingMethod().toFraction,
|
||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getBAQCalculationMode(),
|
||||
readProperties.getBAQQualityMode(),
|
||||
readProperties.getRefReader(),
|
||||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds this read to the given shard.
|
||||
* @param shard The shard to which to add the read.
|
||||
|
|
@ -618,7 +576,7 @@ public class SAMDataSource {
|
|||
* @param read The read to add to the shard.
|
||||
*/
|
||||
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
|
||||
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
|
||||
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
|
||||
shard.addRead(read);
|
||||
readerPositions.put(id,endChunk);
|
||||
}
|
||||
|
|
@ -689,19 +647,6 @@ public class SAMDataSource {
|
|||
this.maxEntries = maxEntries;
|
||||
}
|
||||
|
||||
/**
|
||||
* Dangerous internal method; retrieves any set of readers, whether in iteration or not.
|
||||
* Used to handle non-exclusive, stateless operations, such as index queries.
|
||||
* @return Any collection of SAMReaders, whether in iteration or not.
|
||||
*/
|
||||
protected SAMReaders getReadersWithoutLocking() {
|
||||
synchronized(this) {
|
||||
if(allResources.size() == 0)
|
||||
createNewResource();
|
||||
}
|
||||
return allResources.get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Choose a set of readers from the pool to use for this query. When complete,
|
||||
* @return
|
||||
|
|
@ -753,6 +698,11 @@ public class SAMDataSource {
|
|||
*/
|
||||
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
|
||||
|
||||
/**
|
||||
* The inptu streams backing
|
||||
*/
|
||||
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
|
||||
|
||||
/**
|
||||
* Derive a new set of readers from the Reads metadata.
|
||||
* @param readerIDs reads to load.
|
||||
|
|
@ -760,12 +710,20 @@ public class SAMDataSource {
|
|||
*/
|
||||
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
SAMFileReader reader = new SAMFileReader(readerID.samFile);
|
||||
File indexFile = findIndexFile(readerID.samFile);
|
||||
|
||||
SAMFileReader reader = null;
|
||||
|
||||
if(threadAllocation.getNumIOThreads() > 0) {
|
||||
BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
||||
inputStreams.put(readerID,blockInputStream);
|
||||
}
|
||||
else
|
||||
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||
reader.setSAMRecordFactory(factory);
|
||||
|
||||
reader.enableFileSource(true);
|
||||
reader.enableIndexMemoryMapping(false);
|
||||
if(!enableLowMemorySharding)
|
||||
reader.enableIndexCaching(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
|
||||
final SAMFileHeader header = reader.getFileHeader();
|
||||
|
|
@ -786,6 +744,15 @@ public class SAMDataSource {
|
|||
return readers.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the input stream backing a reader.
|
||||
* @param id The ID of the reader to retrieve.
|
||||
* @return the reader associated with the given id.
|
||||
*/
|
||||
public BlockInputStream getInputStream(final SAMReaderID id) {
|
||||
return inputStreams.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for the reader id of this reader.
|
||||
* @param reader Reader for which to search.
|
||||
|
|
@ -883,7 +850,7 @@ public class SAMDataSource {
|
|||
* Filters out reads that do not overlap the current GenomeLoc.
|
||||
* Note the custom implementation: BAM index querying returns all reads that could
|
||||
* possibly overlap the given region (and quite a few extras). In order not to drag
|
||||
* down performance, this implementation is highly customized to its task.
|
||||
* down performance, this implementation is highly customized to its task.
|
||||
*/
|
||||
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
|
|
@ -903,7 +870,7 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* Custom representation of interval bounds.
|
||||
* Makes it simpler to track current position.
|
||||
* Makes it simpler to track current position.
|
||||
*/
|
||||
private int[] intervalContigIndices;
|
||||
private int[] intervalStarts;
|
||||
|
|
@ -941,7 +908,7 @@ public class SAMDataSource {
|
|||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
|
|
@ -1070,6 +1037,40 @@ public class SAMDataSource {
|
|||
|
||||
return indexFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
|
||||
* will be as granular as possible given our current knowledge of the best ways to split up BAM files.
|
||||
* @return An iterator that spans all reads in all BAM files.
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
|
||||
* read that has been assigned
|
||||
* @return
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a schedule for processing the initialized BAM file using the given interval list.
|
||||
* The returned schedule should be as granular as possible.
|
||||
* @param intervals The list of intervals for which to create the schedule.
|
||||
* @return A granular iterator over file pointers.
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
|
||||
if(intervals == null)
|
||||
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: 10/14/11
|
||||
* Time: 10:47 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class SAMReaderPosition {
|
||||
private final SAMReaderID reader;
|
||||
private final BlockInputStream inputStream;
|
||||
|
||||
private final List<GATKChunk> positions;
|
||||
private PeekableIterator<GATKChunk> positionIterator;
|
||||
|
||||
/**
|
||||
* Stores the next block address to read, or -1 if no such block is available.
|
||||
*/
|
||||
private long nextBlockAddress;
|
||||
|
||||
|
||||
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||
this.reader = reader;
|
||||
this.inputStream = inputStream;
|
||||
|
||||
this.positions = fileSpan.getGATKChunks();
|
||||
initialize();
|
||||
}
|
||||
|
||||
public SAMReaderID getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
public BlockInputStream getInputStream() {
|
||||
return inputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the next block address to be read.
|
||||
* @return Next block address to be read.
|
||||
*/
|
||||
public long getBlockAddress() {
|
||||
return nextBlockAddress;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the SAM reader position to its original state.
|
||||
*/
|
||||
private void initialize() {
|
||||
this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
|
||||
if(positionIterator.hasNext())
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
else
|
||||
nextBlockAddress = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the current position to the next block to read, given the current position in the file.
|
||||
* @param filePosition The current position within the file.
|
||||
*/
|
||||
void advancePosition(final long filePosition) {
|
||||
nextBlockAddress = filePosition;
|
||||
|
||||
// Check the current file position against the iterator; if the iterator is before the current file position,
|
||||
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
||||
try {
|
||||
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
|
||||
positionIterator.next();
|
||||
// Check to see if the iterator has more data available.
|
||||
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(Exception ex) {
|
||||
throw new ReviewedStingException("");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) {
|
||||
return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Balances maximally granular file pointers into shards of reasonable size.
|
||||
*/
|
||||
public abstract class ShardBalancer implements Iterable<Shard> {
|
||||
protected SAMDataSource readsDataSource;
|
||||
protected PeekableIterator<FilePointer> filePointers;
|
||||
protected GenomeLocParser parser;
|
||||
|
||||
public void initialize(final SAMDataSource readsDataSource, final Iterator<FilePointer> filePointers, final GenomeLocParser parser) {
|
||||
this.readsDataSource = readsDataSource;
|
||||
this.filePointers = new PeekableIterator<FilePointer>(filePointers);
|
||||
this.parser = parser;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import java.util.Iterator;
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 10, 2009
|
||||
* Time: 4:55:37 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 10, 2009
|
||||
* <p/>
|
||||
* Interface ShardStrategy
|
||||
* <p/>
|
||||
* The base interface for the sharding strategy; before we had a base abstract
|
||||
* class, but not this will be an interface to accomidate read based sharding
|
||||
*/
|
||||
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
||||
}
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 6, 2009
|
||||
* Time: 7:09:22 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 6, 2009
|
||||
* <p/>
|
||||
* Class ShardStrategyFactory
|
||||
* <p/>
|
||||
* The Shard Strategy Factory, use this class to create and transfer shard strategies
|
||||
* between different approaches.
|
||||
*/
|
||||
public class ShardStrategyFactory {
|
||||
public enum SHATTER_STRATEGY {
|
||||
MONOLITHIC, // Put all of the available data into one shard.
|
||||
LOCUS_EXPERIMENTAL,
|
||||
READS_EXPERIMENTAL
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
|
||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L);
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
|
||||
switch (strat) {
|
||||
case LOCUS_EXPERIMENTAL:
|
||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null);
|
||||
case READS_EXPERIMENTAL:
|
||||
return new ReadShardStrategy(genomeLocParser,readsDataSource,null);
|
||||
default:
|
||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
|
||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource The reads used to shatter this file.
|
||||
* @param referenceDataSource The reference used to shatter this file.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return A strategy for shattering this data.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
|
||||
switch (strat) {
|
||||
case LOCUS_EXPERIMENTAL:
|
||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst);
|
||||
case READS_EXPERIMENTAL:
|
||||
return new ReadShardStrategy(genomeLocParser, readsDataSource,lst);
|
||||
default:
|
||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
|
@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
// initialize reads
|
||||
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
|
||||
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
|
||||
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
|
||||
|
||||
// intervals
|
||||
GenomeLocSortedSet intervalSortedSet = null;
|
||||
|
|
@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
|
||||
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
FilePointer filePointer = sharder.next();
|
||||
|
||||
|
|
@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
|
||||
|
||||
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
FilePointer filePointer = sharder.next();
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex;
|
|||
import net.sf.picard.reference.FastaSequenceIndexBuilder;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.sam.CreateSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
|
@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
|||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Loads reference data from fasta file
|
||||
* Looks for fai and dict files, and tries to create them if they don't exist
|
||||
*/
|
||||
public class ReferenceDataSource {
|
||||
private IndexedFastaSequenceFile index;
|
||||
private IndexedFastaSequenceFile reference;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
|
||||
|
|
@ -173,7 +185,7 @@ public class ReferenceDataSource {
|
|||
logger.info("Treating existing index file as complete.");
|
||||
}
|
||||
|
||||
index = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||
reference = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
|
||||
|
|
@ -192,6 +204,52 @@ public class ReferenceDataSource {
|
|||
* @return IndexedFastaSequenceFile that was created from file
|
||||
*/
|
||||
public IndexedFastaSequenceFile getReference() {
|
||||
return this.index;
|
||||
return this.reference;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an iterator for processing the entire reference.
|
||||
* @param readsDataSource the reads datasource to embed in the locus shard.
|
||||
* @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||
* @param maxShardSize The maximum shard size which can be used to create this list.
|
||||
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||
*/
|
||||
public Iterable<Shard> createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
|
||||
List<Shard> shards = new ArrayList<Shard>();
|
||||
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
||||
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
||||
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
||||
shards.add(new LocusShard(parser,
|
||||
readsDataSource,
|
||||
Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
|
||||
null));
|
||||
}
|
||||
}
|
||||
return shards;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an iterator for processing the entire reference.
|
||||
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||
* @param intervals the list of intervals to use when processing the reference.
|
||||
* @param maxShardSize The maximum shard size which can be used to create this list.
|
||||
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||
*/
|
||||
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
|
||||
List<Shard> shards = new ArrayList<Shard>();
|
||||
for(GenomeLoc interval: intervals) {
|
||||
while(interval.size() > maxShardSize) {
|
||||
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||
readsDataSource,
|
||||
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
|
||||
null));
|
||||
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
||||
}
|
||||
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||
readsDataSource,
|
||||
Collections.singletonList(interval),
|
||||
null));
|
||||
}
|
||||
return shards;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
||||
|
|
@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||
}
|
||||
|
||||
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
|
||||
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
|
||||
// Fast fail for walkers not supporting TreeReducible interface.
|
||||
if (!( walker instanceof TreeReducible ))
|
||||
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
|
|||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
|
|
@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
* @param walker Computation to perform over dataset.
|
||||
* @param shardStrategy A strategy for sharding the data.
|
||||
*/
|
||||
public Object execute(Walker walker, ShardStrategy shardStrategy) {
|
||||
public Object execute(Walker walker, Iterable<Shard> shardStrategy) {
|
||||
walker.initialize();
|
||||
Accumulator accumulator = Accumulator.create(engine,walker);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
* @param reads the informations associated with the reads
|
||||
* @param reference the reference file
|
||||
* @param rods the rods to include in the traversal
|
||||
* @param nThreadsToUse Number of threads to utilize.
|
||||
* @param threadAllocation Number of threads to utilize.
|
||||
*
|
||||
* @return The best-fit microscheduler.
|
||||
*/
|
||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
|
||||
if (walker instanceof TreeReducible && nThreadsToUse > 1) {
|
||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
|
||||
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
|
||||
if(walker.isReduceByInterval())
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
if(walker instanceof ReadWalker)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse));
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse);
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
|
||||
} else {
|
||||
if(nThreadsToUse > 1)
|
||||
if(threadAllocation.getNumCPUThreads() > 1)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
|
||||
}
|
||||
|
|
@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
*
|
||||
* @return the return type of the walker
|
||||
*/
|
||||
public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
|
||||
public abstract Object execute(Walker walker, Iterable<Shard> shardStrategy);
|
||||
|
||||
/**
|
||||
* Retrieves the object responsible for tracking and managing output.
|
||||
|
|
|
|||
|
|
@ -6,10 +6,9 @@ import org.broad.tribble.annotation.Strand;
|
|||
import org.broad.tribble.dbsnp.OldDbSNPFeature;
|
||||
import org.broad.tribble.gelitext.GeliTextFeature;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -187,30 +186,23 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
||||
|
||||
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
|
||||
if ( index < 0 )
|
||||
return null; // we weren't given enough reference context to create the VariantContext
|
||||
Byte refBaseForIndel = new Byte(ref.getBases()[index]);
|
||||
|
||||
Map<String, Genotype> genotypes = null;
|
||||
VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel);
|
||||
return vc;
|
||||
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||
builder.source(name).id(dbsnp.getRsID());
|
||||
builder.loc(dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0));
|
||||
builder.alleles(alleles);
|
||||
builder.referenceBaseForIndel(refBaseForIndel);
|
||||
return builder.make();
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
}
|
||||
|
||||
public static VCFHeader createVCFHeader(Set<VCFHeaderLine> hInfo, VariantContext vc) {
|
||||
HashSet<String> names = new LinkedHashSet<String>();
|
||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
names.add(g.getSampleName());
|
||||
}
|
||||
|
||||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GELI to VariantContext
|
||||
|
|
@ -257,20 +249,15 @@ public class VariantContextAdaptors {
|
|||
else genotypeAlleles.add(refAllele);
|
||||
}
|
||||
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
MutableGenotype call = new MutableGenotype(name, genotypeAlleles);
|
||||
|
||||
// set the likelihoods, depth, and RMS mapping quality values
|
||||
//call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.getLikelihoods());
|
||||
//call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaximumMappingQual());
|
||||
//call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.getDepthOfCoverage());
|
||||
Genotype call = new Genotype(name, genotypeAlleles);
|
||||
|
||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||
genotypes.add(call);
|
||||
alleles.add(refAllele);
|
||||
VariantContext vc = VariantContextUtils.toVC(name, ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()), alleles, genotypes, geli.getLODBestToReference(), null, attributes);
|
||||
return vc;
|
||||
GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart());
|
||||
return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make();
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
|
|
@ -329,7 +316,7 @@ public class VariantContextAdaptors {
|
|||
String[] samples = hapmap.getSampleIDs();
|
||||
String[] genotypeStrings = hapmap.getGenotypes();
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
|
||||
GenotypesContext genotypes = GenotypesContext.create(samples.length);
|
||||
for ( int i = 0; i < samples.length; i++ ) {
|
||||
// ignore bad genotypes
|
||||
if ( genotypeStrings[i].contains("N") )
|
||||
|
|
@ -358,16 +345,13 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
|
||||
Genotype g = new Genotype(samples[i], myAlleles);
|
||||
genotypes.put(samples[i], g);
|
||||
genotypes.add(g);
|
||||
}
|
||||
|
||||
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
|
||||
attrs.put(VariantContext.ID_KEY, hapmap.getName());
|
||||
|
||||
long end = hapmap.getEnd();
|
||||
if ( deletionLength > 0 )
|
||||
end += deletionLength;
|
||||
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel);
|
||||
VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).referenceBaseForIndel(refBaseForIndel).make();
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,9 +6,10 @@ import java.util.TreeMap;
|
|||
* Holds values for a column in a GATK report table
|
||||
*/
|
||||
public class GATKReportColumn extends TreeMap<Object, Object> {
|
||||
private String columnName;
|
||||
private Object defaultValue;
|
||||
private boolean display;
|
||||
final private String columnName;
|
||||
final private Object defaultValue;
|
||||
final private String format;
|
||||
final private boolean display;
|
||||
|
||||
/**
|
||||
* Construct the column object, specifying the column name, default value, and whether or not the column should be displayed
|
||||
|
|
@ -18,11 +19,17 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
* @param display if true, the column will be displayed in the final output
|
||||
*/
|
||||
public GATKReportColumn(String columnName, Object defaultValue, boolean display) {
|
||||
this(columnName, defaultValue, display, null);
|
||||
}
|
||||
|
||||
public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||
this.columnName = columnName;
|
||||
this.defaultValue = defaultValue;
|
||||
this.display = display;
|
||||
this.format = format == null ? null : (format.equals("") ? null : format);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initialize an element in the column with a default value
|
||||
*
|
||||
|
|
@ -55,7 +62,7 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
* @return the string value at the specified position in the column, or the default value if the element is not set
|
||||
*/
|
||||
public String getStringValue(Object primaryKey) {
|
||||
return toString(getWithoutSideEffects(primaryKey));
|
||||
return formatValue(getWithoutSideEffects(primaryKey));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -77,7 +84,7 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
|
||||
for (Object obj : this.values()) {
|
||||
if (obj != null) {
|
||||
int width = toString(obj).length();
|
||||
int width = formatValue(obj).length();
|
||||
|
||||
if (width > maxWidth) {
|
||||
maxWidth = width;
|
||||
|
|
@ -93,10 +100,12 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
* @param obj The object to convert to a string
|
||||
* @return The string representation of the column
|
||||
*/
|
||||
private static String toString(Object obj) {
|
||||
private String formatValue(Object obj) {
|
||||
String value;
|
||||
if (obj == null) {
|
||||
value = "null";
|
||||
} else if ( format != null ) {
|
||||
value = String.format(format, obj);
|
||||
} else if (obj instanceof Float) {
|
||||
value = String.format("%.8f", (Float) obj);
|
||||
} else if (obj instanceof Double) {
|
||||
|
|
|
|||
|
|
@ -250,13 +250,12 @@ public class GATKReportTable {
|
|||
* @param defaultValue the default value for the column
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue) {
|
||||
if (!isValidName(columnName)) {
|
||||
throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed.");
|
||||
}
|
||||
|
||||
addColumn(columnName, defaultValue, true);
|
||||
addColumn(columnName, defaultValue, null);
|
||||
}
|
||||
|
||||
public void addColumn(String columnName, Object defaultValue, String format) {
|
||||
addColumn(columnName, defaultValue, true, format);
|
||||
}
|
||||
/**
|
||||
* Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file.
|
||||
*
|
||||
|
|
@ -265,7 +264,14 @@ public class GATKReportTable {
|
|||
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display) {
|
||||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display));
|
||||
addColumn(columnName, defaultValue, display, null);
|
||||
}
|
||||
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||
if (!isValidName(columnName)) {
|
||||
throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed.");
|
||||
}
|
||||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.resourcemanagement;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Models how threads are distributed between various components of the GATK.
|
||||
*/
|
||||
public class ThreadAllocation {
|
||||
/**
|
||||
* The number of CPU threads to be used by the GATK.
|
||||
*/
|
||||
private final int numCPUThreads;
|
||||
|
||||
/**
|
||||
* Number of threads to devote exclusively to IO. Default is 0.
|
||||
*/
|
||||
private final int numIOThreads;
|
||||
|
||||
public int getNumCPUThreads() {
|
||||
return numCPUThreads;
|
||||
}
|
||||
|
||||
public int getNumIOThreads() {
|
||||
return numIOThreads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct the default thread allocation.
|
||||
*/
|
||||
public ThreadAllocation() {
|
||||
this(1,null,null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
|
||||
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
|
||||
* @param totalThreads Complete number of threads to allocate.
|
||||
* @param numCPUThreads Total number of threads allocated to the traversal.
|
||||
* @param numIOThreads Total number of threads allocated exclusively to IO.
|
||||
*/
|
||||
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
|
||||
// If no allocation information is present, allocate all threads to CPU
|
||||
if(numCPUThreads == null && numIOThreads == null) {
|
||||
this.numCPUThreads = totalThreads;
|
||||
this.numIOThreads = 0;
|
||||
}
|
||||
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
|
||||
else if(numIOThreads == null) {
|
||||
if(numCPUThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = totalThreads - numCPUThreads;
|
||||
}
|
||||
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
|
||||
else if(numCPUThreads == null) {
|
||||
if(numIOThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
|
||||
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
else {
|
||||
if(numCPUThreads + numIOThreads != totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -54,18 +55,18 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
|||
|
||||
if ( !vc.isBiallelic() )
|
||||
return null;
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( !vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
double ratio = 0.0;
|
||||
double totalWeights = 0.0;
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
for ( Genotype genotype : genotypes ) {
|
||||
// we care only about het calls
|
||||
if ( !genotype.getValue().isHet() )
|
||||
if ( !genotype.isHet() )
|
||||
continue;
|
||||
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
|
||||
|
|
@ -84,8 +85,8 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
|||
continue;
|
||||
|
||||
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
|
||||
ratio += genotype.getValue().getNegLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
||||
totalWeights += genotype.getValue().getNegLog10PError();
|
||||
ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
||||
totalWeights += genotype.getLog10PError();
|
||||
} else if ( vc.isIndel() && context.hasExtendedEventPileup() ) {
|
||||
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
||||
if ( indelPileup == null ) {
|
||||
|
|
|
|||
|
|
@ -59,10 +59,8 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn
|
|||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( ! vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
VariantContextUtils.calculateChromosomeCounts(vc, map, true);
|
||||
return map;
|
||||
|
||||
return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true);
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
|
|
|
|||
|
|
@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -89,9 +89,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
||||
if (haplotypes != null) {
|
||||
final Set<Map.Entry<String, Genotype>> genotypes = vc.getGenotypes().entrySet();
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes ) {
|
||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : vc.getGenotypes()) {
|
||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( thisContext != null ) {
|
||||
final ReadBackedPileup thisPileup;
|
||||
if (thisContext.hasExtendedEventPileup())
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -26,20 +27,18 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
|||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
private static final int MIN_GENOTYPE_QUALITY = 10;
|
||||
private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
||||
private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||
return null;
|
||||
|
||||
int refCount = 0;
|
||||
int hetCount = 0;
|
||||
int homCount = 0;
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
Genotype g = genotype.getValue();
|
||||
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.isNoCall() )
|
||||
continue;
|
||||
|
||||
|
|
@ -47,7 +46,7 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
|||
// Right now we just ignore genotypes that are not confident, but this throws off
|
||||
// our HW ratios. More analysis is needed to determine the right thing to do when
|
||||
// the genotyper cannot decide whether a given sample is het or hom var.
|
||||
if ( g.getNegLog10PError() < MIN_NEG_LOG10_PERROR )
|
||||
if ( g.getLog10PError() > MIN_LOG10_PERROR )
|
||||
continue;
|
||||
|
||||
if ( g.isHomRef() )
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -32,7 +33,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||
return null;
|
||||
|
||||
|
|
@ -51,8 +52,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
double hetCount = 0.0;
|
||||
double homCount = 0.0;
|
||||
int N = 0; // number of samples that have likelihoods
|
||||
for ( final Map.Entry<String, Genotype> genotypeMap : genotypes.entrySet() ) {
|
||||
Genotype g = genotypeMap.getValue();
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.isNoCall() || !g.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -28,19 +29,19 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
|
||||
int depth = 0;
|
||||
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
for ( final Genotype genotype : genotypes ) {
|
||||
|
||||
// we care only about variant calls with likelihoods
|
||||
if ( genotype.getValue().isHomRef() )
|
||||
if ( genotype.isHomRef() )
|
||||
continue;
|
||||
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
|
||||
|
|
@ -50,7 +51,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( depth == 0 )
|
||||
return null;
|
||||
|
||||
double QD = 10.0 * vc.getNegLog10PError() / (double)depth;
|
||||
double QD = -10.0 * vc.getLog10PError() / (double)depth;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -32,7 +33,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
|
||||
|
|
@ -42,8 +43,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
|
||||
if (vc.isSNP() && vc.isBiallelic()) {
|
||||
// todo - no current support for multiallelic snps
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -52,8 +53,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
}
|
||||
else if (vc.isIndel() || vc.isMixed()) {
|
||||
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,11 +47,11 @@ import java.util.Map;
|
|||
public class SampleList extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( vc.isMonomorphic() || !vc.hasGenotypes() )
|
||||
if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
StringBuffer samples = new StringBuffer();
|
||||
for ( Genotype genotype : vc.getGenotypesSortedByName() ) {
|
||||
for ( Genotype genotype : vc.getGenotypesOrderedByName() ) {
|
||||
if ( genotype.isCalled() && !genotype.isHomRef() ){
|
||||
if ( samples.length() > 0 )
|
||||
samples.append(",");
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
// We refuse to parse SnpEff output files generated by unsupported versions, or
|
||||
// lacking a SnpEff version number in the VCF header:
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" };
|
||||
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
|
||||
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
|
||||
|
||||
|
|
@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
public enum InfoFieldKey {
|
||||
EFFECT_KEY ("SNPEFF_EFFECT", -1),
|
||||
IMPACT_KEY ("SNPEFF_IMPACT", 0),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 7),
|
||||
FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1);
|
||||
FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 8);
|
||||
|
||||
// Actual text of the key
|
||||
private final String keyName;
|
||||
|
|
@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
// are validated against this list.
|
||||
public enum EffectType {
|
||||
// High-impact effects:
|
||||
FRAME_SHIFT (EffectFunctionalClass.NONE, false),
|
||||
STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
|
||||
START_LOST (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
|
||||
EXON_DELETED (EffectFunctionalClass.NONE, false),
|
||||
STOP_LOST (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_ACCEPTOR,
|
||||
SPLICE_SITE_DONOR,
|
||||
START_LOST,
|
||||
EXON_DELETED,
|
||||
FRAME_SHIFT,
|
||||
STOP_GAINED,
|
||||
STOP_LOST,
|
||||
|
||||
// Moderate-impact effects:
|
||||
NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
|
||||
CODON_CHANGE (EffectFunctionalClass.NONE, false),
|
||||
CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
UTR_5_DELETED (EffectFunctionalClass.NONE, false),
|
||||
UTR_3_DELETED (EffectFunctionalClass.NONE, false),
|
||||
NON_SYNONYMOUS_CODING,
|
||||
CODON_CHANGE,
|
||||
CODON_INSERTION,
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION,
|
||||
CODON_DELETION,
|
||||
CODON_CHANGE_PLUS_CODON_DELETION,
|
||||
UTR_5_DELETED,
|
||||
UTR_3_DELETED,
|
||||
|
||||
// Low-impact effects:
|
||||
SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
START_GAINED (EffectFunctionalClass.NONE, false),
|
||||
SYNONYMOUS_START,
|
||||
NON_SYNONYMOUS_START,
|
||||
START_GAINED,
|
||||
SYNONYMOUS_CODING,
|
||||
SYNONYMOUS_STOP,
|
||||
NON_SYNONYMOUS_STOP,
|
||||
|
||||
// Modifiers:
|
||||
NONE (EffectFunctionalClass.NONE, true),
|
||||
CHROMOSOME (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC (EffectFunctionalClass.NONE, true),
|
||||
UPSTREAM (EffectFunctionalClass.NONE, true),
|
||||
UTR_5_PRIME (EffectFunctionalClass.NONE, true),
|
||||
CDS (EffectFunctionalClass.NONE, true),
|
||||
GENE (EffectFunctionalClass.NONE, true),
|
||||
TRANSCRIPT (EffectFunctionalClass.NONE, true),
|
||||
EXON (EffectFunctionalClass.NONE, true),
|
||||
INTRON (EffectFunctionalClass.NONE, true),
|
||||
UTR_3_PRIME (EffectFunctionalClass.NONE, true),
|
||||
DOWNSTREAM (EffectFunctionalClass.NONE, true),
|
||||
INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
REGULATION (EffectFunctionalClass.NONE, true),
|
||||
CUSTOM (EffectFunctionalClass.NONE, true),
|
||||
WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);
|
||||
|
||||
private final EffectFunctionalClass functionalClass;
|
||||
private final boolean isModifier;
|
||||
|
||||
EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
|
||||
this.functionalClass = functionalClass;
|
||||
this.isModifier = isModifier;
|
||||
}
|
||||
|
||||
public EffectFunctionalClass getFunctionalClass() {
|
||||
return functionalClass;
|
||||
}
|
||||
|
||||
public boolean isModifier() {
|
||||
return isModifier;
|
||||
}
|
||||
NONE,
|
||||
CHROMOSOME,
|
||||
CUSTOM,
|
||||
CDS,
|
||||
GENE,
|
||||
TRANSCRIPT,
|
||||
EXON,
|
||||
INTRON_CONSERVED,
|
||||
UTR_5_PRIME,
|
||||
UTR_3_PRIME,
|
||||
DOWNSTREAM,
|
||||
INTRAGENIC,
|
||||
INTERGENIC,
|
||||
INTERGENIC_CONSERVED,
|
||||
UPSTREAM,
|
||||
REGULATION,
|
||||
INTRON
|
||||
}
|
||||
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
|
||||
// classifying some of the LOW impact effects as MODIFIERs.
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER.
|
||||
public enum EffectImpact {
|
||||
MODIFIER (0),
|
||||
LOW (1),
|
||||
|
|
@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
UNKNOWN
|
||||
}
|
||||
|
||||
// We assign a functional class to each SnpEff effect.
|
||||
// SnpEff assigns a functional class to each effect.
|
||||
public enum EffectFunctionalClass {
|
||||
NONE (0),
|
||||
SILENT (1),
|
||||
|
|
@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
public List<String> getKeyNames() {
|
||||
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
|
||||
InfoFieldKey.IMPACT_KEY.getKeyName(),
|
||||
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(),
|
||||
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
|
||||
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName()
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
protected static class SnpEffEffect {
|
||||
private EffectType effect;
|
||||
private EffectImpact impact;
|
||||
private EffectFunctionalClass functionalClass;
|
||||
private String codonChange;
|
||||
private String aminoAcidChange;
|
||||
private String geneName;
|
||||
|
|
@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
private String parseError = null;
|
||||
private boolean isWellFormed = true;
|
||||
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11;
|
||||
|
||||
// Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
|
||||
// errors come after warnings, not vice versa:
|
||||
private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
|
||||
private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
|
||||
// If there is either a warning OR an error, it will be in the last field. If there is both
|
||||
// a warning AND an error, the warning will be in the second-to-last field, and the error will
|
||||
// be in the last field.
|
||||
private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1;
|
||||
private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2;
|
||||
private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1;
|
||||
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 5;
|
||||
// Position of the field indicating whether the effect is coding or non-coding. This field is used
|
||||
// in selecting the most significant effect, but is not included in the annotations we return
|
||||
// since it can be deduced from the SNPEFF_GENE_BIOTYPE field.
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 6;
|
||||
|
||||
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
|
||||
parseEffectName(effectName);
|
||||
|
|
@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
private void parseEffectMetadata ( String[] effectMetadata ) {
|
||||
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
|
||||
parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following warning or error: \"%s\"",
|
||||
effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR]));
|
||||
}
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"",
|
||||
effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR],
|
||||
effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR]));
|
||||
}
|
||||
else {
|
||||
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
|
||||
|
|
@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return;
|
||||
}
|
||||
|
||||
if ( effect != null && effect.isModifier() ) {
|
||||
impact = EffectImpact.MODIFIER;
|
||||
// The impact field will never be empty, and should always contain one of the enumerated values:
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
}
|
||||
else {
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
}
|
||||
|
||||
// The functional class field will be empty when the effect has no functional class associated with it:
|
||||
if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) {
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]));
|
||||
}
|
||||
}
|
||||
else {
|
||||
functionalClass = EffectFunctionalClass.NONE;
|
||||
}
|
||||
|
||||
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
|
||||
|
||||
// The coding field will be empty when SnpEff has no coding info for the effect:
|
||||
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
|
||||
try {
|
||||
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
|
||||
|
|
@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return true;
|
||||
}
|
||||
else if ( impact.isSameImpactAs(other.impact) ) {
|
||||
return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
|
||||
return functionalClass.isHigherPriorityThan(other.functionalClass);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
|
||||
addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -222,8 +222,33 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
if ( isUniqueHeaderLine(line, hInfo) )
|
||||
hInfo.add(line);
|
||||
}
|
||||
for ( String expression : expressionsToUse )
|
||||
hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
||||
// for the expressions, pull the info header line from the header of the resource rod
|
||||
for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
|
||||
// special case the ID field
|
||||
if ( expression.fieldName.equals("ID") ) {
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
|
||||
continue;
|
||||
}
|
||||
VCFInfoHeaderLine targetHeaderLine = null;
|
||||
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
||||
if ( infoline.getName().equals(expression.fieldName) ) {
|
||||
targetHeaderLine = infoline;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( targetHeaderLine != null ) {
|
||||
if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER )
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||
else
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||
} else {
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
||||
}
|
||||
}
|
||||
|
||||
engine.invokeAnnotationInitializationMethods(hInfo);
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,9 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -49,20 +51,20 @@ public class VariantAnnotatorEngine {
|
|||
private AnnotatorCompatibleWalker walker;
|
||||
private GenomeAnalysisEngine toolkit;
|
||||
|
||||
private static class VAExpression {
|
||||
protected static class VAExpression {
|
||||
|
||||
public String fullName, fieldName;
|
||||
public RodBinding<VariantContext> binding;
|
||||
|
||||
public VAExpression(String fullEpression, List<RodBinding<VariantContext>> bindings) {
|
||||
int indexOfDot = fullEpression.lastIndexOf(".");
|
||||
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
|
||||
int indexOfDot = fullExpression.lastIndexOf(".");
|
||||
if ( indexOfDot == -1 )
|
||||
throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format");
|
||||
throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
|
||||
|
||||
fullName = fullEpression;
|
||||
fieldName = fullEpression.substring(indexOfDot+1);
|
||||
fullName = fullExpression;
|
||||
fieldName = fullExpression.substring(indexOfDot+1);
|
||||
|
||||
String bindingName = fullEpression.substring(0, indexOfDot);
|
||||
String bindingName = fullExpression.substring(0, indexOfDot);
|
||||
for ( RodBinding<VariantContext> rod : bindings ) {
|
||||
if ( rod.getName().equals(bindingName) ) {
|
||||
binding = rod;
|
||||
|
|
@ -97,6 +99,8 @@ public class VariantAnnotatorEngine {
|
|||
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
||||
}
|
||||
|
||||
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
||||
|
||||
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
||||
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
|
|
@ -160,11 +164,10 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
|
||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
|
||||
|
||||
// annotate db occurrences
|
||||
annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||
vc = annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||
|
||||
// annotate expressions where available
|
||||
annotateExpressions(tracker, ref, infoAnnotations);
|
||||
|
|
@ -177,20 +180,20 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
|
||||
// generate a new annotated VC
|
||||
final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
|
||||
|
||||
// annotate genotypes, creating another new VC in the process
|
||||
return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc));
|
||||
return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make();
|
||||
}
|
||||
|
||||
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
||||
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
||||
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
|
||||
// annotate dbsnp id if available and not already there
|
||||
if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )
|
||||
infoAnnotations.put(VariantContext.ID_KEY, rsID);
|
||||
if ( rsID != null && vc.emptyID() )
|
||||
vc = new VariantContextBuilder(vc).id(rsID).make();
|
||||
} else {
|
||||
boolean overlapsComp = false;
|
||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
||||
|
|
@ -202,6 +205,8 @@ public class VariantAnnotatorEngine {
|
|||
infoAnnotations.put(dbSet.getValue(), overlapsComp);
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
|
||||
|
|
@ -211,21 +216,25 @@ public class VariantAnnotatorEngine {
|
|||
continue;
|
||||
|
||||
VariantContext vc = VCs.iterator().next();
|
||||
if ( vc.hasAttribute(expression.fieldName) )
|
||||
// special-case the ID field
|
||||
if ( expression.fieldName.equals("ID") ) {
|
||||
if ( vc.hasID() )
|
||||
infoAnnotations.put(expression.fullName, vc.getID());
|
||||
} else if ( vc.hasAttribute(expression.fieldName) ) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Genotype> annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( requestedGenotypeAnnotations.size() == 0 )
|
||||
return vc.getGenotypes();
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc.getNSamples());
|
||||
for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
|
||||
Genotype genotype = g.getValue();
|
||||
AlignmentContext context = stratifiedContexts.get(g.getKey());
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
|
||||
for ( final Genotype genotype : vc.getGenotypes() ) {
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
genotypes.put(g.getKey(), genotype);
|
||||
genotypes.add(genotype);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -235,7 +244,7 @@ public class VariantAnnotatorEngine {
|
|||
if ( result != null )
|
||||
genotypeAnnotations.putAll(result);
|
||||
}
|
||||
genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
||||
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
|
|
|
|||
|
|
@ -36,10 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -125,7 +122,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
protected static String line = null;
|
||||
|
||||
private final double MIN_PROB_ERROR = 0.000001;
|
||||
private final double MAX_GENOTYPE_QUALITY = 6.0;
|
||||
private final double MAX_GENOTYPE_QUALITY = -6.0;
|
||||
|
||||
public void initialize() {
|
||||
|
||||
|
|
@ -181,8 +178,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// ignore places where we don't have a variant
|
||||
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
||||
{
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -190,8 +187,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
byte refByte = ref.getBase();
|
||||
|
||||
// make new Genotypes based on Beagle results
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
|
||||
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc_input.getGenotypes().size());
|
||||
|
||||
// for each genotype, create a new object with Beagle information on it
|
||||
|
||||
|
|
@ -200,15 +196,13 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
Double alleleFrequencyH = 0.0;
|
||||
int beagleVarCounts = 0;
|
||||
|
||||
Map<String,Genotype> hapmapGenotypes = null;
|
||||
GenotypesContext hapmapGenotypes = null;
|
||||
|
||||
if (vc_comp != null) {
|
||||
hapmapGenotypes = vc_comp.getGenotypes();
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
|
||||
|
||||
Genotype g = originalGenotypes.getValue();
|
||||
for ( final Genotype g : vc_input.getGenotypes() ) {
|
||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||
|
||||
boolean genotypeIsPhased = true;
|
||||
|
|
@ -218,7 +212,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// use sample as key into genotypes structure
|
||||
if (vc_comp != null) {
|
||||
|
||||
if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) {
|
||||
if (vc_input.getGenotypes().containsSample(sample) && hapmapGenotypes.containsSample(sample)) {
|
||||
|
||||
Genotype hapmapGenotype = hapmapGenotypes.get(sample);
|
||||
if (hapmapGenotype.isCalled()){
|
||||
|
|
@ -255,9 +249,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
Allele bglAlleleA, bglAlleleB;
|
||||
|
||||
if (alleleA.matches(refString))
|
||||
bglAlleleA = Allele.create(alleleA,true);
|
||||
bglAlleleA = Allele.create(alleleA,true);
|
||||
else
|
||||
bglAlleleA = Allele.create(alleleA,false);
|
||||
bglAlleleA = Allele.create(alleleA,false);
|
||||
|
||||
if (alleleB.matches(refString))
|
||||
bglAlleleB = Allele.create(alleleB,true);
|
||||
|
|
@ -286,7 +280,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// deal with numerical errors coming from limited formatting value on Beagle output files
|
||||
if (probWrongGenotype > 1 - MIN_PROB_ERROR)
|
||||
probWrongGenotype = 1 - MIN_PROB_ERROR;
|
||||
|
||||
|
||||
if (1-probWrongGenotype < noCallThreshold) {
|
||||
// quality is bad: don't call genotype
|
||||
alleles.clear();
|
||||
|
|
@ -298,7 +292,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
if (probWrongGenotype < MIN_PROB_ERROR)
|
||||
genotypeQuality = MAX_GENOTYPE_QUALITY;
|
||||
else
|
||||
genotypeQuality = -log10(probWrongGenotype);
|
||||
genotypeQuality = log10(probWrongGenotype);
|
||||
|
||||
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
|
||||
|
||||
|
|
@ -329,47 +323,40 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
else {
|
||||
originalAttributes.put("OG",".");
|
||||
}
|
||||
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
|
||||
beagleVarCounts++;
|
||||
}
|
||||
|
||||
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
|
||||
|
||||
genotypes.add(imputedGenotype);
|
||||
}
|
||||
|
||||
VariantContext filteredVC;
|
||||
if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
|
||||
else {
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes);
|
||||
if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) {
|
||||
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
|
||||
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
|
||||
builder.alleles(new HashSet<Allele>(Arrays.asList(vc_input.getReference()))).filters(removedFilters);
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
|
||||
// re-compute chromosome counts
|
||||
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||
|
||||
// Get Hapmap AC and AF
|
||||
if (vc_comp != null) {
|
||||
attributes.put("ACH", alleleCountH.toString() );
|
||||
attributes.put("ANH", chrCountH.toString() );
|
||||
attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
||||
builder.attribute("ACH", alleleCountH.toString() );
|
||||
builder.attribute("ANH", chrCountH.toString() );
|
||||
builder.attribute("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
||||
|
||||
}
|
||||
|
||||
attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
||||
builder.attribute("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
||||
if( !beagleR2Feature.getR2value().equals(Double.NaN) ) {
|
||||
attributes.put("R2", beagleR2Feature.getR2value().toString() );
|
||||
builder.attribute("R2", beagleR2Feature.getR2value().toString() );
|
||||
}
|
||||
|
||||
|
||||
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes));
|
||||
|
||||
vcfWriter.add(builder.make());
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
|
|||
|
|
@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -204,7 +201,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
|
||||
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
|
||||
if ( bootstrapVCFOutput != null ) {
|
||||
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER));
|
||||
bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
|
||||
}
|
||||
bootstrapSetSize++;
|
||||
return true;
|
||||
|
|
@ -245,18 +242,18 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
if ( markers != null ) markers.append("\n");
|
||||
|
||||
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
||||
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||
GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
|
||||
GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||
for ( String sample : samples ) {
|
||||
boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
|
||||
|
||||
Genotype genotype;
|
||||
boolean isValidation;
|
||||
// use sample as key into genotypes structure
|
||||
if ( preferredGenotypes.keySet().contains(sample) ) {
|
||||
if ( preferredGenotypes.containsSample(sample) ) {
|
||||
genotype = preferredGenotypes.get(sample);
|
||||
isValidation = isValidationSite;
|
||||
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
|
||||
} else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) {
|
||||
genotype = otherGenotypes.get(sample);
|
||||
isValidation = ! isValidationSite;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.diffengine;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
|
|
@ -46,6 +47,8 @@ import java.util.Map;
|
|||
* Class implementing diffnode reader for VCF
|
||||
*/
|
||||
public class VCFDiffableReader implements DiffableReader {
|
||||
private static Logger logger = Logger.getLogger(VCFDiffableReader.class);
|
||||
|
||||
@Override
|
||||
public String getName() { return "VCF"; }
|
||||
|
||||
|
|
@ -68,7 +71,10 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
String key = headerLine.getKey();
|
||||
if ( headerLine instanceof VCFNamedHeaderLine )
|
||||
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
||||
root.add(key, headerLine.toString());
|
||||
if ( root.hasElement(key) )
|
||||
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
||||
else
|
||||
root.add(key, headerLine.toString());
|
||||
}
|
||||
|
||||
String line = lineReader.readLine();
|
||||
|
|
@ -90,22 +96,22 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
// add fields
|
||||
vcRoot.add("CHROM", vc.getChr());
|
||||
vcRoot.add("POS", vc.getStart());
|
||||
vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("ID", vc.getID());
|
||||
vcRoot.add("REF", vc.getReference());
|
||||
vcRoot.add("ALT", vc.getAlternateAlleles());
|
||||
vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("FILTER", vc.getFilters());
|
||||
|
||||
// add info fields
|
||||
for (Map.Entry<String, Object> attribute : vc.getAttributes().entrySet()) {
|
||||
if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY))
|
||||
if ( ! attribute.getKey().startsWith("_") )
|
||||
vcRoot.add(attribute.getKey(), attribute.getValue());
|
||||
}
|
||||
|
||||
for (Genotype g : vc.getGenotypes().values() ) {
|
||||
for (Genotype g : vc.getGenotypes() ) {
|
||||
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
|
||||
gRoot.add("GT", g.getGenotypeString());
|
||||
gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 );
|
||||
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 );
|
||||
|
||||
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
|
||||
if ( ! attribute.getKey().startsWith("_") )
|
||||
|
|
|
|||
|
|
@ -36,9 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -224,7 +222,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
|
||||
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
||||
|
|
@ -267,7 +265,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
|
||||
return vc;
|
||||
|
|
@ -279,20 +277,15 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
if ( context == null )
|
||||
return;
|
||||
|
||||
VariantContext vc = context.getVariantContext();
|
||||
final VariantContext vc = context.getVariantContext();
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
|
||||
// make new Genotypes based on filters
|
||||
Map<String, Genotype> genotypes;
|
||||
if ( genotypeFilterExps.size() == 0 ) {
|
||||
genotypes = null;
|
||||
} else {
|
||||
genotypes = new HashMap<String, Genotype>(vc.getGenotypes().size());
|
||||
if ( genotypeFilterExps.size() > 0 ) {
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
||||
|
||||
// for each genotype, check filters then create a new object
|
||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
|
||||
Genotype g = genotype.getValue();
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
if ( g.isCalled() ) {
|
||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||
|
||||
|
|
@ -300,11 +293,13 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
if ( VariantContextUtils.match(vc, g, exp) )
|
||||
filters.add(exp.name);
|
||||
}
|
||||
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
||||
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
||||
} else {
|
||||
genotypes.put(genotype.getKey(), g);
|
||||
genotypes.add(g);
|
||||
}
|
||||
}
|
||||
|
||||
builder.genotypes(genotypes);
|
||||
}
|
||||
|
||||
// make a new variant context based on filters
|
||||
|
|
@ -324,14 +319,9 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
filters.add(exp.name);
|
||||
}
|
||||
}
|
||||
builder.filters(filters);
|
||||
|
||||
VariantContext filteredVC;
|
||||
if ( genotypes == null )
|
||||
filteredVC = VariantContext.modifyFilters(vc, filters);
|
||||
else
|
||||
filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
|
||||
|
||||
writer.add(filteredVC);
|
||||
writer.add(builder.make());
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
|
|
|||
|
|
@ -26,16 +26,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -47,8 +43,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT,
|
||||
/** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */
|
||||
GRID_SEARCH
|
||||
}
|
||||
|
||||
protected int N;
|
||||
|
|
@ -73,7 +67,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
||||
*/
|
||||
protected abstract void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> Alleles,
|
||||
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors);
|
||||
|
||||
|
|
@ -85,7 +79,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
*
|
||||
* @return calls
|
||||
*/
|
||||
protected abstract Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood);
|
||||
protected abstract GenotypesContext assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood);
|
||||
}
|
||||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
* Time: 6:46:09 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
enum DiploidGenotype {
|
||||
public enum DiploidGenotype {
|
||||
AA ('A', 'A'),
|
||||
AC ('A', 'C'),
|
||||
AG ('A', 'G'),
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
|
||||
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
|
||||
if ( elt.isReducedRead() ) {
|
||||
// reduced read representation
|
||||
byte qual = elt.getQual();
|
||||
if ( BaseUtils.isRegularBase( elt.getBase() )) {
|
||||
if ( BaseUtils.isRegularBase( obsBase )) {
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getRepresentativeCount(); // we added nObs bases here
|
||||
} else // odd bases or deletions => don't use them
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
}
|
||||
|
||||
// odd bases or deletions => don't use them
|
||||
return 0;
|
||||
}
|
||||
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
}
|
||||
|
||||
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
|
|
@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
* @return
|
||||
*/
|
||||
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) {
|
||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = p.getQual();
|
||||
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
if ( (int)qual < minBaseQual )
|
||||
qual = (byte)0;
|
||||
byte qual = p.getQual();
|
||||
|
||||
return qual;
|
||||
}
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
if ( (int)qual < minBaseQual )
|
||||
qual = (byte)0;
|
||||
|
||||
return qual;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -26,14 +26,10 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -46,12 +42,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
private final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
||||
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
final int numAlleles = alleles.size();
|
||||
|
|
@ -95,11 +92,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
||||
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.values() ) {
|
||||
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
|
|
@ -155,7 +152,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
public int linearExact(Map<String, Genotype> GLs,
|
||||
public int linearExact(GenotypesContext GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
|
|
@ -268,14 +265,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
*
|
||||
* @return calls
|
||||
*/
|
||||
public Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
public GenotypesContext assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
|
||||
Map<String, Genotype> GLs = vc.getGenotypes();
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
|
||||
|
|
@ -291,16 +288,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
sampleIndices.addAll(GLs.keySet());
|
||||
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
else {
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( !genotype.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||
//System.out.print(sample.getKey()+":");
|
||||
|
|
@ -312,7 +309,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
continue;
|
||||
}
|
||||
|
||||
sampleIndices.add(sample.getKey());
|
||||
sampleIndices.add(genotype.getSampleName());
|
||||
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||
|
||||
|
|
@ -342,7 +339,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
GenotypesContext calls = GenotypesContext.create();
|
||||
|
||||
int startIdx = AFofMaxLikelihood;
|
||||
for (int k = sampleIdx; k > 0; k--) {
|
||||
|
|
@ -355,11 +352,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// and will add no-call genotype to GL's in a second pass
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Double.NEGATIVE_INFINITY;
|
||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
|
||||
}
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];;
|
||||
|
|
@ -367,20 +363,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
/* System.out.format("Sample: %s GL:",sample);
|
||||
for (int i=0; i < likelihoods.length; i++)
|
||||
System.out.format("%1.4f, ",likelihoods[i]);
|
||||
*/
|
||||
|
||||
for (int i=0; i < likelihoods.length; i++) {
|
||||
if (i==bestGTguess)
|
||||
continue;
|
||||
if (likelihoods[i] >= qual)
|
||||
qual = likelihoods[i];
|
||||
}
|
||||
// qual contains now max(likelihoods[k]) for all k != bestGTguess
|
||||
qual = likelihoods[bestGTguess] - qual;
|
||||
|
||||
// likelihoods are stored row-wise in lower triangular matrix. IE
|
||||
// for 2 alleles they have ordering AA,AB,BB
|
||||
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
|
||||
|
|
@ -408,37 +390,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
break;
|
||||
}
|
||||
|
||||
if (qual < 0) {
|
||||
// QUAL can be negative if the chosen genotype is not the most likely one individually.
|
||||
// In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on
|
||||
double[] normalized = MathUtils.normalizeFromLog10(likelihoods);
|
||||
double chosenGenotype = normalized[bestGTguess];
|
||||
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
|
||||
}
|
||||
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||
|
||||
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( !genotype.hasLikelihoods() )
|
||||
continue;
|
||||
Genotype g = GLs.get(sample.getKey());
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
final Genotype g = GLs.get(genotype.getSampleName());
|
||||
final double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||
continue; // regular likelihoods
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Genotype.NO_NEG_LOG_10PERROR;
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
|
||||
final double qual = Genotype.NO_LOG10_PERROR;
|
||||
calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
* @param priors priors to use for GLs
|
||||
* @param GLs hash of sample->GL to fill in
|
||||
* @param alternateAlleleToUse the alternate allele to use, null if not set
|
||||
*
|
||||
* @param useBAQedPileup
|
||||
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
|
||||
* @return genotype likelihoods per sample for AA, AB, BB
|
||||
*/
|
||||
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
|
|
@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Map<String, MultiallelicGenotypeLikelihoods> GLs,
|
||||
Allele alternateAlleleToUse, boolean useBAQedPileup);
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup);
|
||||
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( BaseUtils.isRegularBase( p.getBase() ) )
|
||||
count++;
|
||||
count += p.getRepresentativeCount();
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
|||
|
|
@ -1,271 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||
|
||||
// for use in optimizing the P(D|AF) calculations:
|
||||
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
||||
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
||||
|
||||
private AlleleFrequencyMatrix AFMatrix;
|
||||
|
||||
protected GridSearchAFEstimation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
AFMatrix = new AlleleFrequencyMatrix(N);
|
||||
}
|
||||
|
||||
protected void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
initializeAFMatrix(GLs);
|
||||
|
||||
// first, calculate for AF=0 (no change to matrix)
|
||||
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
|
||||
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
|
||||
|
||||
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
|
||||
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
|
||||
// add one more alternate allele
|
||||
AFMatrix.incrementFrequency();
|
||||
|
||||
// calculate new likelihoods
|
||||
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
|
||||
|
||||
// an optimization to speed up the calculation: if we are beyond the local maximum such
|
||||
// that subsequent likelihoods won't factor into the confidence score, just quit
|
||||
if ( maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON )
|
||||
return;
|
||||
|
||||
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
|
||||
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Overrides the super class
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||
* @param AFofMaxLikelihood allele frequency of max likelihood
|
||||
*
|
||||
* @return calls
|
||||
*/
|
||||
protected Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
Allele refAllele = vc.getReference();
|
||||
Allele altAllele = vc.getAlternateAllele(0);
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
|
||||
// first, the potential alt calls
|
||||
for ( String sample : AFMatrix.getSamples() ) {
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
|
||||
// set the genotype and confidence
|
||||
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
|
||||
myAlleles.add(refAllele);
|
||||
myAlleles.add(refAllele);
|
||||
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
|
||||
myAlleles.add(refAllele);
|
||||
myAlleles.add(altAllele);
|
||||
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
|
||||
myAlleles.add(altAllele);
|
||||
myAlleles.add(altAllele);
|
||||
}
|
||||
|
||||
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
private void initializeAFMatrix(Map<String, Genotype> GLs) {
|
||||
AFMatrix.clear();
|
||||
|
||||
for ( Genotype g : GLs.values() ) {
|
||||
if ( g.hasLikelihoods() )
|
||||
AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName());
|
||||
}
|
||||
}
|
||||
|
||||
protected static class AlleleFrequencyMatrix {
|
||||
|
||||
private double[][] matrix; // allele frequency matrix
|
||||
private int[] indexes; // matrix to maintain which genotype is active
|
||||
private int maxN; // total possible frequencies in data
|
||||
private int frequency; // current frequency
|
||||
|
||||
// data structures necessary to maintain a list of the best genotypes and their scores
|
||||
private ArrayList<String> samples = new ArrayList<String>();
|
||||
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
|
||||
|
||||
public AlleleFrequencyMatrix(int N) {
|
||||
maxN = N;
|
||||
matrix = new double[N][3];
|
||||
indexes = new int[N];
|
||||
clear();
|
||||
}
|
||||
|
||||
public List<String> getSamples() { return samples; }
|
||||
|
||||
public void clear() {
|
||||
frequency = 0;
|
||||
for (int i = 0; i < maxN; i++)
|
||||
indexes[i] = 0;
|
||||
samples.clear();
|
||||
samplesToGenotypesPerAF.clear();
|
||||
}
|
||||
|
||||
public void setLikelihoods(double[] GLs, String sample) {
|
||||
int index = samples.size();
|
||||
samples.add(sample);
|
||||
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
|
||||
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
|
||||
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
|
||||
}
|
||||
|
||||
public void incrementFrequency() {
|
||||
int N = samples.size();
|
||||
if ( frequency == 2 * N )
|
||||
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
|
||||
frequency++;
|
||||
|
||||
double greedy = VALUE_NOT_CALCULATED;
|
||||
int greedyIndex = -1;
|
||||
for (int i = 0; i < N; i++) {
|
||||
|
||||
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
|
||||
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
|
||||
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
|
||||
greedyIndex = i;
|
||||
}
|
||||
}
|
||||
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
|
||||
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
|
||||
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
|
||||
greedyIndex = i;
|
||||
}
|
||||
// note that we currently don't bother with breaking ties between samples
|
||||
// (which would be done by looking at the HOM_VAR value) because it's highly
|
||||
// unlikely that a collision will both occur and that the difference will
|
||||
// be significant at HOM_VAR...
|
||||
}
|
||||
// if this person is already hom var, he can't add another alternate allele
|
||||
// so we can ignore that case
|
||||
}
|
||||
if ( greedyIndex == -1 )
|
||||
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
|
||||
|
||||
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
|
||||
indexes[greedyIndex] = GenotypeType.BB.ordinal();
|
||||
else
|
||||
indexes[greedyIndex] = GenotypeType.AB.ordinal();
|
||||
}
|
||||
|
||||
public double getLikelihoodsOfFrequency() {
|
||||
double likelihoods = 0.0;
|
||||
int N = samples.size();
|
||||
for (int i = 0; i < N; i++)
|
||||
likelihoods += matrix[i][indexes[i]];
|
||||
|
||||
/*
|
||||
System.out.println(frequency);
|
||||
for (int i = 0; i < N; i++) {
|
||||
System.out.print(samples.get(i));
|
||||
for (int j=0; j < 3; j++) {
|
||||
System.out.print(String.valueOf(matrix[i][j]));
|
||||
System.out.print(indexes[i] == j ? "* " : " ");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
System.out.println(likelihoods);
|
||||
System.out.println();
|
||||
*/
|
||||
|
||||
recordGenotypes();
|
||||
|
||||
return likelihoods;
|
||||
}
|
||||
|
||||
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
|
||||
return samplesToGenotypesPerAF.get(frequency).get(sample);
|
||||
}
|
||||
|
||||
private void recordGenotypes() {
|
||||
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
|
||||
|
||||
int index = 0;
|
||||
for ( String sample : samples ) {
|
||||
int genotype = indexes[index];
|
||||
|
||||
double score;
|
||||
|
||||
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
|
||||
// if the max value is for the most likely genotype, we can compute next vs. next best
|
||||
if ( genotype == maxEntry ) {
|
||||
if ( genotype == GenotypeType.AA.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||
else if ( genotype == GenotypeType.AB.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||
else // ( genotype == GenotypeType.HOM.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
|
||||
}
|
||||
// otherwise, we need to calculate the probability of the genotype
|
||||
else {
|
||||
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
|
||||
double chosenGenotype = normalized[genotype];
|
||||
score = -1.0 * Math.log10(1.0 - chosenGenotype);
|
||||
}
|
||||
|
||||
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
|
||||
index++;
|
||||
}
|
||||
|
||||
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -35,9 +35,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -108,9 +106,9 @@ public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
|||
return sum;
|
||||
|
||||
try {
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(value.getAttributes());
|
||||
VariantContextUtils.calculateChromosomeCounts(value, attrs, true);
|
||||
writer.add(VariantContext.modifyAttributes(value, attrs));
|
||||
VariantContextBuilder builder = new VariantContextBuilder(value);
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, true);
|
||||
writer.add(builder.make());
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
|
@ -128,27 +126,27 @@ public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
|||
return null;
|
||||
|
||||
VariantContext variantVC = null;
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( variantVC == null && vc.isVariant() )
|
||||
variantVC = vc;
|
||||
genotypes.putAll(getGenotypesWithGLs(vc.getGenotypes()));
|
||||
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
|
||||
}
|
||||
|
||||
if ( variantVC == null ) {
|
||||
VariantContext vc = VCs.get(0);
|
||||
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
|
||||
}
|
||||
return new VariantContext("VCwithGLs", variantVC.getChr(), variantVC.getStart(), variantVC.getEnd(), variantVC.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null);
|
||||
|
||||
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
private static Map<String, Genotype> getGenotypesWithGLs(Map<String, Genotype> genotypes) {
|
||||
Map<String, Genotype> genotypesWithGLs = new HashMap<String, Genotype>();
|
||||
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
|
||||
if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null )
|
||||
genotypesWithGLs.put(g.getKey(), g.getValue());
|
||||
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
|
||||
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
|
||||
genotypesWithGLs.add(g);
|
||||
}
|
||||
|
||||
return genotypesWithGLs;
|
||||
}
|
||||
}
|
||||
|
|
@ -258,7 +258,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -229,8 +229,7 @@ public class UnifiedGenotyperEngine {
|
|||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return null;
|
||||
vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase());
|
||||
|
||||
vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).make();
|
||||
} else {
|
||||
// deal with bad/non-standard reference bases
|
||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||
|
|
@ -238,7 +237,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
alleles.add(Allele.create(ref.getBase(), true));
|
||||
vc = new VariantContext("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles);
|
||||
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
|
||||
}
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
|
|
@ -265,7 +264,7 @@ public class UnifiedGenotyperEngine {
|
|||
alleles.add(refAllele);
|
||||
boolean addedAltAlleles = false;
|
||||
|
||||
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
||||
if ( !addedAltAlleles ) {
|
||||
addedAltAlleles = true;
|
||||
|
|
@ -281,22 +280,13 @@ public class UnifiedGenotyperEngine {
|
|||
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||
|
||||
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
|
||||
genotypes.add(new Genotype(GL.getSample(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
int endLoc = calculateEndPos(alleles, refAllele, loc);
|
||||
|
||||
return new VariantContext("UG_call",
|
||||
loc.getContig(),
|
||||
loc.getStart(),
|
||||
endLoc,
|
||||
alleles,
|
||||
genotypes,
|
||||
VariantContext.NO_NEG_LOG_10PERROR,
|
||||
null,
|
||||
null,
|
||||
refContext.getBase());
|
||||
return new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make();
|
||||
}
|
||||
|
||||
// private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
|
||||
|
|
@ -354,7 +344,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// create the genotypes
|
||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null )
|
||||
|
|
@ -420,8 +410,14 @@ public class UnifiedGenotyperEngine {
|
|||
myAlleles = new HashSet<Allele>(1);
|
||||
myAlleles.add(vc.getReference());
|
||||
}
|
||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
||||
myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, refContext.getBase());
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
|
||||
builder.genotypes(genotypes);
|
||||
builder.log10PError(phredScaledConfidence/-10.0);
|
||||
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
|
||||
builder.attributes(attributes);
|
||||
builder.referenceBaseForIndel(refContext.getBase());
|
||||
VariantContext vcCall = builder.make();
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
|
|
@ -491,7 +487,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// create the genotypes
|
||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
|
@ -504,10 +500,15 @@ public class UnifiedGenotyperEngine {
|
|||
myAlleles = new HashSet<Allele>(1);
|
||||
myAlleles.add(vc.getReference());
|
||||
}
|
||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
||||
myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, vc.getReferenceBaseForIndel());
|
||||
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
|
||||
builder.genotypes(genotypes);
|
||||
builder.log10PError(phredScaledConfidence/-10.0);
|
||||
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
|
||||
builder.attributes(attributes);
|
||||
builder.referenceBaseForIndel(vc.getReferenceBaseForIndel());
|
||||
|
||||
return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF));
|
||||
}
|
||||
|
||||
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||
|
|
@ -811,9 +812,6 @@ public class UnifiedGenotyperEngine {
|
|||
case EXACT:
|
||||
afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter);
|
||||
break;
|
||||
case GRID_SEARCH:
|
||||
afcm = new GridSearchAFEstimation(UAC, N, logger, verboseWriter);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -556,40 +556,51 @@ public class PairHMMIndelErrorModel {
|
|||
long indStart = start - haplotype.getStartPosition();
|
||||
long indStop = stop - haplotype.getStartPosition();
|
||||
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||
(int)indStart, (int)indStop);
|
||||
|
||||
double readLikelihood;
|
||||
if (matchMetricArray == null) {
|
||||
final int X_METRIC_LENGTH = readBases.length+1;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
if (DEBUG)
|
||||
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
|
||||
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString());
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
}
|
||||
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
if (previousHaplotypeSeen == null)
|
||||
startIdx = 0;
|
||||
else {
|
||||
final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
|
||||
final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP);
|
||||
startIdx = Math.min(Math.min(s1, s2), s3);
|
||||
}
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
previousGOP = currentContextGOP.clone();
|
||||
previousGCP = currentContextGCP.clone();
|
||||
if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) {
|
||||
// read spanned more than allowed reference context: we currently can't deal with this
|
||||
readLikelihood =0;
|
||||
} else
|
||||
{
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||
(int)indStart, (int)indStop);
|
||||
|
||||
if (matchMetricArray == null) {
|
||||
final int X_METRIC_LENGTH = readBases.length+1;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
}
|
||||
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
|
||||
if (previousHaplotypeSeen == null)
|
||||
startIdx = 0;
|
||||
else {
|
||||
final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
|
||||
final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP);
|
||||
startIdx = Math.min(Math.min(s1, s2), s3);
|
||||
}
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
previousGOP = currentContextGOP.clone();
|
||||
previousGCP = currentContextGCP.clone();
|
||||
|
||||
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
|
||||
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
|
||||
if (DEBUG) {
|
||||
System.out.println("H:"+new String(haplotypeBases));
|
||||
System.out.println("R:"+new String(readBases));
|
||||
System.out.format("L:%4.2f\n",readLikelihood);
|
||||
System.out.format("StPos:%d\n", startIdx);
|
||||
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
|
||||
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("H:"+new String(haplotypeBases));
|
||||
System.out.println("R:"+new String(readBases));
|
||||
System.out.format("L:%4.2f\n",readLikelihood);
|
||||
System.out.format("StPos:%d\n", startIdx);
|
||||
}
|
||||
}
|
||||
readEl.put(a,readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
|
|
|
|||
|
|
@ -58,9 +58,7 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
|||
import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -1057,16 +1055,15 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
stop += event_length;
|
||||
}
|
||||
|
||||
Map<String,Genotype> genotypes = new HashMap<String,Genotype>();
|
||||
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
for ( String sample : normalSamples ) {
|
||||
|
||||
Map<String,?> attrs = call.makeStatsAttributes(null);
|
||||
Map<String,Object> attrs = call.makeStatsAttributes(null);
|
||||
|
||||
if ( call.isCall() ) // we made a call - put actual het genotype here:
|
||||
genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
|
||||
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
|
||||
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
|
||||
genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
|
||||
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
|
||||
|
||||
}
|
||||
Set<String> filters = null;
|
||||
|
|
@ -1074,8 +1071,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
filters = new HashSet<String>();
|
||||
filters.add("NoCall");
|
||||
}
|
||||
VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
|
||||
-1.0 /* log error */, filters, null, refBases[(int)start-1]);
|
||||
VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
|
||||
.genotypes(genotypes).filters(filters).referenceBaseForIndel(refBases[(int)start-1]).make();
|
||||
vcf.add(vc);
|
||||
}
|
||||
|
||||
|
|
@ -1147,14 +1144,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
homRefAlleles.add( alleles.get(0));
|
||||
homRefAlleles.add( alleles.get(0));
|
||||
|
||||
Map<String,Genotype> genotypes = new HashMap<String,Genotype>();
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
|
||||
for ( String sample : normalSamples ) {
|
||||
genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false));
|
||||
genotypes.add(new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false));
|
||||
}
|
||||
|
||||
for ( String sample : tumorSamples ) {
|
||||
genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) );
|
||||
genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) );
|
||||
}
|
||||
|
||||
Set<String> filters = null;
|
||||
|
|
@ -1171,8 +1168,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
filters.add("TCov");
|
||||
}
|
||||
|
||||
VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
|
||||
-1.0 /* log error */, filters, attrs, refBases[(int)start-1]);
|
||||
VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
|
||||
.genotypes(genotypes).filters(filters).attributes(attrs).referenceBaseForIndel(refBases[(int)start-1]).make();
|
||||
vcf.add(vc);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import java.util.Arrays;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class BaseArray implements Comparable<BaseArray> {
|
||||
abstract class BaseArray implements Comparable<BaseArray> {
|
||||
protected Byte[] bases;
|
||||
|
||||
public BaseArray(byte[] bases) {
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import java.util.Iterator;
|
|||
/*
|
||||
* CardinalityCounter object allows user to iterate over all assignment of arbitrary-cardinality variables.
|
||||
*/
|
||||
public class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
|
||||
class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
|
||||
private int[] cards;
|
||||
private int[] valList;
|
||||
private boolean hasNext;
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import java.util.NoSuchElementException;
|
|||
It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned
|
||||
to save the current pointer for a later time (while the original iterator can continue to iterate).
|
||||
*/
|
||||
public class CloneableIteratorLinkedList<E> {
|
||||
class CloneableIteratorLinkedList<E> {
|
||||
private CloneableIteratorDoublyLinkedNode<E> first;
|
||||
private CloneableIteratorDoublyLinkedNode<E> last;
|
||||
private int size;
|
||||
|
|
|
|||
|
|
@ -21,13 +21,13 @@
|
|||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.utils;
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
public class DisjointSet {
|
||||
class DisjointSet {
|
||||
private ItemNode[] nodes;
|
||||
|
||||
public DisjointSet(int numItems) {
|
||||
|
|
@ -27,7 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class Haplotype extends BaseArray implements Cloneable {
|
||||
class Haplotype extends BaseArray implements Cloneable {
|
||||
public Haplotype(byte[] bases) {
|
||||
super(bases);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,133 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD.
|
||||
*/
|
||||
@Allows(value = {DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.REFERENCE})
|
||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
||||
|
||||
public class MergeMNPsWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Output(doc = "File to which variants should be written", required = true)
|
||||
protected VCFWriter writer = null;
|
||||
private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
|
||||
|
||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
||||
protected int maxGenomicDistanceForMNP = 1;
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
|
||||
public RodBinding<VariantContext> variants;
|
||||
|
||||
public void initialize() {
|
||||
initializeVcfWriter();
|
||||
}
|
||||
|
||||
private void initializeVcfWriter() {
|
||||
// false <-> don't take control of writer, since didn't create it:
|
||||
vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false);
|
||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
||||
|
||||
// setup the header fields:
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
|
||||
vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(variants.getName()).getGenotypeSamples())));
|
||||
}
|
||||
|
||||
public boolean generateExtendedEvents() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each site, send it to be (possibly) merged with previously observed sites.
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return dummy Integer
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
for (VariantContext vc : tracker.getValues(variants, context.getLocation()))
|
||||
writeVCF(vc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void writeVCF(VariantContext vc) {
|
||||
WriteVCF.writeVCF(vc, vcMergerWriter, logger);
|
||||
}
|
||||
|
||||
public Integer reduce(Integer result, Integer total) {
|
||||
if (result == null)
|
||||
return total;
|
||||
|
||||
return total + result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Release any VariantContexts not yet processed.
|
||||
*
|
||||
* @param result Empty for now...
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
vcMergerWriter.close();
|
||||
|
||||
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
||||
System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
|
||||
System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
|
||||
System.out.println(vcMergerWriter.getAltAlleleStats());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -33,10 +33,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -44,7 +41,7 @@ import java.util.*;
|
|||
|
||||
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
|
||||
|
||||
public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||
class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
||||
private VCFWriter innerWriter;
|
||||
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
|
@ -52,7 +49,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
private ReferenceSequenceFile referenceFileForMNPmerging;
|
||||
|
||||
private VariantContextMergeRule vcMergeRule;
|
||||
private VariantContextUtils.AlleleMergeRule alleleMergeRule;
|
||||
private PhasingUtils.AlleleMergeRule alleleMergeRule;
|
||||
|
||||
private String useSingleSample = null;
|
||||
|
||||
|
|
@ -71,7 +68,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
// Should we call innerWriter.close() in close()
|
||||
private boolean takeOwnershipOfInner;
|
||||
|
||||
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, PhasingUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||
this.innerWriter = innerWriter;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
try {
|
||||
|
|
@ -122,7 +119,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
if (useSingleSample != null) { // only want to output context for one sample
|
||||
Genotype sampGt = vc.getGenotype(useSingleSample);
|
||||
if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP!
|
||||
vc = vc.subContextFromGenotypes(sampGt);
|
||||
vc = vc.subContextFromSample(sampGt.getSampleName());
|
||||
else // asked for a sample that this vc does not contain, so ignore this vc:
|
||||
return;
|
||||
}
|
||||
|
|
@ -179,14 +176,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
boolean mergedRecords = false;
|
||||
if (shouldAttemptToMerge) {
|
||||
numRecordsSatisfyingMergeRule++;
|
||||
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
|
||||
VariantContext mergedVc = PhasingUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
|
||||
|
||||
if (mergedVc != null) {
|
||||
mergedRecords = true;
|
||||
|
||||
Map<String, Object> addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc);
|
||||
addedAttribs.putAll(mergedVc.getAttributes());
|
||||
mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs);
|
||||
mergedVc = new VariantContextBuilder(mergedVc).attributes(addedAttribs).make();
|
||||
|
||||
vcfrWaitingToMerge = new VCFRecord(mergedVc, true);
|
||||
numMergedRecords++;
|
||||
|
|
@ -218,26 +215,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
filteredVcfrList.clear();
|
||||
}
|
||||
|
||||
public int getNumRecordsAttemptToMerge() {
|
||||
return numRecordsAttemptToMerge;
|
||||
}
|
||||
|
||||
public int getNumRecordsSatisfyingMergeRule() {
|
||||
return numRecordsSatisfyingMergeRule;
|
||||
}
|
||||
|
||||
public int getNumMergedRecords() {
|
||||
return numMergedRecords;
|
||||
}
|
||||
|
||||
public VariantContextMergeRule getVcMergeRule() {
|
||||
return vcMergeRule;
|
||||
}
|
||||
|
||||
public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() {
|
||||
return alleleMergeRule;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a string representation of this object.
|
||||
*
|
||||
|
|
@ -248,13 +225,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
return getClass().getName();
|
||||
}
|
||||
|
||||
public String getAltAlleleStats() {
|
||||
if (altAlleleStats == null)
|
||||
return "";
|
||||
|
||||
return "\n" + altAlleleStats.toString();
|
||||
}
|
||||
|
||||
private static class VCFRecord {
|
||||
public VariantContext vc;
|
||||
public boolean resultedFromMerge;
|
||||
|
|
@ -373,7 +343,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
if (shouldAttemptToMerge) {
|
||||
aas.numSuccessiveGenotypesAttemptedToBeMerged++;
|
||||
|
||||
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
||||
if (!PhasingUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
||||
aas.segregationUnknown++;
|
||||
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
|
||||
}
|
||||
|
|
@ -498,9 +468,9 @@ class DistanceMergeRule extends VariantContextMergeRule {
|
|||
}
|
||||
|
||||
|
||||
class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule {
|
||||
class ExistsDoubleAltAlleleMergeRule extends PhasingUtils.AlleleMergeRule {
|
||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
||||
return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
|
||||
return PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
|
@ -515,7 +485,7 @@ class SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule {
|
|||
|
||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
||||
// Must be interesting AND consistent:
|
||||
return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
|
||||
return super.allelesShouldBeMerged(vc1, vc2) && PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
|
|
|||
|
|
@ -1,236 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD.
|
||||
*/
|
||||
@Allows(value = {DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.REFERENCE})
|
||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
||||
|
||||
public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Output(doc = "File to which variants should be written", required = true)
|
||||
protected VCFWriter writer = null;
|
||||
private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
|
||||
|
||||
@Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false)
|
||||
protected int maxGenomicDistance = 1;
|
||||
|
||||
@Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false)
|
||||
protected String useSingleSample = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "emitOnlyMergedRecords", shortName = "emitOnlyMerged", doc = "Only output records that resulted from merging [For DEBUGGING purposes only - DO NOT USE, since it disregards the semantics of '|' as 'phased relative to previous non-filtered VC']; [default:false]", required = false)
|
||||
protected boolean emitOnlyMergedRecords = false;
|
||||
|
||||
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
||||
protected boolean disablePrintAlternateAlleleStatistics = false;
|
||||
|
||||
public final static String IGNORE_REFSEQ = "IGNORE";
|
||||
public final static String UNION_REFSEQ = "UNION";
|
||||
public final static String INTERSECT_REFSEQ = "INTERSECT";
|
||||
|
||||
@Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false)
|
||||
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;
|
||||
|
||||
@Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false)
|
||||
protected boolean dontRequireSomeSampleHasDoubleAltAllele = false;
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
|
||||
public RodBinding<VariantContext> variants;
|
||||
|
||||
public void initialize() {
|
||||
initializeVcfWriter();
|
||||
}
|
||||
|
||||
private void initializeVcfWriter() {
|
||||
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
|
||||
|
||||
VariantContextMergeRule vcMergeRule;
|
||||
if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
|
||||
vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser);
|
||||
else
|
||||
vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation);
|
||||
|
||||
VariantContextUtils.AlleleMergeRule alleleMergeRule;
|
||||
if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them if there is a trailing prefix of polymorphisms (i.e., upstream polymorphic site):
|
||||
alleleMergeRule = new PrefixPolymorphismMergeAllelesRule();
|
||||
else
|
||||
alleleMergeRule = new ExistsDoubleAltAlleleMergeRule();
|
||||
|
||||
// false <-> don't take control of writer, since didn't create it:
|
||||
vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
|
||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
||||
|
||||
// setup the header fields:
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
|
||||
vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(variants.getName()).getGenotypeSamples())));
|
||||
}
|
||||
|
||||
public boolean generateExtendedEvents() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each site, send it to be (possibly) merged with previously observed sites.
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return dummy Integer
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
for (VariantContext vc : tracker.getValues(variants, context.getLocation()))
|
||||
writeVCF(vc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void writeVCF(VariantContext vc) {
|
||||
WriteVCF.writeVCF(vc, vcMergerWriter, logger);
|
||||
}
|
||||
|
||||
public Integer reduce(Integer result, Integer total) {
|
||||
if (result == null)
|
||||
return total;
|
||||
|
||||
return total + result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Release any VariantContexts not yet processed.
|
||||
*
|
||||
* @param result Empty for now...
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
vcMergerWriter.close();
|
||||
|
||||
if (useSingleSample != null)
|
||||
System.out.println("Only considered single sample: " + useSingleSample);
|
||||
|
||||
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
||||
System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
|
||||
System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
|
||||
System.out.println(vcMergerWriter.getAltAlleleStats());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
enum MergeBasedOnRefSeqAnnotation {
|
||||
UNION_WITH_DIST, INTERSECT_WITH_DIST
|
||||
}
|
||||
|
||||
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
|
||||
private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;
|
||||
|
||||
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
|
||||
super(maxGenomicDistanceForMNP, genomeLocParser);
|
||||
|
||||
if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
|
||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
|
||||
else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
|
||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
|
||||
else
|
||||
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
|
||||
}
|
||||
|
||||
public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) {
|
||||
boolean withinDistance = super.shouldAttemptToMerge(vc1, vc2);
|
||||
|
||||
if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
|
||||
return withinDistance || sameGene(vc1, vc2);
|
||||
else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST
|
||||
return withinDistance && sameGene(vc1, vc2);
|
||||
}
|
||||
|
||||
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
|
||||
Set<String> names_vc1 = RefSeqDataParser.getRefSeqNames(vc1);
|
||||
Set<String> names_vc2 = RefSeqDataParser.getRefSeqNames(vc2);
|
||||
names_vc1.retainAll(names_vc2);
|
||||
|
||||
if (!names_vc1.isEmpty())
|
||||
return true;
|
||||
|
||||
// Check refseq.name2:
|
||||
Set<String> names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true);
|
||||
Set<String> names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true);
|
||||
names2_vc1.retainAll(names2_vc2);
|
||||
|
||||
return !names2_vc1.isEmpty();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
|
||||
}
|
||||
|
||||
public Map<String, Object> addToMergedAttributes(VariantContext vc1, VariantContext vc2) {
|
||||
Map<String, Object> addedAttribs = super.addToMergedAttributes(vc1, vc2);
|
||||
addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2));
|
||||
return addedAttribs;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
class PrefixPolymorphismMergeAllelesRule extends VariantContextUtils.AlleleMergeRule {
|
||||
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
|
||||
return vc1.isPolymorphic();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return super.toString() + ", there exists a polymorphism at the start of the merged allele";
|
||||
}
|
||||
}
|
||||
|
|
@ -13,10 +13,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -135,7 +132,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
private final Allele NO_CALL = Allele.create(".",false);
|
||||
private final String DUMMY_NAME = "DummySample";
|
||||
|
||||
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
|
||||
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
|
||||
|
||||
private ArrayList<Allele> getAlleles(Genotype.Type genotype){
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
|
||||
|
|
@ -165,10 +162,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
//Homozygous genotypes will be set as phased, heterozygous won't be
|
||||
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
|
||||
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
|
||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true));
|
||||
}
|
||||
else
|
||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
}
|
||||
|
||||
//Find the phase for a parent/child pair
|
||||
|
|
@ -176,8 +173,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
|
||||
//Special case for Het/Het as it is ambiguous
|
||||
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -189,23 +186,23 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
//If there is a possible phasing between the mother and child => phase
|
||||
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
|
||||
if(childTransmittedAlleleIndex > -1){
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||
childPhasedAlleles.add(childAlleles.get(0));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||
childPhasedAlleles.add(childAlleles.get(0));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||
}
|
||||
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
|
||||
parentPhasedAlleles.add(parentAlleles.get(1));
|
||||
parentPhasedAlleles.add(parentAlleles.get(0));
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||
childPhasedAlleles.add(childAlleles.get(0));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
|
||||
parentPhasedAlleles.add(parentAlleles.get(1));
|
||||
parentPhasedAlleles.add(parentAlleles.get(0));
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
|
||||
childPhasedAlleles.add(childAlleles.get(0));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
|
||||
}
|
||||
//This is a Mendelian Violation => Do not phase
|
||||
else{
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -239,7 +236,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
motherPhasedAlleles.add(motherAlleles.get(0));
|
||||
else
|
||||
motherPhasedAlleles.add(motherAlleles.get(1));
|
||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||
|
||||
//Create father's genotype
|
||||
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
|
||||
|
|
@ -248,10 +245,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
fatherPhasedAlleles.add(fatherAlleles.get(0));
|
||||
else
|
||||
fatherPhasedAlleles.add(fatherAlleles.get(1));
|
||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||
|
||||
//Create child's genotype
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
|
||||
|
||||
//Once a phased combination is found; exit
|
||||
return;
|
||||
|
|
@ -259,9 +256,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
}
|
||||
|
||||
//If this is reached then no phasing could be found
|
||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false));
|
||||
}
|
||||
|
||||
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
|
||||
|
|
@ -301,26 +298,26 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the trio genotype combination to the given trio.
|
||||
* @param ref: Reference allele
|
||||
* @param alt: Alternate allele
|
||||
* @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
|
||||
* @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
|
||||
* @param childGenotype: Genotype of the child to phase using this trio genotype combination
|
||||
* @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
|
||||
* @param phasedGenotypes: An ArrayList<Genotype> to which the newly phased genotypes are added in the following order: Mother, Father, Child
|
||||
*/
|
||||
/**
|
||||
* Applies the trio genotype combination to the given trio.
|
||||
* @param ref: Reference allele
|
||||
* @param alt: Alternate allele
|
||||
* @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
|
||||
* @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
|
||||
* @param childGenotype: Genotype of the child to phase using this trio genotype combination
|
||||
* @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
|
||||
* @param phasedGenotypes: An ArrayList<Genotype> to which the newly phased genotypes are added in the following order: Mother, Father, Child
|
||||
*/
|
||||
public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList<Genotype> phasedGenotypes){
|
||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
|
||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
|
||||
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
|
||||
}
|
||||
|
||||
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
|
||||
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
|
||||
|
||||
int phredScoreTransmission = -1;
|
||||
if(transmissionProb != NO_TRANSMISSION_PROB)
|
||||
int phredScoreTransmission = -1;
|
||||
if(transmissionProb != NO_TRANSMISSION_PROB)
|
||||
phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
|
||||
|
||||
//Handle null, missing and unavailable genotypes
|
||||
|
|
@ -336,27 +333,27 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
if(transmissionProb>NO_TRANSMISSION_PROB)
|
||||
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
|
||||
|
||||
ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);
|
||||
for(Allele allele : phasedGenotype.getAlleles()){
|
||||
if(allele.isReference())
|
||||
phasedAlleles.add(refAllele);
|
||||
else if(allele.isNonReference())
|
||||
phasedAlleles.add(altAllele);
|
||||
//At this point there should not be any other alleles left
|
||||
else
|
||||
throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
|
||||
ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);
|
||||
for(Allele allele : phasedGenotype.getAlleles()){
|
||||
if(allele.isReference())
|
||||
phasedAlleles.add(refAllele);
|
||||
else if(allele.isNonReference())
|
||||
phasedAlleles.add(altAllele);
|
||||
//At this point there should not be any other alleles left
|
||||
else
|
||||
throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//Compute the new Log10Error if the genotype is different from the original genotype
|
||||
double negLog10Error;
|
||||
if(genotype.getType() == phasedGenotype.getType())
|
||||
negLog10Error = genotype.getNegLog10PError();
|
||||
else
|
||||
negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
|
||||
//Compute the new Log10Error if the genotype is different from the original genotype
|
||||
double log10Error;
|
||||
if(genotype.getType() == phasedGenotype.getType())
|
||||
log10Error = genotype.getLog10PError();
|
||||
else
|
||||
log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
|
||||
|
||||
return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
|
||||
}
|
||||
return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -404,14 +401,14 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
}
|
||||
else{
|
||||
for(Sample familyMember : family){
|
||||
parents = familyMember.getParents();
|
||||
if(parents.size()>0){
|
||||
parents = familyMember.getParents();
|
||||
if(parents.size()>0){
|
||||
if(family.containsAll(parents))
|
||||
this.trios.add(familyMember);
|
||||
else
|
||||
logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -426,11 +423,11 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
|
||||
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class);
|
||||
for(Genotype.Type mother : Genotype.Type.values()){
|
||||
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
|
||||
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
|
||||
for(Genotype.Type father : Genotype.Type.values()){
|
||||
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
|
||||
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
|
||||
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
|
||||
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
|
||||
for(Genotype.Type father : Genotype.Type.values()){
|
||||
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
|
||||
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
|
||||
for(Genotype.Type child : Genotype.Type.values()){
|
||||
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
|
||||
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
|
||||
|
|
@ -671,9 +668,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
else
|
||||
phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
|
||||
|
||||
//Return the phased genotypes
|
||||
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
|
||||
return bestMVCount.get(configuration_index);
|
||||
//Return the phased genotypes
|
||||
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
|
||||
return bestMVCount.get(configuration_index);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -682,14 +679,14 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
|
||||
//Increment metrics counters
|
||||
if(parent.isCalled() && child.isCalled()){
|
||||
counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
|
||||
if(parent.isPhased())
|
||||
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
|
||||
else{
|
||||
counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
|
||||
if(parent.isPhased())
|
||||
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
|
||||
else{
|
||||
counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
|
||||
if(parent.isHet() && child.isHet())
|
||||
counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
|
||||
}
|
||||
|
|
@ -700,21 +697,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
|
||||
//Increment metrics counters
|
||||
if(mother.isCalled() && father.isCalled() && child.isCalled()){
|
||||
counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
|
||||
if(mother.isPhased())
|
||||
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
|
||||
counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
|
||||
if(mother.isPhased())
|
||||
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
|
||||
|
||||
else{
|
||||
if(mvCount > 0){
|
||||
if(mvCount >1)
|
||||
else{
|
||||
if(mvCount > 0){
|
||||
if(mvCount >1)
|
||||
counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
|
||||
else
|
||||
counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
|
||||
}
|
||||
else if(mother.isHet() && father.isHet() && child.isHet())
|
||||
counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
|
||||
else
|
||||
counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
|
||||
}
|
||||
else if(mother.isHet() && father.isHet() && child.isHet())
|
||||
counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
|
||||
|
||||
}
|
||||
}
|
||||
}else{
|
||||
counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
|
||||
}
|
||||
|
|
@ -749,11 +746,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
|
||||
if (tracker != null) {
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
|
||||
Map<String, Genotype> genotypeMap = vc.getGenotypes();
|
||||
|
||||
int mvCount;
|
||||
|
||||
GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes());
|
||||
for (Sample sample : trios) {
|
||||
Genotype mother = vc.getGenotype(sample.getMaternalID());
|
||||
Genotype father = vc.getGenotype(sample.getPaternalID());
|
||||
|
|
@ -764,18 +759,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
continue;
|
||||
|
||||
ArrayList<Genotype> trioGenotypes = new ArrayList<Genotype>(3);
|
||||
mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
||||
final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
||||
|
||||
Genotype phasedMother = trioGenotypes.get(0);
|
||||
Genotype phasedFather = trioGenotypes.get(1);
|
||||
Genotype phasedChild = trioGenotypes.get(2);
|
||||
|
||||
//Fill the genotype map with the new genotypes and increment metrics counters
|
||||
genotypeMap.put(phasedChild.getSampleName(),phasedChild);
|
||||
genotypesContext.replace(phasedChild);
|
||||
if(mother != null){
|
||||
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
|
||||
genotypesContext.replace(phasedMother);
|
||||
if(father != null){
|
||||
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
|
||||
genotypesContext.replace(phasedFather);
|
||||
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
||||
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||
|
|
@ -789,24 +784,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
}
|
||||
}
|
||||
else{
|
||||
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
|
||||
genotypesContext.replace(phasedFather);
|
||||
updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
|
||||
if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
||||
}
|
||||
}
|
||||
|
||||
//Report violation if set so
|
||||
//TODO: ADAPT FOR PAIRS TOO!!
|
||||
if(mvCount>0 && mvFile != null)
|
||||
mvFile.println(mvfLine);
|
||||
|
||||
}
|
||||
|
||||
|
||||
VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap);
|
||||
|
||||
vcfWriter.add(newvc);
|
||||
builder.genotypes(genotypesContext);
|
||||
vcfWriter.add(builder.make());
|
||||
}
|
||||
return metricsCounters;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,12 +23,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.DisjointSet;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
// Represents an undirected graph with no self-edges:
|
||||
public class PhasingGraph implements Iterable<PhasingGraphEdge> {
|
||||
class PhasingGraph implements Iterable<PhasingGraphEdge> {
|
||||
private Neighbors[] adj;
|
||||
|
||||
public PhasingGraph(int numVertices) {
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
|
|||
/*
|
||||
Edge class for PhasingGraph
|
||||
*/
|
||||
public class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
|
||||
class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
|
||||
protected int v1;
|
||||
protected int v2;
|
||||
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class PhasingRead extends BaseArray {
|
||||
class PhasingRead extends BaseArray {
|
||||
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
|
||||
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
|
||||
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
|
||||
|
|
|
|||
|
|
@ -0,0 +1,382 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* [Short one sentence description of this walker]
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Functionality of this walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Input description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
class PhasingUtils {
|
||||
static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
|
||||
if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2))
|
||||
return null;
|
||||
|
||||
// Check that it's logically possible to merge the VCs:
|
||||
if (!allSamplesAreMergeable(vc1, vc2))
|
||||
return null;
|
||||
|
||||
// Check if there's a "point" in merging the VCs (e.g., annotations could be changed)
|
||||
if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2))
|
||||
return null;
|
||||
|
||||
return reallyMergeIntoMNP(vc1, vc2, referenceFile);
|
||||
}
|
||||
|
||||
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
|
||||
int startInter = vc1.getEnd() + 1;
|
||||
int endInter = vc2.getStart() - 1;
|
||||
byte[] intermediateBases = null;
|
||||
if (startInter <= endInter) {
|
||||
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
|
||||
StringUtil.toUpperCase(intermediateBases);
|
||||
}
|
||||
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
|
||||
|
||||
GenotypesContext mergedGenotypes = GenotypesContext.create();
|
||||
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||
|
||||
List<Allele> site1Alleles = gt1.getAlleles();
|
||||
List<Allele> site2Alleles = gt2.getAlleles();
|
||||
|
||||
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
|
||||
|
||||
/* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
|
||||
we preserve phase information (if any) relative to whatever precedes vc1:
|
||||
*/
|
||||
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||
for (Allele all1 : site1Alleles) {
|
||||
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||
|
||||
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
|
||||
mergedAllelesForSample.add(mergedAllele);
|
||||
}
|
||||
|
||||
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
|
||||
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
|
||||
|
||||
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
|
||||
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
|
||||
if (phaseQual.PQ != null)
|
||||
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
|
||||
|
||||
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
|
||||
mergedGenotypes.add(mergedGt);
|
||||
}
|
||||
|
||||
String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
|
||||
double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
|
||||
Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
|
||||
Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);
|
||||
|
||||
// ids
|
||||
List<String> mergedIDs = new ArrayList<String>();
|
||||
if ( vc1.hasID() ) mergedIDs.add(vc1.getID());
|
||||
if ( vc2.hasID() ) mergedIDs.add(vc2.getID());
|
||||
String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
|
||||
|
||||
VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs);
|
||||
VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true);
|
||||
return mergedBuilder.make();
|
||||
}
|
||||
|
||||
static String mergeVariantContextNames(String name1, String name2) {
|
||||
return name1 + "_" + name2;
|
||||
}
|
||||
|
||||
static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
|
||||
Map<String, Object> mergedAttribs = new HashMap<String, Object>();
|
||||
|
||||
List<VariantContext> vcList = new LinkedList<VariantContext>();
|
||||
vcList.add(vc1);
|
||||
vcList.add(vc2);
|
||||
|
||||
String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
|
||||
for (String orAttrib : MERGE_OR_ATTRIBS) {
|
||||
boolean attribVal = false;
|
||||
for (VariantContext vc : vcList) {
|
||||
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
|
||||
if (attribVal) // already true, so no reason to continue:
|
||||
break;
|
||||
}
|
||||
mergedAttribs.put(orAttrib, attribVal);
|
||||
}
|
||||
|
||||
return mergedAttribs;
|
||||
}
|
||||
|
||||
static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
|
||||
GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1);
|
||||
GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2);
|
||||
|
||||
if (!loc1.onSameContig(loc2))
|
||||
throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
|
||||
|
||||
if (!loc1.isBefore(loc2))
|
||||
throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2");
|
||||
|
||||
if (vc1.isFiltered() || vc2.isFiltered())
|
||||
return false;
|
||||
|
||||
if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets
|
||||
return false;
|
||||
|
||||
if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) {
|
||||
for (final Genotype gt : vc.getGenotypes()) {
|
||||
if (gt.isNoCall() || gt.isFiltered())
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
|
||||
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
|
||||
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||
|
||||
if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
|
||||
if (gt1.getPloidy() != gt2.getPloidy())
|
||||
return false;
|
||||
|
||||
/* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard].
|
||||
|
||||
HOWEVER, EVEN if this is not the case, but gt1.isHom(),
|
||||
it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1.
|
||||
*/
|
||||
return (gt2.isPhased() || gt2.isHom() || gt1.isHom());
|
||||
}
|
||||
|
||||
static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
|
||||
if (gt2.isPhased() || gt2.isHom())
|
||||
return new PhaseAndQuality(gt1); // maintain the phase of gt1
|
||||
|
||||
if (!gt1.isHom())
|
||||
throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");
|
||||
|
||||
/* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype
|
||||
|
||||
For example, if we're merging the third Genotype with the second one:
|
||||
0/1
|
||||
1|1
|
||||
0/1
|
||||
|
||||
Then, we want to output:
|
||||
0/1
|
||||
1/2
|
||||
*/
|
||||
return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
|
||||
}
|
||||
|
||||
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
|
||||
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||
|
||||
List<Allele> site1Alleles = gt1.getAlleles();
|
||||
List<Allele> site2Alleles = gt2.getAlleles();
|
||||
|
||||
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||
for (Allele all1 : site1Alleles) {
|
||||
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
|
||||
|
||||
if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
|
||||
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
|
||||
Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
|
||||
Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
|
||||
|
||||
// Note the segregation of the alleles for the reference genome:
|
||||
allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
|
||||
allele2ToAllele1.put(vc2.getReference(), vc1.getReference());
|
||||
|
||||
// Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
|
||||
for (final Genotype gt1 : vc1.getGenotypes()) {
|
||||
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
|
||||
|
||||
List<Allele> site1Alleles = gt1.getAlleles();
|
||||
List<Allele> site2Alleles = gt2.getAlleles();
|
||||
|
||||
Iterator<Allele> all2It = site2Alleles.iterator();
|
||||
for (Allele all1 : site1Alleles) {
|
||||
Allele all2 = all2It.next();
|
||||
|
||||
Allele all1To2 = allele1ToAllele2.get(all1);
|
||||
if (all1To2 == null)
|
||||
allele1ToAllele2.put(all1, all2);
|
||||
else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
|
||||
return false;
|
||||
|
||||
Allele all2To1 = allele2ToAllele1.get(all2);
|
||||
if (all2To1 == null)
|
||||
allele2ToAllele1.put(all2, all1);
|
||||
else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
abstract static class AlleleMergeRule {
|
||||
// vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
|
||||
abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);
|
||||
|
||||
public String toString() {
|
||||
return "all samples are mergeable";
|
||||
}
|
||||
}
|
||||
|
||||
static class AlleleOneAndTwo {
|
||||
private Allele all1;
|
||||
private Allele all2;
|
||||
|
||||
public AlleleOneAndTwo(Allele all1, Allele all2) {
|
||||
this.all1 = all1;
|
||||
this.all2 = all2;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return all1.hashCode() + all2.hashCode();
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof AlleleOneAndTwo))
|
||||
return false;
|
||||
|
||||
AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
|
||||
return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
|
||||
}
|
||||
}
|
||||
|
||||
static class MergedAllelesData {
|
||||
private Map<AlleleOneAndTwo, Allele> mergedAlleles;
|
||||
private byte[] intermediateBases;
|
||||
private int intermediateLength;
|
||||
|
||||
public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
|
||||
this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
|
||||
this.intermediateBases = intermediateBases;
|
||||
this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0;
|
||||
|
||||
this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
|
||||
}
|
||||
|
||||
public Allele ensureMergedAllele(Allele all1, Allele all2) {
|
||||
return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
|
||||
}
|
||||
|
||||
private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
|
||||
AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
|
||||
Allele mergedAllele = mergedAlleles.get(all12);
|
||||
|
||||
if (mergedAllele == null) {
|
||||
byte[] bases1 = all1.getBases();
|
||||
byte[] bases2 = all2.getBases();
|
||||
|
||||
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
|
||||
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
|
||||
if (intermediateBases != null)
|
||||
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
|
||||
System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);
|
||||
|
||||
mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
|
||||
mergedAlleles.put(all12, mergedAllele);
|
||||
}
|
||||
|
||||
return mergedAllele;
|
||||
}
|
||||
|
||||
public Set<Allele> getAllMergedAlleles() {
|
||||
return new HashSet<Allele>(mergedAlleles.values());
|
||||
}
|
||||
}
|
||||
|
||||
static class PhaseAndQuality {
|
||||
public boolean isPhased;
|
||||
public Double PQ = null;
|
||||
|
||||
public PhaseAndQuality(Genotype gt) {
|
||||
this.isPhased = gt.isPhased();
|
||||
if (this.isPhased) {
|
||||
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
|
||||
if ( this.PQ == -1 ) this.PQ = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
|
|||
/* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values
|
||||
with precision (prevents underflow by representing in log10 space).
|
||||
*/
|
||||
public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
|
||||
class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
|
||||
private static final double EQUALS_THRESH = 1e-6;
|
||||
private static final double INFINITY = Double.POSITIVE_INFINITY;
|
||||
|
||||
|
|
|
|||
|
|
@ -34,17 +34,13 @@ import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.DisjointSet;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -125,7 +121,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
public int MIN_MAPPING_QUALITY_SCORE = 20;
|
||||
|
||||
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
|
||||
protected List<String> samplesToPhase = null;
|
||||
protected Set
|
||||
<String> samplesToPhase = null;
|
||||
|
||||
private GenomeLoc mostDownstreamLocusReached = null;
|
||||
|
||||
|
|
@ -275,10 +272,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList(PQ_KEY));
|
||||
|
||||
private VariantContext reduceVCToSamples(VariantContext vc, List<String> samplesToPhase) {
|
||||
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
|
||||
// for ( String sample : samplesToPhase )
|
||||
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
|
||||
VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values());
|
||||
VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
|
||||
// logger.debug("original VC = " + vc);
|
||||
// logger.debug("sub VC = " + subvc);
|
||||
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
|
||||
|
|
@ -355,17 +352,16 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
UnfinishedVariantContext uvc = uvr.unfinishedVariant;
|
||||
|
||||
// Perform per-sample phasing:
|
||||
Map<String, Genotype> sampGenotypes = vc.getGenotypes();
|
||||
GenotypesContext sampGenotypes = vc.getGenotypes();
|
||||
Map<String, PhaseCounts> samplePhaseStats = new TreeMap<String, PhaseCounts>();
|
||||
for (Map.Entry<String, Genotype> sampGtEntry : sampGenotypes.entrySet()) {
|
||||
String samp = sampGtEntry.getKey();
|
||||
Genotype gt = sampGtEntry.getValue();
|
||||
for (final Genotype gt : sampGenotypes) {
|
||||
String samp = gt.getSampleName();
|
||||
|
||||
if (DEBUG) logger.debug("sample = " + samp);
|
||||
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
||||
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
||||
// true <-> can trivially phase a hom site relative to ANY previous site:
|
||||
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
||||
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true);
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
}
|
||||
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
|
||||
|
|
@ -401,7 +397,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
|
||||
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
|
||||
gtAttribs.put(PQ_KEY, pr.phaseQuality);
|
||||
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
|
||||
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
}
|
||||
|
||||
|
|
@ -421,7 +417,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
if (genotypesArePhased) {
|
||||
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes());
|
||||
handledGtAttribs.put(PQ_KEY, pr.phaseQuality);
|
||||
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
|
||||
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
|
||||
interiorUvc.setGenotype(samp, phasedHomGt);
|
||||
}
|
||||
}
|
||||
|
|
@ -1055,7 +1051,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
private void writeVCF(VariantContext vc) {
|
||||
if (samplesToPhase == null || vc.isNotFiltered())
|
||||
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
|
||||
WriteVCF.writeVCF(vc, writer, logger);
|
||||
writer.add(vc);
|
||||
}
|
||||
|
||||
public static boolean processVariantInPhasing(VariantContext vc) {
|
||||
|
|
@ -1126,25 +1122,34 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
private int start;
|
||||
private int stop;
|
||||
private Collection<Allele> alleles;
|
||||
private Map<String, Genotype> genotypes;
|
||||
private double negLog10PError;
|
||||
private Map<String,Genotype> genotypes;
|
||||
private double log10PError;
|
||||
private Set<String> filters;
|
||||
private Map<String, Object> attributes;
|
||||
private String id;
|
||||
|
||||
public UnfinishedVariantContext(VariantContext vc) {
|
||||
this.name = vc.getSource();
|
||||
this.id = vc.getID();
|
||||
this.contig = vc.getChr();
|
||||
this.start = vc.getStart();
|
||||
this.stop = vc.getEnd();
|
||||
this.alleles = vc.getAlleles();
|
||||
this.genotypes = new HashMap<String, Genotype>(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable
|
||||
this.negLog10PError = vc.getNegLog10PError();
|
||||
|
||||
this.genotypes = new HashMap<String, Genotype>();
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
this.genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
|
||||
this.log10PError = vc.getLog10PError();
|
||||
this.filters = vc.filtersWereApplied() ? vc.getFilters() : null;
|
||||
this.attributes = new HashMap<String, Object>(vc.getAttributes());
|
||||
}
|
||||
|
||||
public VariantContext toVariantContext() {
|
||||
return new VariantContext(name, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
GenotypesContext gc = GenotypesContext.copy(this.genotypes.values());
|
||||
return new VariantContextBuilder(name, contig, start, stop, alleles).id(id)
|
||||
.genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make();
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
|
|
@ -1156,7 +1161,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
public void setGenotype(String sample, Genotype newGt) {
|
||||
genotypes.put(sample, newGt);
|
||||
this.genotypes.put(sample, newGt);
|
||||
}
|
||||
|
||||
public void setPhasingInconsistent() {
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
public class ReadBase {
|
||||
class ReadBase {
|
||||
public String readName;
|
||||
public byte base;
|
||||
public int mappingQual;
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
|
|||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class ReadBasesAtPosition implements Iterable<ReadBase> {
|
||||
class ReadBasesAtPosition implements Iterable<ReadBase> {
|
||||
// list of: <read name, base>
|
||||
private LinkedList<ReadBase> bases;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,189 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
|
||||
*/
|
||||
public class RefSeqDataParser {
|
||||
private static String REFSEQ_PREFIX = "refseq.";
|
||||
|
||||
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
|
||||
private static String NAME_KEY = REFSEQ_PREFIX + "name";
|
||||
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
|
||||
|
||||
private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY};
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
|
||||
String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
|
||||
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
|
||||
|
||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
|
||||
if (numRecords != -1) {
|
||||
boolean done = false;
|
||||
|
||||
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!done) {
|
||||
for (int i = 1; i <= numRecords; i++) {
|
||||
String key = nameKeyToUseMultiplePrefix + i;
|
||||
String name = vc.getAttributeAsString(key, null);
|
||||
if (name != null)
|
||||
entriesToNames.put(key, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
else { // no entry with the # of records:
|
||||
String name = vc.getAttributeAsString(nameKeyToUse, null);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
}
|
||||
else { // Check all INFO fields for a match (if there are multiple entries):
|
||||
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if (key.startsWith(nameKeyToUseMultiplePrefix))
|
||||
entriesToNames.put(key, entry.getValue().toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
return entriesToNames;
|
||||
}
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
|
||||
return getRefSeqEntriesToNames(vc, false);
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
|
||||
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc) {
|
||||
return getRefSeqNames(vc, false);
|
||||
}
|
||||
|
||||
public static Map<String, Object> getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) {
|
||||
Map<String, Object> refSeqNameAttribs = new HashMap<String, Object>();
|
||||
|
||||
Map<String, RefSeqEntry> entriesMap1 = getAllRefSeqEntriesByName(vc1);
|
||||
Map<String, RefSeqEntry> entriesMap2 = getAllRefSeqEntriesByName(vc2);
|
||||
|
||||
Set<String> commonNames = entriesMap1.keySet();
|
||||
commonNames.retainAll(entriesMap2.keySet());
|
||||
boolean addSuffix = commonNames.size() > 1;
|
||||
int nextCount = 1;
|
||||
|
||||
for (String name : commonNames) {
|
||||
RefSeqEntry refseq1 = entriesMap1.get(name);
|
||||
RefSeqEntry refseq2 = entriesMap2.get(name);
|
||||
|
||||
String keySuffix = "";
|
||||
if (addSuffix)
|
||||
keySuffix = "_" + nextCount;
|
||||
|
||||
boolean added = false;
|
||||
for (String key : NAME_KEYS) {
|
||||
Object obj1 = refseq1.info.get(key);
|
||||
Object obj2 = refseq2.info.get(key);
|
||||
if (obj1 != null && obj2 != null && obj1.equals(obj2)) {
|
||||
added = true;
|
||||
String useKey = key + keySuffix;
|
||||
refSeqNameAttribs.put(useKey, obj1);
|
||||
}
|
||||
}
|
||||
if (added)
|
||||
nextCount++;
|
||||
}
|
||||
int totalCount = nextCount - 1; // since incremented count one extra time
|
||||
if (totalCount > 1)
|
||||
refSeqNameAttribs.put(NUM_RECORDS_KEY, totalCount);
|
||||
|
||||
return refSeqNameAttribs;
|
||||
}
|
||||
|
||||
public static Map<String, Object> removeRefSeqAttributes(Map<String, Object> attributes) {
|
||||
Map<String, Object> removedRefSeqAttributes = new HashMap<String, Object>(attributes);
|
||||
|
||||
Iterator<Map.Entry<String, Object>> attrIt = removedRefSeqAttributes.entrySet().iterator();
|
||||
while (attrIt.hasNext()) {
|
||||
String key = attrIt.next().getKey();
|
||||
if (key.startsWith(REFSEQ_PREFIX))
|
||||
attrIt.remove();
|
||||
}
|
||||
|
||||
return removedRefSeqAttributes;
|
||||
}
|
||||
|
||||
private static Map<String, RefSeqEntry> getAllRefSeqEntriesByName(VariantContext vc) {
|
||||
Map<String, RefSeqEntry> nameToEntries = new TreeMap<String, RefSeqEntry>();
|
||||
|
||||
List<RefSeqEntry> allEntries = getAllRefSeqEntries(vc);
|
||||
for (RefSeqEntry entry : allEntries) {
|
||||
Object name = entry.info.get(NAME_KEY);
|
||||
if (name != null)
|
||||
nameToEntries.put(name.toString(), entry);
|
||||
}
|
||||
|
||||
return nameToEntries;
|
||||
}
|
||||
|
||||
// Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
|
||||
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
|
||||
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
|
||||
|
||||
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
|
||||
String entry = entryToName.getKey();
|
||||
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
|
||||
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
|
||||
}
|
||||
|
||||
return allRefSeq;
|
||||
}
|
||||
|
||||
private static class RefSeqEntry {
|
||||
public Map<String, Object> info;
|
||||
|
||||
public RefSeqEntry(VariantContext vc, String entrySuffix) {
|
||||
this.info = new HashMap<String, Object>();
|
||||
|
||||
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
|
||||
String key = attribEntry.getKey();
|
||||
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
|
||||
String genericKey = key.replaceAll(entrySuffix, "");
|
||||
this.info.put(genericKey, attribEntry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
||||
public class SNPallelePair extends AllelePair {
|
||||
class SNPallelePair extends AllelePair {
|
||||
|
||||
public SNPallelePair(Genotype gt) {
|
||||
super(gt);
|
||||
|
|
|
|||
|
|
@ -1,34 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
public class WriteVCF {
|
||||
public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) {
|
||||
writer.add(vc);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,140 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadPairWalker;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Counts the number of read pairs encountered in a file sorted in
|
||||
* query name order. Breaks counts down by total pairs and number
|
||||
* of paired reads.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of pairs seen.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountPairs \
|
||||
* -o output.txt \
|
||||
* -I input.bam
|
||||
* </pre>
|
||||
*
|
||||
* @author mhanna
|
||||
*/
|
||||
public class CountPairsWalker extends ReadPairWalker<Integer,Long> {
|
||||
@Output
|
||||
private PrintStream out;
|
||||
|
||||
/**
|
||||
* How many reads are the first in a pair, based on flag 0x0040 from the SAM spec.
|
||||
*/
|
||||
private long firstOfPair = 0;
|
||||
|
||||
/**
|
||||
* How many reads are the second in a pair, based on flag 0x0080 from the SAM spec.
|
||||
*/
|
||||
private long secondOfPair = 0;
|
||||
|
||||
/**
|
||||
* A breakdown of the total number of reads seen with exactly the same read name.
|
||||
*/
|
||||
private List<Long> pairCountsByType = new ExpandingArrayList<Long>();
|
||||
|
||||
/**
|
||||
* Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser.
|
||||
* @param reads Collection of reads having the same name.
|
||||
* @return Semantics defined by implementer.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(Collection<SAMRecord> reads) {
|
||||
if(pairCountsByType.get(reads.size()) != null)
|
||||
pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1);
|
||||
else
|
||||
pairCountsByType.set(reads.size(),1L);
|
||||
|
||||
for(SAMRecord read: reads) {
|
||||
if(read.getFirstOfPairFlag()) firstOfPair++;
|
||||
if(read.getSecondOfPairFlag()) secondOfPair++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* No pairs at the beginning of a traversal.
|
||||
* @return 0 always.
|
||||
*/
|
||||
@Override
|
||||
public Long reduceInit() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
/**
|
||||
* Combine number of pairs seen in this iteration (always 1) with total number of pairs
|
||||
* seen in previous iterations.
|
||||
* @param value Pairs in this iteration (1), from the map function.
|
||||
* @param sum Count of all pairs in prior iterations.
|
||||
* @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1).
|
||||
*/
|
||||
@Override
|
||||
public Long reduce(Integer value, Long sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print summary statistics over the entire traversal.
|
||||
* @param sum A count of all read pairs viewed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Long sum) {
|
||||
out.printf("Total number of pairs : %d%n",sum);
|
||||
out.printf("Total number of first reads in pair : %d%n",firstOfPair);
|
||||
out.printf("Total number of second reads in pair: %d%n",secondOfPair);
|
||||
for(int i = 1; i < pairCountsByType.size(); i++) {
|
||||
if(pairCountsByType.get(i) == null)
|
||||
continue;
|
||||
out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.NGSPlatform;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
|
|||
|
|
@ -39,8 +39,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
|
@ -466,9 +466,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
|||
|
||||
if (vcfWriter != null && writeVariant) {
|
||||
if (!vcComp.hasAttribute("callStatus")) {
|
||||
MutableVariantContext mvc = new MutableVariantContext(vcComp);
|
||||
mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
|
||||
vcfWriter.add(mvc);
|
||||
vcfWriter.add(new VariantContextBuilder(vcComp).attribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF").make());
|
||||
}
|
||||
else
|
||||
vcfWriter.add(vcComp);
|
||||
|
|
|
|||
|
|
@ -260,7 +260,7 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
}
|
||||
} else /* (mask != null && validate == null ) */ {
|
||||
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
|
||||
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )) {
|
||||
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
||||
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
|
||||
sequenceInvalid = true;
|
||||
|
|
@ -281,7 +281,7 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
|||
sequence.append('N');
|
||||
indelCounter--;
|
||||
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){
|
||||
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )){
|
||||
logger.debug("SNP in mask found at " + ref.getLocus().toString());
|
||||
|
||||
if ( lowerCaseSNPs ) {
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -186,7 +187,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
* File containing tribble-readable features for the IntervalStratificiation
|
||||
*/
|
||||
@Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false)
|
||||
protected IntervalBinding<Feature> intervalsFile = null;
|
||||
public IntervalBinding<Feature> intervalsFile = null;
|
||||
|
||||
// Variables
|
||||
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
|
||||
|
|
@ -264,9 +265,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
|
||||
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
|
||||
for ( VariantStratifier vs : getStratificationObjects() ) {
|
||||
if ( vs.getClass().getSimpleName().equals("Filter") )
|
||||
if ( vs.getName().equals("Filter") )
|
||||
byFilterIsEnabled = true;
|
||||
else if ( vs.getClass().getSimpleName().equals("Sample") )
|
||||
else if ( vs.getName().equals("Sample") )
|
||||
perSampleIsEnabled = true;
|
||||
}
|
||||
|
||||
|
|
@ -311,16 +312,17 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());
|
||||
|
||||
// --------- track --------- sample - VariantContexts -
|
||||
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
|
||||
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false);
|
||||
HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
|
||||
HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false);
|
||||
|
||||
// for each eval track
|
||||
for ( final RodBinding<VariantContext> evalRod : evals ) {
|
||||
final HashMap<String, Set<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap<String, Set<VariantContext>>(0);
|
||||
final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
|
||||
final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;
|
||||
|
||||
// for each sample stratifier
|
||||
for ( final String sampleName : sampleNamesForStratification ) {
|
||||
Set<VariantContext> evalSetBySample = evalSet.get(sampleName);
|
||||
Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
|
||||
if ( evalSetBySample == null ) {
|
||||
evalSetBySample = new HashSet<VariantContext>(1);
|
||||
evalSetBySample.add(null);
|
||||
|
|
@ -330,16 +332,14 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
for ( VariantContext eval : evalSetBySample ) {
|
||||
// deal with ancestral alleles if requested
|
||||
if ( eval != null && aastr != null ) {
|
||||
HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
|
||||
newAts.put("ANCESTRALALLELE", aastr);
|
||||
eval = VariantContext.modifyAttributes(eval, newAts);
|
||||
eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
|
||||
}
|
||||
|
||||
// for each comp track
|
||||
for ( final RodBinding<VariantContext> compRod : comps ) {
|
||||
// no sample stratification for comps
|
||||
final HashMap<String, Set<VariantContext>> compSetHash = compVCs.get(compRod);
|
||||
final Set<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet<VariantContext>(0) : compVCs.get(compRod).values().iterator().next();
|
||||
final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
|
||||
final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next();
|
||||
|
||||
// find the comp
|
||||
final VariantContext comp = findMatchingComp(eval, compSet);
|
||||
|
|
@ -383,7 +383,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
return null;
|
||||
}
|
||||
|
||||
private VariantContext findMatchingComp(final VariantContext eval, final Set<VariantContext> comps) {
|
||||
private VariantContext findMatchingComp(final VariantContext eval, final Collection<VariantContext> comps) {
|
||||
// if no comps, return null
|
||||
if ( comps == null || comps.isEmpty() )
|
||||
return null;
|
||||
|
|
@ -448,20 +448,18 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
TableType t = (TableType) field.get(ve);
|
||||
|
||||
String subTableName = ve.getClass().getSimpleName() + "." + field.getName();
|
||||
String subTableDesc = datamap.get(field).description();
|
||||
final DataPoint dataPointAnn = datamap.get(field);
|
||||
|
||||
GATKReportTable table;
|
||||
if (!report.hasTable(subTableName)) {
|
||||
report.addTable(subTableName, subTableDesc);
|
||||
report.addTable(subTableName, dataPointAnn.description());
|
||||
table = report.getTable(subTableName);
|
||||
|
||||
table.addPrimaryKey("entry", false);
|
||||
table.addColumn(subTableName, subTableName);
|
||||
|
||||
for ( VariantStratifier vs : stratificationObjects ) {
|
||||
String columnName = vs.getClass().getSimpleName();
|
||||
|
||||
table.addColumn(columnName, "unknown");
|
||||
table.addColumn(vs.getName(), "unknown");
|
||||
}
|
||||
|
||||
table.addColumn("row", "unknown");
|
||||
|
|
@ -485,9 +483,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
String r = (String) t.getRowKeys()[row];
|
||||
|
||||
for ( VariantStratifier vs : stratificationObjects ) {
|
||||
String columnName = vs.getClass().getSimpleName();
|
||||
|
||||
table.set(stateKey.toString() + r, columnName, stateKey.get(vs.getClass().getSimpleName()));
|
||||
final String columnName = vs.getName();
|
||||
table.set(stateKey.toString() + r, columnName, stateKey.get(columnName));
|
||||
}
|
||||
|
||||
for (int col = 0; col < t.getColumnKeys().length; col++) {
|
||||
|
|
@ -508,9 +505,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
GATKReportTable table = report.getTable(ve.getClass().getSimpleName());
|
||||
|
||||
for ( VariantStratifier vs : stratificationObjects ) {
|
||||
String columnName = vs.getClass().getSimpleName();
|
||||
String columnName = vs.getName();
|
||||
|
||||
table.set(stateKey.toString(), columnName, stateKey.get(vs.getClass().getSimpleName()));
|
||||
table.set(stateKey.toString(), columnName, stateKey.get(vs.getName()));
|
||||
}
|
||||
|
||||
table.set(stateKey.toString(), field.getName(), field.get(ve));
|
||||
|
|
|
|||
|
|
@ -1,35 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
||||
class NewCompEvalGenotypes {
|
||||
private GenomeLoc loc;
|
||||
private Genotype compGt;
|
||||
private Genotype evalGt;
|
||||
|
||||
public NewCompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) {
|
||||
this.loc = loc;
|
||||
this.compGt = compGt;
|
||||
this.evalGt = evalGt;
|
||||
}
|
||||
|
||||
public GenomeLoc getLocus() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public Genotype getCompGenotpye() {
|
||||
return compGt;
|
||||
}
|
||||
public Genotype getEvalGenotype() {
|
||||
return evalGt;
|
||||
}
|
||||
|
||||
public void setCompGenotype(Genotype compGt) {
|
||||
this.compGt = compGt;
|
||||
}
|
||||
|
||||
public void setEvalGenotype(Genotype evalGt) {
|
||||
this.evalGt = evalGt;
|
||||
}
|
||||
}
|
||||
|
|
@ -28,13 +28,13 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
@DataPoint(description = "number of eval sites at comp sites")
|
||||
long nVariantsAtComp = 0;
|
||||
|
||||
@DataPoint(description = "percentage of eval sites at comp sites")
|
||||
@DataPoint(description = "percentage of eval sites at comp sites", format = "%.2f" )
|
||||
double compRate = 0.0;
|
||||
|
||||
@DataPoint(description = "number of concordant sites")
|
||||
long nConcordant = 0;
|
||||
|
||||
@DataPoint(description = "the concordance rate")
|
||||
@DataPoint(description = "the concordance rate", format = "%.2f")
|
||||
double concordantRate = 0.0;
|
||||
|
||||
public int getComparisonOrder() {
|
||||
|
|
@ -72,7 +72,7 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
|
|||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
boolean evalIsGood = eval != null && eval.isPolymorphic();
|
||||
boolean evalIsGood = eval != null && eval.isPolymorphicInSamples();
|
||||
boolean compIsGood = comp != null && comp.isNotFiltered();
|
||||
|
||||
if (evalIsGood) nEvalVariants++; // count the number of eval events
|
||||
|
|
|
|||
|
|
@ -62,17 +62,17 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
public long nHomDerived = 0;
|
||||
|
||||
// calculations that get set in the finalizeEvaluation method
|
||||
@DataPoint(description = "heterozygosity per locus rate")
|
||||
@DataPoint(description = "heterozygosity per locus rate", format = "%.2e")
|
||||
public double heterozygosity = 0;
|
||||
@DataPoint(description = "heterozygosity per base pair")
|
||||
@DataPoint(description = "heterozygosity per base pair", format = "%.2f")
|
||||
public double heterozygosityPerBp = 0;
|
||||
@DataPoint(description = "heterozygosity to homozygosity ratio")
|
||||
@DataPoint(description = "heterozygosity to homozygosity ratio", format = "%.2f")
|
||||
public double hetHomRatio = 0;
|
||||
@DataPoint(description = "indel rate (insertion count + deletion count)")
|
||||
@DataPoint(description = "indel rate (insertion count + deletion count)", format = "%.2e")
|
||||
public double indelRate = 0;
|
||||
@DataPoint(description = "indel rate per base pair")
|
||||
@DataPoint(description = "indel rate per base pair", format = "%.2f")
|
||||
public double indelRatePerBp = 0;
|
||||
@DataPoint(description = "deletion to insertion ratio")
|
||||
@DataPoint(description = "deletion to insertion ratio", format = "%.2f")
|
||||
public double deletionInsertionRatio = 0;
|
||||
|
||||
private double perLocusRate(long n) {
|
||||
|
|
@ -103,7 +103,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
|
||||
// added in a proxy check for monomorphic status here.
|
||||
// Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call.
|
||||
if ( vc1.isMonomorphic() ) {
|
||||
if ( vc1.isMonomorphicInSamples() ) {
|
||||
nRefLoci++;
|
||||
} else {
|
||||
switch (vc1.getType()) {
|
||||
|
|
@ -157,8 +157,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
// A C A
|
||||
// A C C
|
||||
|
||||
for (Genotype g : vc1.getGenotypes().values()) {
|
||||
String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null;
|
||||
for (final Genotype g : vc1.getGenotypes()) {
|
||||
final String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null;
|
||||
|
||||
switch (g.getType()) {
|
||||
case NO_CALL:
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue