diff --git a/ivy.xml b/ivy.xml
index 96c1de844..ee24bc367 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -76,7 +76,7 @@
-
+
diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java
new file mode 100644
index 000000000..5005b6265
--- /dev/null
+++ b/public/java/src/net/sf/samtools/BAMFileReader.java
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package net.sf.samtools;
+
+
+import net.sf.samtools.util.*;
+import net.sf.samtools.SAMFileReader.ValidationStringency;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * Internal class for reading and querying BAM files.
+ */
+class BAMFileReader extends SAMFileReader.ReaderImplementation {
+ // True if reading from a File rather than an InputStream
+ private boolean mIsSeekable = false;
+
+ // For converting bytes into other primitive types
+ private BinaryCodec mStream = null;
+
+ // Underlying compressed data stream.
+ private final BAMInputStream mInputStream;
+ private SAMFileHeader mFileHeader = null;
+
+ // Populated if the file is seekable and an index exists
+ private File mIndexFile;
+ private BAMIndex mIndex = null;
+ private long mFirstRecordPointer = 0;
+ private CloseableIterator mCurrentIterator = null;
+
+ // If true, all SAMRecords are fully decoded as they are read.
+ private final boolean eagerDecode;
+
+ // For error-checking.
+ private ValidationStringency mValidationStringency;
+
+ // For creating BAMRecords
+ private SAMRecordFactory samRecordFactory;
+
+ /**
+ * Use the caching index reader implementation rather than the disk-hit-per-file model.
+ */
+ private boolean mEnableIndexCaching = false;
+
+ /**
+ * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
+ */
+ private boolean mEnableIndexMemoryMapping = true;
+
+ /**
+ * Add information about the origin (reader and position) to SAM records.
+ */
+ private SAMFileReader mFileReader = null;
+
+ /**
+  * Prepare to read BAM from a stream (not seekable).
+  *
+  * @param stream source of bytes; used directly when it already is a BAMInputStream,
+  *               otherwise wrapped in a BlockCompressedInputStream.
+  * @param indexFile index file to record for this reader; may be null, in which case
+  *                  hasIndex() reports false.
+  * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+  * @param validationStringency controls how to handle invalid reads or header lines.
+  * @param factory factory used for creating the records that are read.
+  * @throws IOException if the BAM header cannot be read, e.g. the magic bytes do not match.
+  */
+ BAMFileReader(final InputStream stream,
+               final File indexFile,
+               final boolean eagerDecode,
+               final ValidationStringency validationStringency,
+               final SAMRecordFactory factory) throws IOException {
+     this.mIndexFile = indexFile;
+     this.mIsSeekable = false;
+
+     // Reuse the caller's stream when it already implements the BAM stream contract.
+     if (stream instanceof BAMInputStream) {
+         this.mInputStream = (BAMInputStream) stream;
+     } else {
+         this.mInputStream = new BlockCompressedInputStream(stream);
+     }
+     this.mStream = new BinaryCodec(new DataInputStream((InputStream) this.mInputStream));
+
+     this.eagerDecode = eagerDecode;
+     this.mValidationStringency = validationStringency;
+     this.samRecordFactory = factory;
+
+     // A bare stream has no source name, so header errors are reported against null.
+     readHeader(null);
+ }
+
+ /**
+  * Prepare to read BAM from a file (seekable).
+  *
+  * @param file source of bytes.
+  * @param indexFile index file to use; when null, the standard index locations next to
+  *                  {@code file} are probed via findIndexFile.
+  * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+  * @param validationStringency controls how to handle invalid reads or header lines.
+  * @param factory factory used for creating the records that are read.
+  * @throws IOException if the file cannot be opened or its BAM header cannot be read.
+  */
+ BAMFileReader(final File file,
+               final File indexFile,
+               final boolean eagerDecode,
+               final ValidationStringency validationStringency,
+               final SAMRecordFactory factory) throws IOException {
+     this(new BlockCompressedInputStream(file),
+          indexFile == null ? findIndexFile(file) : indexFile,
+          eagerDecode,
+          file.getAbsolutePath(),
+          validationStringency,
+          factory);
+
+     // Nothing to compare against when no index is known.
+     if (mIndexFile == null) {
+         return;
+     }
+     // An index older than its BAM is suspicious but not fatal: warn and carry on.
+     if (mIndexFile.lastModified() < file.lastModified()) {
+         System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
+                 " is older than BAM " + file.getAbsolutePath());
+     }
+ }
+
+ /**
+  * Prepare to read BAM from a seekable stream.
+  *
+  * @param strm source of bytes; used directly when it already is a BAMInputStream,
+  *             otherwise wrapped in a BlockCompressedInputStream.
+  * @param indexFile index file to record for this reader; may be null.
+  * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+  * @param validationStringency controls how to handle invalid reads or header lines.
+  * @param factory factory used for creating the records that are read.
+  * @throws IOException if the BAM header cannot be read.
+  */
+ BAMFileReader(final SeekableStream strm,
+               final File indexFile,
+               final boolean eagerDecode,
+               final ValidationStringency validationStringency,
+               final SAMRecordFactory factory) throws IOException {
+     this(!(strm instanceof BAMInputStream)
+              ? new BlockCompressedInputStream(strm)
+              : (BAMInputStream) strm,
+          indexFile,
+          eagerDecode,
+          strm.getSource(), // only used when reporting header errors
+          validationStringency,
+          factory);
+ }
+
+ /**
+  * Shared initialisation for the seekable constructors: stores the configuration, reads the
+  * header and remembers where the first record starts so iteration can be restarted.
+  *
+  * @param inputStream seekable BAM stream positioned at the start of the data.
+  * @param indexFile index file to record for this reader; may be null.
+  * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+  * @param source name of the data source; used only for reporting errors.
+  * @param validationStringency controls how to handle invalid reads or header lines.
+  * @param factory factory used for creating the records that are read.
+  * @throws IOException if the BAM header cannot be read.
+  */
+ private BAMFileReader(final BAMInputStream inputStream,
+                       final File indexFile,
+                       final boolean eagerDecode,
+                       final String source,
+                       final ValidationStringency validationStringency,
+                       final SAMRecordFactory factory) throws IOException {
+     this.mInputStream = inputStream;
+     this.mStream = new BinaryCodec(new DataInputStream((InputStream) inputStream));
+     this.mIndexFile = indexFile;
+     this.mIsSeekable = true;
+     this.eagerDecode = eagerDecode;
+     this.mValidationStringency = validationStringency;
+     this.samRecordFactory = factory;
+
+     readHeader(source);
+     // The stream now sits just past the header, i.e. on the first record.
+     this.mFirstRecordPointer = inputStream.getFilePointer();
+ }
+
+ /**
+  * Turns recording of file-source information on or off for records read from now on.
+  *
+  * @param reader the reader to record as the origin of each record; ignored when
+  *               {@code enabled} is false.
+  * @param enabled true to write source information into each SAMRecord, false to stop.
+  */
+ void enableFileSource(final SAMFileReader reader, final boolean enabled) {
+     if (enabled) {
+         this.mFileReader = reader;
+     } else {
+         this.mFileReader = null;
+     }
+ }
+
+ /**
+  * Selects between the caching and the disk-based index reader. Must be called before
+  * the index is first loaded.
+  *
+  * @param enabled true to use the caching version of the index reader.
+  * @throws SAMException if the index has already been loaded.
+  */
+ public void enableIndexCaching(final boolean enabled) {
+     final boolean indexAlreadyLoaded = (mIndex != null);
+     if (indexAlreadyLoaded) {
+         throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
+     }
+     this.mEnableIndexCaching = enabled;
+ }
+
+ /**
+  * Chooses whether index files are accessed through memory mapping (the default) or
+  * through regular I/O. Regular I/O is slower but more scalable when accessing large
+  * numbers of BAM files sequentially. Must be called before the index is first loaded.
+  *
+  * @param enabled true to use memory mapping, false to use regular I/O.
+  * @throws SAMException if the index has already been loaded.
+  */
+ public void enableIndexMemoryMapping(final boolean enabled) {
+     final boolean indexAlreadyLoaded = (mIndex != null);
+     if (indexAlreadyLoaded) {
+         throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
+     }
+     this.mEnableIndexMemoryMapping = enabled;
+ }
+
+ /**
+  * Forwards the CRC-checking switch to the underlying compressed input stream.
+  *
+  * @param enabled true to have the stream check CRCs.
+  */
+ @Override
+ void enableCrcChecking(final boolean enabled) {
+     mInputStream.setCheckCrcs(enabled);
+ }
+
+ /**
+  * Replaces the factory used for creating records; iterators created afterwards pick it up.
+  *
+  * @param factory the record factory to use from now on.
+  */
+ @Override
+ void setSAMRecordFactory(final SAMRecordFactory factory) {
+     samRecordFactory = factory;
+ }
+
+ /**
+  * Reports whether an index file is known for this BAM. Only the presence of the index
+  * file reference is checked; the index itself is not loaded here.
+  *
+  * @return true if this reader has an index file, false otherwise.
+  */
+ public boolean hasIndex() {
+     final boolean indexFileKnown = mIndexFile != null;
+     return indexFileKnown;
+ }
+
+ /**
+  * Returns the index for this BAM file, loading it on first use. The implementation is
+  * the caching reader when index caching is enabled and the disk-based reader otherwise.
+  *
+  * @return the lazily created index.
+  * @throws SAMException if no index file is available for this BAM file.
+  */
+ public BAMIndex getIndex() {
+     if (mIndexFile == null) {
+         throw new SAMException("No index is available for this BAM file.");
+     }
+     if (mIndex != null) {
+         return mIndex;
+     }
+
+     if (mEnableIndexCaching) {
+         mIndex = new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
+     } else {
+         mIndex = new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
+     }
+     return mIndex;
+ }
+
+ /**
+  * Closes the record stream and, if it was loaded, the index, then drops the references
+  * to the stream, header and index so that later iterator requests report a closed reader.
+  *
+  * The index is closed and the references are cleared even when closing the stream
+  * throws; the exception from the failing close still propagates to the caller.
+  */
+ void close() {
+     try {
+         if (mStream != null) {
+             mStream.close();
+         }
+     } finally {
+         try {
+             // Release the index even if the stream failed to close.
+             if (mIndex != null) {
+                 mIndex.close();
+             }
+         } finally {
+             mStream = null;
+             mFileHeader = null;
+             mIndex = null;
+         }
+     }
+ }
+
+ /**
+  * @return the header read when this reader was constructed; null once close() has run.
+  */
+ SAMFileHeader getFileHeader() {
+     return this.mFileHeader;
+ }
+
+ /**
+  * Set error-checking level for subsequent SAMRecord reads.
+  *
+  * @param validationStringency the stringency applied to records read from now on.
+  */
+ void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
+     mValidationStringency = validationStringency;
+ }
+
+ /**
+  * @return the error-checking level currently applied to records as they are read.
+  */
+ SAMFileReader.ValidationStringency getValidationStringency() {
+     return mValidationStringency;
+ }
+
+ /**
+  * Prepare to iterate through the SAMRecords in file order.
+  * Only a single iterator on a BAM file can be extant at a time: an iterator obtained from
+  * getIterator() or a query method must be closed before another one is requested.
+  * For a seekable source iteration restarts at the first record. For a non-seekable source
+  * a second call continues where the previous iterator left off, which is the best that
+  * can be done in that situation.
+  *
+  * @return iterator over the records in file order.
+  * @throws IllegalStateException if the reader is closed or an iteration is in progress.
+  * @throws RuntimeException if seeking back to the first record fails.
+  */
+ CloseableIterator getIterator() {
+     if (mStream == null) {
+         throw new IllegalStateException("File reader is closed");
+     }
+     if (mCurrentIterator != null) {
+         throw new IllegalStateException("Iteration in progress");
+     }
+
+     if (mIsSeekable) {
+         // Rewind so that every iteration over a seekable source starts at the first record.
+         try {
+             mInputStream.seek(mFirstRecordPointer);
+         } catch (IOException seekFailure) {
+             throw new RuntimeException(seekFailure.getMessage(), seekFailure);
+         }
+     }
+
+     final BAMFileIterator iterator = new BAMFileIterator();
+     mCurrentIterator = iterator;
+     return iterator;
+ }
+
+ /**
+  * Prepare to iterate through the SAMRecords stored within the given file span.
+  *
+  * @param chunks the span to iterate over; must be a BAMFileSpan.
+  * @return iterator over the records in the span.
+  * @throws IllegalStateException if the reader is closed, an iteration is in progress,
+  *                               or the span is not a BAMFileSpan.
+  */
+ @Override
+ CloseableIterator getIterator(final SAMFileSpan chunks) {
+     if (mStream == null) {
+         throw new IllegalStateException("File reader is closed");
+     }
+     if (mCurrentIterator != null) {
+         throw new IllegalStateException("Iteration in progress");
+     }
+     if (!(chunks instanceof BAMFileSpan)) {
+         throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
+     }
+
+     // Flatten the span into chunk boundaries and iterate over exactly those.
+     final long[] chunkBoundaries = ((BAMFileSpan) chunks).toCoordinateArray();
+     final BAMFileIndexIterator iterator = new BAMFileIndexIterator(chunkBoundaries);
+     mCurrentIterator = iterator;
+     return iterator;
+ }
+
+ /**
+  * Gets an unbounded pointer to the first record in the BAM file. The reader does not
+  * necessarily know where the file ends, so the right-hand bound is Long.MAX_VALUE rather
+  * than the true end of file; it is therefore guaranteed to lie after the last read.
+  *
+  * @return a span from the first record to Long.MAX_VALUE.
+  */
+ @Override
+ SAMFileSpan getFilePointerSpanningReads() {
+     final Chunk firstRecordOnwards = new Chunk(mFirstRecordPointer, Long.MAX_VALUE);
+     return new BAMFileSpan(firstRecordOnwards);
+ }
+
+ /**
+  * Prepare to iterate through the SAMRecords that match the given interval.
+  * Only a single iterator on a BAM file can be extant at a time; the previous one must be
+  * closed before calling any of the methods that return an iterator.
+  *
+  * An unmapped SAMRecord may still carry a reference name and an alignment start for
+  * sorting purposes (typically the coordinate of its mate); such a record is returned
+  * when that coordinate matches the interval.
+  *
+  * This method is not necessarily efficient in terms of disk I/O. The index does not have
+  * perfect resolution, so some records may be read and then discarded because they do
+  * not match the interval.
+  *
+  * @param sequence reference sequence sought.
+  * @param start desired records must overlap or be contained in the interval given by
+  *              start and end. A value of zero implies the start of the reference sequence.
+  * @param end a value of zero implies the end of the reference sequence.
+  * @param contained if true, a record's alignment must lie completely inside the interval;
+  *                  if false, it only needs to overlap the interval.
+  * @return iterator for the matching SAMRecords.
+  * @throws IllegalStateException if the reader is closed or an iteration is in progress.
+  * @throws UnsupportedOperationException if the reader was opened on a non-seekable stream.
+  */
+ CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) {
+     if (mStream == null) {
+         throw new IllegalStateException("File reader is closed");
+     }
+     if (mCurrentIterator != null) {
+         throw new IllegalStateException("Iteration in progress");
+     }
+     if (!mIsSeekable) {
+         throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+     }
+
+     final QueryType queryType;
+     if (contained) {
+         queryType = QueryType.CONTAINED;
+     } else {
+         queryType = QueryType.OVERLAPPING;
+     }
+     mCurrentIterator = createIndexIterator(sequence, start, end, queryType);
+     return mCurrentIterator;
+ }
+
+ /**
+  * Prepare to iterate through the SAMRecords with the given alignment start.
+  * Only a single iterator on a BAM file can be extant at a time; the previous one must be
+  * closed before calling any of the methods that return an iterator.
+  *
+  * An unmapped SAMRecord may still carry a reference name and an alignment start for
+  * sorting purposes (typically the coordinate of its mate); such a record is returned
+  * when that coordinate matches the requested start.
+  *
+  * This method is not necessarily efficient in terms of disk I/O. The index does not have
+  * perfect resolution, so some records may be read and then discarded because they do
+  * not match.
+  *
+  * @param sequence reference sequence sought.
+  * @param start alignment start sought.
+  * @return iterator for the matching SAMRecords.
+  * @throws IllegalStateException if the reader is closed or an iteration is in progress.
+  * @throws UnsupportedOperationException if the reader was opened on a non-seekable stream.
+  */
+ CloseableIterator queryAlignmentStart(final String sequence, final int start) {
+     if (mStream == null) {
+         throw new IllegalStateException("File reader is closed");
+     }
+     if (mCurrentIterator != null) {
+         throw new IllegalStateException("Iteration in progress");
+     }
+     if (!mIsSeekable) {
+         throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+     }
+
+     // The end coordinate is passed as -1 for a starting-at query.
+     final int unusedEnd = -1;
+     mCurrentIterator = createIndexIterator(sequence, start, unusedEnd, QueryType.STARTING_AT);
+     return mCurrentIterator;
+ }
+
+ /**
+  * Prepare to iterate through the records that have no reference sequence. The stream is
+  * positioned at the start of the last linear bin reported by the index, or at the first
+  * record when the index reports none, and mapped records are then skipped.
+  *
+  * @return iterator over the unmapped records.
+  * @throws IllegalStateException if the reader is closed or an iteration is in progress.
+  * @throws UnsupportedOperationException if the reader was opened on a non-seekable stream.
+  * @throws RuntimeException if seeking fails.
+  */
+ public CloseableIterator queryUnmapped() {
+     if (mStream == null) {
+         throw new IllegalStateException("File reader is closed");
+     }
+     if (mCurrentIterator != null) {
+         throw new IllegalStateException("Iteration in progress");
+     }
+     if (!mIsSeekable) {
+         throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+     }
+
+     try {
+         final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
+         // -1 means no mapped reads in file, so just start at the first read in file.
+         final long seekTarget = (startOfLastLinearBin == -1) ? mFirstRecordPointer : startOfLastLinearBin;
+         mInputStream.seek(seekTarget);
+
+         mCurrentIterator = new BAMFileIndexUnmappedIterator();
+         return mCurrentIterator;
+     } catch (IOException e) {
+         throw new RuntimeException("IOException seeking to unmapped reads", e);
+     }
+ }
+
+ /**
+  * Reads the header from the file or stream: the magic bytes, the text header and the
+  * binary sequence records. When the text header declares sequences, the binary records
+  * must agree with them in count, name and length; when it declares none, the binary
+  * records become the sequence dictionary.
+  *
+  * @param source name of the data source; used only for reporting errors.
+  * @throws IOException if the magic bytes do not identify a BAM file.
+  * @throws SAMFormatException if the text and binary sequence information disagree.
+  */
+ private void readHeader(final String source) throws IOException {
+     final byte[] magic = new byte[4];
+     mStream.readBytes(magic);
+     if (!Arrays.equals(magic, BAMFileConstants.BAM_MAGIC)) {
+         throw new IOException("Invalid BAM file header");
+     }
+
+     // Text header: a length-prefixed string decoded with the current stringency.
+     final int headerTextLength = mStream.readInt();
+     final String textHeader = mStream.readString(headerTextLength);
+     final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
+     headerCodec.setValidationStringency(mValidationStringency);
+     mFileHeader = headerCodec.decode(new StringLineReader(textHeader), source);
+
+     final int sequenceCount = mStream.readInt();
+     final int textSequenceCount = mFileHeader.getSequenceDictionary().size();
+
+     if (textSequenceCount == 0) {
+         // If only binary sequences are present, copy them into mFileHeader
+         final List sequences = new ArrayList(sequenceCount);
+         for (int i = 0; i < sequenceCount; i++) {
+             sequences.add(readSequenceRecord(source));
+         }
+         mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
+         return;
+     }
+
+     // It is allowed to have binary sequences but no text sequences, so only validate if both are present
+     if (sequenceCount != textSequenceCount) {
+         throw new SAMFormatException("Number of sequences in text header (" +
+                 textSequenceCount +
+                 ") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
+     }
+     for (int i = 0; i < sequenceCount; i++) {
+         final SAMSequenceRecord fromBinary = readSequenceRecord(source);
+         final SAMSequenceRecord fromText = mFileHeader.getSequence(i);
+         if (!fromText.getSequenceName().equals(fromBinary.getSequenceName())) {
+             throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
+                     source);
+         }
+         if (fromText.getSequenceLength() != fromBinary.getSequenceLength()) {
+             throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
+                     source);
+         }
+     }
+ }
+
+ /**
+  * Reads a single binary sequence record from the file or stream: a length-prefixed,
+  * null-terminated name followed by the sequence length.
+  *
+  * @param source name of the data source; used only for reporting errors.
+  * @return the sequence record, with its name passed through truncateSequenceName.
+  * @throws SAMFormatException if the stored name length leaves no room for a name.
+  */
+ private SAMSequenceRecord readSequenceRecord(final String source) {
+     final int nameLength = mStream.readInt();
+     // The stored length includes the terminator, so anything below 2 means an empty name.
+     if (nameLength < 2) {
+         throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
+     }
+     final String rawName = mStream.readString(nameLength - 1);
+     // Skip the null terminator
+     mStream.readByte();
+     final int sequenceLength = mStream.readInt();
+
+     final String truncatedName = SAMSequenceRecord.truncateSequenceName(rawName);
+     return new SAMSequenceRecord(truncatedName, sequenceLength);
+ }
+
+ /**
+  * Iterator for non-indexed sequential iteration through all SAMRecords in file.
+  * Iteration starts wherever the file position is when the iterator is constructed.
+  * The next record is always read ahead, so hasNext() is a simple null check.
+  */
+ private class BAMFileIterator implements CloseableIterator {
+     // Read-ahead record; null once the input is exhausted.
+     private SAMRecord mNextRecord = null;
+     private final BAMRecordCodec bamRecordCodec;
+     // Number of records read so far; passed along when reporting validation errors.
+     private long samRecordIndex = 0;
+
+     /** Creates the iterator and immediately reads ahead the first record. */
+     BAMFileIterator() {
+         this(true);
+     }
+
+     /**
+      * @param advance whether to read ahead the first record now; subclasses pass false so
+      *                they can finish their own setup before advancing.
+      */
+     BAMFileIterator(final boolean advance) {
+         bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
+         bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
+
+         if (advance) {
+             advance();
+         }
+     }
+
+     /**
+      * Releases the reader's single-iterator slot.
+      *
+      * @throws IllegalStateException if a different iterator is the current one.
+      */
+     public void close() {
+         final boolean anotherIteratorIsCurrent = mCurrentIterator != null && mCurrentIterator != this;
+         if (anotherIteratorIsCurrent) {
+             throw new IllegalStateException("Attempt to close non-current iterator");
+         }
+         mCurrentIterator = null;
+     }
+
+     public boolean hasNext() {
+         return mNextRecord != null;
+     }
+
+     /**
+      * @return the read-ahead record (null when the input is exhausted), after reading
+      *         ahead the one that follows it.
+      */
+     public SAMRecord next() {
+         final SAMRecord current = mNextRecord;
+         advance();
+         return current;
+     }
+
+     public void remove() {
+         throw new UnsupportedOperationException("Not supported: remove");
+     }
+
+     /**
+      * Reads ahead the next record, applies the reader's validation stringency to it and,
+      * when eager decoding is on, decodes it fully.
+      */
+     void advance() {
+         try {
+             mNextRecord = getNextRecord();
+             if (mNextRecord == null) {
+                 return;
+             }
+
+             ++samRecordIndex;
+             // Because some decoding is done lazily, the record needs to remember the validation stringency.
+             mNextRecord.setValidationStringency(mValidationStringency);
+
+             if (mValidationStringency != ValidationStringency.SILENT) {
+                 final List validationErrors = mNextRecord.isValid();
+                 SAMUtils.processValidationErrors(validationErrors,
+                         samRecordIndex, BAMFileReader.this.getValidationStringency());
+             }
+
+             if (eagerDecode) {
+                 mNextRecord.eagerDecode();
+             }
+         } catch (IOException exc) {
+             throw new RuntimeException(exc.getMessage(), exc);
+         }
+     }
+
+     /**
+      * Read the next record from the input stream. When file-source tracking is enabled,
+      * the record is tagged with the reader and the file span it was decoded from.
+      *
+      * @return the decoded record, or null if the codec produced none.
+      */
+     SAMRecord getNextRecord() throws IOException {
+         final long startCoordinate = mInputStream.getFilePointer();
+         final SAMRecord next = bamRecordCodec.decode();
+         final long stopCoordinate = mInputStream.getFilePointer();
+
+         if (mFileReader != null && next != null) {
+             final BAMFileSpan recordSpan = new BAMFileSpan(new Chunk(startCoordinate, stopCoordinate));
+             next.setFileSource(new SAMFileSource(mFileReader, recordSpan));
+         }
+         return next;
+     }
+
+     /**
+      * @return The record that will be returned by the next call to next()
+      */
+     protected SAMRecord peek() {
+         return mNextRecord;
+     }
+ }
+
+ /**
+  * Prepare to iterate through SAMRecords matching the target interval.
+  *
+  * @param sequence desired reference sequence.
+  * @param start 1-based start of target interval, inclusive.
+  * @param end 1-based end of target interval, inclusive.
+  * @param queryType contained, overlapping, or starting-at query.
+  * @return iterator that reads the chunks reported by the index and filters out records
+  *         that do not satisfy the query.
+  */
+ private CloseableIterator createIndexIterator(final String sequence,
+                                               final int start,
+                                               final int end,
+                                               final QueryType queryType) {
+     // Hit the index to determine the chunk boundaries for the required data.
+     // An unknown sequence leaves the boundaries null.
+     long[] chunkBoundaries = null;
+     final int referenceIndex = getFileHeader().getSequenceIndex(sequence);
+     if (referenceIndex != -1) {
+         final BAMFileSpan fileSpan = getIndex().getSpanOverlapping(referenceIndex, start, end);
+         if (fileSpan != null) {
+             chunkBoundaries = fileSpan.toCoordinateArray();
+         }
+     }
+
+     // Create an iterator over the above chunk boundaries.
+     final BAMFileIndexIterator chunkIterator = new BAMFileIndexIterator(chunkBoundaries);
+
+     // Add some preprocessing filters for edge-case reads that don't fit into this
+     // query type.
+     return new BAMQueryFilteringIterator(chunkIterator, sequence, start, end, queryType);
+ }
+
+ /**
+  * How a record must relate to the query interval to be returned.
+  */
+ enum QueryType {
+     /** The alignment must lie completely inside the interval. */
+     CONTAINED,
+     /** The alignment must overlap the interval. */
+     OVERLAPPING,
+     /** The alignment must start exactly at the interval start. */
+     STARTING_AT
+ }
+
+ /**
+  * Look for BAM index file according to standard naming convention: for foo.bam, first
+  * foo + BAMIndex.BAMIndexSuffix, then foo.bam.bai, both in the directory of the BAM file.
+  *
+  * @param dataFile BAM file name.
+  * @return Index file name, or null if not found.
+  */
+ private static File findIndexFile(final File dataFile) {
+     final String bamExtension = ".bam";
+     final String fileName = dataFile.getName();
+
+     // If input is foo.bam, look for foo.bai
+     if (fileName.endsWith(bamExtension)) {
+         final String baseName = fileName.substring(0, fileName.length() - bamExtension.length());
+         final String bai = baseName + BAMIndex.BAMIndexSuffix;
+         final File replacedExtension = new File(dataFile.getParent(), bai);
+         if (replacedExtension.exists()) {
+             return replacedExtension;
+         }
+     }
+
+     // If foo.bai doesn't exist look for foo.bam.bai
+     final File appendedExtension = new File(dataFile.getParent(), dataFile.getName() + ".bai");
+     return appendedExtension.exists() ? appendedExtension : null;
+ }
+
+ /**
+  * Iterator over the records stored in a list of chunks. The chunk list is a flat array of
+  * (start, end) file-pointer pairs that are visited in order.
+  */
+ private class BAMFileIndexIterator extends BAMFileIterator {
+
+     // Flat array of chunk boundaries: start0, end0, start1, end1, ...
+     private long[] mFilePointers = null;
+     // Position of the next unread boundary pair in mFilePointers.
+     private int mFilePointerIndex = 0;
+     // End of the chunk being read; -1 forces a move to the first chunk on first use.
+     private long mFilePointerLimit = -1;
+
+     /**
+      * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
+      * @param filePointers the block / offset combination, stored in chunk format; null yields
+      *                     an empty iteration.
+      */
+     BAMFileIndexIterator(final long[] filePointers) {
+         super(false); // delay advance() until after construction
+         this.mFilePointers = filePointers;
+         advance();
+     }
+
+     /**
+      * Moves on to the next chunk whenever the stream has reached the end of the current
+      * one, then decodes the next record.
+      *
+      * @return the next record, or null once every chunk has been consumed.
+      */
+     SAMRecord getNextRecord() throws IOException {
+         // Advance to next file block if necessary
+         while (mInputStream.getFilePointer() >= mFilePointerLimit) {
+             final boolean noChunksLeft = mFilePointers == null || mFilePointerIndex >= mFilePointers.length;
+             if (noChunksLeft) {
+                 return null;
+             }
+             final long chunkStart = mFilePointers[mFilePointerIndex++];
+             final long chunkEnd = mFilePointers[mFilePointerIndex++];
+             mInputStream.seek(chunkStart);
+             mFilePointerLimit = chunkEnd;
+         }
+         // Pull next record from stream
+         return super.getNextRecord();
+     }
+ }
+
+ /**
+  * A decorating iterator that filters out records that are outside the bounds of the
+  * given query parameters. The wrapped iterator is expected to deliver records in
+  * coordinate order: iteration stops at the first record that lies beyond the query
+  * region or on a later (or no) reference sequence.
+  */
+ private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
+     /**
+      * The wrapped iterator. Typed so that next() yields SAMRecord without a cast.
+      */
+     private final CloseableIterator<SAMRecord> wrappedIterator;
+
+     /**
+      * The next record to be returned. Will be null if no such record exists.
+      */
+     private SAMRecord mNextRecord;
+
+     private final int mReferenceIndex;
+     private final int mRegionStart;
+     private final int mRegionEnd;
+     private final QueryType mQueryType;
+
+     /**
+      * @param iterator source of candidate records.
+      * @param sequence reference sequence sought; resolved to an index through the file header.
+      * @param start start of the query region; zero means the start of the sequence.
+      * @param end end of the query region; zero or less means the end of the sequence.
+      *            Ignored for starting-at queries, where the region end equals the start.
+      * @param queryType contained, overlapping, or starting-at query.
+      */
+     public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator, final String sequence, final int start, final int end, final QueryType queryType) {
+         this.wrappedIterator = iterator;
+         final SAMFileHeader fileHeader = getFileHeader();
+         mReferenceIndex = fileHeader.getSequenceIndex(sequence);
+         mRegionStart = start;
+         if (queryType == QueryType.STARTING_AT) {
+             mRegionEnd = mRegionStart;
+         } else {
+             mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
+         }
+         mQueryType = queryType;
+         mNextRecord = advance();
+     }
+
+     /**
+      * Returns true if a next element exists; false otherwise.
+      */
+     @Override
+     public boolean hasNext() {
+         return mNextRecord != null;
+     }
+
+     /**
+      * Gets the next record from the given iterator.
+      * @return The next SAM record in the iterator.
+      * @throws NoSuchElementException if no matching record remains.
+      */
+     @Override
+     public SAMRecord next() {
+         if (!hasNext()) {
+             throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
+         }
+         final SAMRecord currentRead = mNextRecord;
+         mNextRecord = advance();
+         return currentRead;
+     }
+
+     /**
+      * Closes down the existing iterator by releasing the reader's single-iterator slot.
+      * Calling close() again while no iterator is current is a no-op, matching
+      * BAMFileIterator.close().
+      *
+      * @throws IllegalStateException if a different iterator is the current one.
+      */
+     @Override
+     public void close() {
+         if (mCurrentIterator != null && this != mCurrentIterator) {
+             throw new IllegalStateException("Attempt to close non-current iterator");
+         }
+         mCurrentIterator = null;
+     }
+
+     /**
+      * @throws UnsupportedOperationException always.
+      */
+     @Override
+     public void remove() {
+         throw new UnsupportedOperationException("Not supported: remove");
+     }
+
+     /**
+      * Pulls records from the wrapped iterator until one satisfies the query.
+      *
+      * @return the next matching record, or null when the wrapped iterator is exhausted or
+      *         has moved past the query region.
+      */
+     SAMRecord advance() {
+         while (true) {
+             // Pull next record from stream
+             if (!wrappedIterator.hasNext()) {
+                 return null;
+             }
+
+             final SAMRecord record = wrappedIterator.next();
+             // If beyond the end of this reference sequence, end iteration
+             final int referenceIndex = record.getReferenceIndex();
+             if (referenceIndex != mReferenceIndex) {
+                 if (referenceIndex < 0 ||
+                         referenceIndex > mReferenceIndex) {
+                     return null;
+                 }
+                 // If before this reference sequence, continue
+                 continue;
+             }
+             if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
+                 // Quick exit to avoid expensive alignment end calculation
+                 return record;
+             }
+             final int alignmentStart = record.getAlignmentStart();
+             // If read is unmapped but has a coordinate, return it if the coordinate is within
+             // the query region, regardless of whether the mapped mate will be returned.
+             final int alignmentEnd;
+             if (mQueryType == QueryType.STARTING_AT) {
+                 // The end is not consulted for starting-at queries.
+                 alignmentEnd = -1;
+             } else {
+                 // NOTE(review): the end is compared against NO_ALIGNMENT_START; presumably the
+                 // same sentinel marks a missing alignment end -- confirm.
+                 final int recordEnd = record.getAlignmentEnd();
+                 alignmentEnd = (recordEnd != SAMRecord.NO_ALIGNMENT_START ? recordEnd : alignmentStart);
+             }
+
+             if (alignmentStart > mRegionEnd) {
+                 // If scanned beyond target region, end iteration
+                 return null;
+             }
+             // Filter for overlap with region
+             if (mQueryType == QueryType.CONTAINED) {
+                 if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
+                     return record;
+                 }
+             } else if (mQueryType == QueryType.OVERLAPPING) {
+                 if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
+                     return record;
+                 }
+             } else {
+                 if (alignmentStart == mRegionStart) {
+                     return record;
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * Iterator over the records without a reference sequence. Construction starts at the
+  * current file position and skips forward past every record that has a reference index.
+  */
+ private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
+     private BAMFileIndexUnmappedIterator() {
+         while (hasNext()) {
+             if (peek().getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+                 // First record without a reference: iteration starts here.
+                 break;
+             }
+             advance();
+         }
+     }
+ }
+
+}
diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
index 623f46291..4692c6671 100644
--- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
+++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
@@ -25,6 +25,7 @@
package net.sf.samtools;
import net.sf.picard.util.PeekableIterator;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
@@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
super();
}
+ /**
+  * Create a new GATKBAMFileSpan from an existing BAMFileSpan. Chunks that already are
+  * GATKChunks are added as they are; all others are wrapped in a new GATKChunk.
+  *
+  * @param sourceFileSpan the span to copy; must be a BAMFileSpan.
+  * @throws SAMException if {@code sourceFileSpan} is not a BAMFileSpan.
+  */
+ public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
+     if (!(sourceFileSpan instanceof BAMFileSpan)) {
+         throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
+     }
+
+     final BAMFileSpan bamSpan = (BAMFileSpan) sourceFileSpan;
+     for (Chunk chunk : bamSpan.getChunks()) {
+         // Kept as a Chunk-typed value so the same add(...) overload is chosen either way.
+         final Chunk gatkChunk;
+         if (chunk instanceof GATKChunk) {
+             gatkChunk = chunk;
+         } else {
+             gatkChunk = new GATKChunk(chunk);
+         }
+         add(gatkChunk);
+     }
+ }
+
/**
* Convenience constructor to construct a BAM file span from
* a single chunk.
diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java
index f590809e2..5d349e72e 100644
--- a/public/java/src/net/sf/samtools/GATKChunk.java
+++ b/public/java/src/net/sf/samtools/GATKChunk.java
@@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
super.setChunkEnd(value);
}
+ /**
+  * @return the chunk start with its low 16 bits shifted out (unsigned shift); presumably
+  *         the address of the compressed block in which the chunk starts.
+  */
+ public long getBlockStart() {
+     final long chunkStart = getChunkStart();
+     return chunkStart >>> 16;
+ }
+
+ /**
+  * @return the low 16 bits of the chunk start; presumably the offset of the chunk start
+  *         within its uncompressed block.
+  */
+ public int getBlockOffsetStart() {
+     final long lowBits = getChunkStart() & 0xFFFF;
+     return (int) lowBits;
+ }
+
+ /**
+  * @return the chunk end with its low 16 bits shifted out (unsigned shift); presumably
+  *         the address of the compressed block in which the chunk ends.
+  */
+ public long getBlockEnd() {
+     final long chunkEnd = getChunkEnd();
+     return chunkEnd >>> 16;
+ }
+
+ /**
+  * @return the low 16 bits of the chunk end; presumably the offset of the chunk end
+  *         within its uncompressed block.
+  */
+ public int getBlockOffsetEnd() {
+     // Masking before or after narrowing to int yields the same low 16 bits.
+     final long lowBits = getChunkEnd() & 0xFFFF;
+     return (int) lowBits;
+ }
+
/**
* Computes an approximation of the uncompressed size of the
* chunk, in bytes. Can be used to determine relative weights
diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java
new file mode 100644
index 000000000..d825c23d5
--- /dev/null
+++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package net.sf.samtools.util;
+
+import java.io.IOException;
+
+/**
+ * An input stream formulated for use reading BAM files. Supports
+ */
+public interface BAMInputStream {
+ /**
+ * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+ * not an actual byte offset.
+ *
+ * @param pos virtual file pointer
+ */
+ public void seek(final long pos) throws IOException;
+
+ /**
+ * @return virtual file pointer that can be passed to seek() to return to the current position. This is
+ * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
+ * the two.
+ */
+ public long getFilePointer();
+
+ /**
+ * Determines whether or not the inflater will re-calculated the CRC on the decompressed data
+ * and check it against the value stored in the GZIP header. CRC checking is an expensive
+ * operation and should be used accordingly.
+ */
+ public void setCheckCrcs(final boolean check);
+
+ public int read() throws java.io.IOException;
+
+ public int read(byte[] bytes) throws java.io.IOException;
+
+ public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
+
+ public long skip(long l) throws java.io.IOException;
+
+ public int available() throws java.io.IOException;
+
+ public void close() throws java.io.IOException;
+
+ public void mark(int i);
+
+ public void reset() throws java.io.IOException;
+
+ public boolean markSupported();
+}
diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
new file mode 100755
index 000000000..fae2fc89b
--- /dev/null
+++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
@@ -0,0 +1,483 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.samtools.util;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+import net.sf.samtools.FileTruncatedException;
+
+/**
+ * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
+ * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
+ * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
+ * entire file up to the location being sought. Note that seeking is not possible if the ctor(InputStream) is used.
+ *
+ * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
+ */
+public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
+ private InputStream mStream = null;
+ private SeekableStream mFile = null;
+ private byte[] mFileBuffer = null;
+ private byte[] mCurrentBlock = null;
+ private int mCurrentOffset = 0;
+ private long mBlockAddress = 0;
+ private int mLastBlockLength = 0;
+ private final BlockGunzipper blockGunzipper = new BlockGunzipper();
+
+
+ /**
+ * Note that seek() is not supported if this ctor is used.
+ */
+ public BlockCompressedInputStream(final InputStream stream) {
+ mStream = IOUtil.toBufferedStream(stream);
+ mFile = null;
+ }
+
+ /**
+ * Use this ctor if you wish to call seek()
+ */
+ public BlockCompressedInputStream(final File file)
+ throws IOException {
+ mFile = new SeekableFileStream(file);
+ mStream = null;
+
+ }
+
+ public BlockCompressedInputStream(final URL url) {
+ mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
+ mStream = null;
+ }
+
+ /**
+ * For providing some arbitrary data source. No additional buffering is
+ * provided, so if the underlying source is not buffered, wrap it in a
+ * SeekableBufferedStream before passing to this ctor.
+ */
+ public BlockCompressedInputStream(final SeekableStream strm) {
+ mFile = strm;
+ mStream = null;
+ }
+
+ /**
+ * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
+ * and check it against the value stored in the GZIP header. CRC checking is an expensive
+ * operation and should be used accordingly.
+ */
+ public void setCheckCrcs(final boolean check) {
+ this.blockGunzipper.setCheckCrcs(check);
+ }
+
+ /**
+ * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
+ * next caller of a method for this input stream. The next caller might be the same thread or another thread.
+ * Note that although the next caller can read this many bytes without blocking, the available() method call itself
+ * may block in order to fill an internal buffer if it has been exhausted.
+ */
+ public int available()
+ throws IOException {
+ if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
+ readBlock();
+ }
+ if (mCurrentBlock == null) {
+ return 0;
+ }
+ return mCurrentBlock.length - mCurrentOffset;
+ }
+
+ /**
+ * Closes the underlying InputStream or SeekableStream
+ */
+ public void close()
+ throws IOException {
+ if (mFile != null) {
+ mFile.close();
+ mFile = null;
+ } else if (mStream != null) {
+ mStream.close();
+ mStream = null;
+ }
+ // Encourage garbage collection
+ mFileBuffer = null;
+ mCurrentBlock = null;
+ }
+
+ /**
+ * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
+ * If no byte is available because the end of the stream has been reached, the value -1 is returned.
+ * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
+ *
+ * @return the next byte of data, or -1 if the end of the stream is reached.
+ */
+ public int read()
+ throws IOException {
+ return (available() > 0) ? (mCurrentBlock[mCurrentOffset++] & 0xFF) : -1; // mask to 0-255: a raw signed byte >= 0x80 would be negative, and 0xFF would look like EOF
+ }
+
+ /**
+ * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
+ * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
+ * or an exception is thrown.
+ *
+ * read(buf) has the same effect as read(buf, 0, buf.length).
+ *
+ * @param buffer the buffer into which the data is read.
+ * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+ * the stream has been reached.
+ */
+ public int read(final byte[] buffer)
+ throws IOException {
+ return read(buffer, 0, buffer.length);
+ }
+
+ private volatile ByteArrayOutputStream buf = null;
+ private static final byte eol = '\n';
+ private static final byte eolCr = '\r';
+
+ /**
+ * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
+ * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
+ *
+ * @return A String containing the contents of the line, excluding the line terminating
+ * character, or null if the end of the stream has been reached
+ *
+ * @exception IOException If an I/O error occurs
+ *
+ */
+ public String readLine() throws IOException {
+ int available = available();
+ if (available == 0) {
+ return null;
+ }
+ if(null == buf){ // lazy initialisation
+ buf = new ByteArrayOutputStream(8192);
+ }
+ buf.reset();
+ boolean done = false;
+ boolean foundCr = false; // \r found flag
+ while (!done) {
+ int linetmpPos = mCurrentOffset;
+ int bCnt = 0;
+ while((available-- > 0)){
+ final byte c = mCurrentBlock[linetmpPos++];
+ if(c == eol){ // found \n
+ done = true;
+ break;
+ } else if(foundCr){ // previous char was \r
+ --linetmpPos; // current char is not \n so put it back
+ done = true;
+ break;
+ } else if(c == eolCr){ // found \r
+ foundCr = true;
+ continue; // no ++bCnt
+ }
+ ++bCnt;
+ }
+ if(mCurrentOffset < linetmpPos){
+ buf.write(mCurrentBlock, mCurrentOffset, bCnt);
+ mCurrentOffset = linetmpPos;
+ }
+ available = available();
+ if(available == 0){
+ // EOF
+ done = true;
+ }
+ }
+ return buf.toString();
+ }
+
+ /**
+ * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
+ * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
+ *
+ * This method blocks until input data is available, end of file is detected, or an exception is thrown.
+ *
+ * @param buffer buffer into which data is read.
+ * @param offset the start offset in array b at which the data is written.
+ * @param length the maximum number of bytes to read.
+ * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+ * the stream has been reached.
+ */
+ public int read(final byte[] buffer, int offset, int length)
+ throws IOException {
+ final int originalLength = length;
+ while (length > 0) {
+ final int available = available();
+ if (available == 0) {
+ // Signal EOF to caller
+ if (originalLength == length) {
+ return -1;
+ }
+ break;
+ }
+ final int copyLength = Math.min(length, available);
+ System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
+ mCurrentOffset += copyLength;
+ offset += copyLength;
+ length -= copyLength;
+ }
+ return originalLength - length;
+ }
+
+ /**
+ * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+ * not an actual byte offset.
+ *
+ * @param pos virtual file pointer
+ */
+ public void seek(final long pos)
+ throws IOException {
+ if (mFile == null) {
+ throw new IOException("Cannot seek on stream based file");
+ }
+ // Decode virtual file pointer
+ // Upper 48 bits is the byte offset into the compressed stream of a block.
+ // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
+ final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
+ final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
+ final int available;
+ if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
+ available = mCurrentBlock.length;
+ } else {
+ mFile.seek(compressedOffset);
+ mBlockAddress = compressedOffset;
+ mLastBlockLength = 0;
+ readBlock();
+ available = available();
+ }
+ if (uncompressedOffset > available ||
+ (uncompressedOffset == available && !eof())) {
+ throw new IOException("Invalid file pointer: " + pos);
+ }
+ mCurrentOffset = uncompressedOffset;
+ }
+
+ private boolean eof() throws IOException {
+ if (mFile.eof()) {
+ return true;
+ }
+ // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
+ return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+ }
+
+ /**
+ * @return virtual file pointer that can be passed to seek() to return to the current position. This is
+ * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
+ * the two. If no block is loaded (nothing read yet, or after close()), the pointer is built from the stored block address and offset.
+ */
+ public long getFilePointer() {
+ if (mCurrentBlock != null && mCurrentOffset == mCurrentBlock.length) { // null guard: mCurrentBlock is unset until the first block is read
+ // If current offset is at the end of the current block, file pointer should point
+ // to the beginning of the next block.
+ return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
+ }
+ return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
+ }
+
+ public static long getFileBlock(final long bgzfOffset) {
+ return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
+ }
+
+ /**
+ * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
+ * @return true if the given file looks like a valid BGZF file.
+ */
+ public static boolean isValidFile(final InputStream stream)
+ throws IOException {
+ if (!stream.markSupported()) {
+ throw new RuntimeException("Cannot test non-buffered stream");
+ }
+ stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
+ final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ stream.reset();
+ return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
+ }
+
+ private static boolean isValidBlockHeader(final byte[] buffer) {
+ return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
+ (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
+ (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
+ buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
+ buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
+ buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
+ }
+
+ private void readBlock()
+ throws IOException {
+
+ if (mFileBuffer == null) {
+ mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
+ }
+ int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ if (count == 0) {
+ // Handle case where there is no empty gzip block at end.
+ mCurrentOffset = 0;
+ mBlockAddress += mLastBlockLength;
+ mCurrentBlock = new byte[0];
+ return;
+ }
+ if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
+ throw new IOException("Premature end of file");
+ }
+ final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
+ if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
+ throw new IOException("Unexpected compressed block length: " + blockLength);
+ }
+ final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
+ count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
+ if (count != remaining) {
+ throw new FileTruncatedException("Premature end of file");
+ }
+ inflateBlock(mFileBuffer, blockLength);
+ mCurrentOffset = 0;
+ mBlockAddress += mLastBlockLength;
+ mLastBlockLength = blockLength;
+ }
+
+ private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
+ throws IOException {
+ final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
+ byte[] buffer = mCurrentBlock;
+ mCurrentBlock = null;
+ if (buffer == null || buffer.length != uncompressedLength) {
+ try {
+ buffer = new byte[uncompressedLength];
+ } catch (NegativeArraySizeException e) {
+ throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
+ }
+ }
+ blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
+ mCurrentBlock = buffer;
+ }
+
+ private int readBytes(final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ if (mFile != null) {
+ return readBytes(mFile, buffer, offset, length);
+ } else if (mStream != null) {
+ return readBytes(mStream, buffer, offset, length);
+ } else {
+ return 0;
+ }
+ }
+
+ private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ int bytesRead = 0;
+ while (bytesRead < length) {
+ final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
+ if (count <= 0) {
+ break;
+ }
+ bytesRead += count;
+ }
+ return bytesRead;
+ }
+
+ private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ int bytesRead = 0;
+ while (bytesRead < length) {
+ final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
+ if (count <= 0) {
+ break;
+ }
+ bytesRead += count;
+ }
+ return bytesRead;
+ }
+
+ private int unpackInt16(final byte[] buffer, final int offset) {
+ return ((buffer[offset] & 0xFF) |
+ ((buffer[offset+1] & 0xFF) << 8));
+ }
+
+ private int unpackInt32(final byte[] buffer, final int offset) {
+ return ((buffer[offset] & 0xFF) |
+ ((buffer[offset+1] & 0xFF) << 8) |
+ ((buffer[offset+2] & 0xFF) << 16) |
+ ((buffer[offset+3] & 0xFF) << 24));
+ }
+
+ public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
+
+ public static FileTermination checkTermination(final File file) // classifies how the file ends: terminator block, healthy last block, or defective
+ throws IOException {
+ final long fileSize = file.length();
+ if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { // too short to hold even the empty terminator block
+ return FileTermination.DEFECTIVE;
+ }
+ final RandomAccessFile raFile = new RandomAccessFile(file, "r");
+ try {
+ raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+ byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
+ raFile.readFully(buf);
+ if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { // file ends with the empty terminator block
+ return FileTermination.HAS_TERMINATOR_BLOCK;
+ }
+ final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
+ buf = new byte[bufsize];
+ raFile.seek(fileSize - bufsize);
+ raFile.readFully(buf); // read() may return short and leave the tail of buf zeroed; readFully fills it or throws
+ for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; // scan backwards for the last block preamble
+ i >= 0; --i) {
+ if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
+ buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
+ continue;
+ }
+ final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
+ byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
+ if (buf.length - i == totalBlockSizeMinusOne + 1) { // the block's declared size must reach exactly to end of file
+ return FileTermination.HAS_HEALTHY_LAST_BLOCK;
+ } else {
+ return FileTermination.DEFECTIVE;
+ }
+ }
+ return FileTermination.DEFECTIVE;
+ } finally {
+ raFile.close();
+ }
+ }
+
+ private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
+ for (int i = 0; i < length; ++i) {
+ if (preamble[i] != buf[i + startOffset]) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
+
+
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index bed1e710e..b0b57f7fc 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
* used to indicate an error occured
*
* @param msg the message
- * @param e the error
+ * @param t the error
*/
- public static void exitSystemWithError(String msg, final Exception e) {
+ public static void exitSystemWithError(String msg, final Throwable t) {
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("stack trace %n");
- e.printStackTrace();
+ t.printStackTrace();
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
@@ -394,8 +394,8 @@ public abstract class CommandLineProgram {
*
* @param e the exception occured
*/
- public static void exitSystemWithError(Exception e) {
- exitSystemWithError(e.getMessage(), e);
+ public static void exitSystemWithError(Throwable t) {
+ exitSystemWithError(t.getMessage(), t);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index b8488dc9a..d3db35c07 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -99,8 +99,8 @@ public class CommandLineGATK extends CommandLineExecutable {
} catch (net.sf.samtools.SAMException e) {
// Let's try this out and see how it is received by our users
exitSystemWithSamError(e);
- } catch (Exception e) {
- exitSystemWithError(e);
+ } catch (Throwable t) {
+ exitSystemWithError(t);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index f8e87aa58..f2e0b5d0c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
@@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
*/
private Collection filters;
+ /**
+ * Controls the allocation of threads between CPU vs IO.
+ */
+ private ThreadAllocation threadAllocation;
+
/**
* A currently hacky unique name for this GATK instance
*/
@@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
+ // Determine how the threads should be divided between CPU vs. IO.
+ determineThreadAllocation();
+
// Prepare the data for traversal.
initializeDataSources();
@@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
// create the output streams "
initializeOutputStreams(microScheduler.getOutputTracker());
- ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
+ Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results
return microScheduler.execute(this.walker, shardStrategy);
@@ -266,6 +275,16 @@ public class GenomeAnalysisEngine {
return Collections.unmodifiableList(filters);
}
+ /**
+ * Parse out the thread allocation from the given command-line argument.
+ */
+ private void determineThreadAllocation() {
+ Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
+ Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null;
+ Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null;
+ this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
+ }
+
/**
* Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package.
@@ -286,7 +305,7 @@ public class GenomeAnalysisEngine {
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
}
- return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
+ return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
}
protected DownsamplingMethod getDownsamplingMethod() {
@@ -397,103 +416,49 @@ public class GenomeAnalysisEngine {
* @param intervals intervals
* @return the sharding strategy
*/
- protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
+ protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
- // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
- // sharding system; it's required with the new sharding system only for locus walkers.
- if(readsDataSource != null && !readsDataSource.hasIndex() ) {
- if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
+
+ // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
+ if(!readsDataSource.isEmpty()) {
+ if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
- if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
+ if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
- Shard.ShardType shardType;
if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
- shardType = Shard.ShardType.LOCUS;
+ if(intervals == null)
+ return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
+ else
+ return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
+ }
+ else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
+ // Apply special validation to read pair walkers.
+ if(walker instanceof ReadPairWalker) {
+ if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
+ throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
+ if(intervals != null && !intervals.isEmpty())
+ throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
+ }
+
+ if(intervals == null)
+ return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
+ else
+ return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
}
- else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
- shardType = Shard.ShardType.READ;
else
- throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
-
- List region;
- if(intervals != null)
- region = intervals.toList();
- else {
- region = new ArrayList();
- for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
- region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
- }
-
- return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
+ throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
+ }
+ else {
+ final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000;
+ if(intervals == null)
+ return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
+ else
+ return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
}
-
- ShardStrategy shardStrategy;
- ShardStrategyFactory.SHATTER_STRATEGY shardType;
-
- long SHARD_SIZE = 100000L;
-
- if (walker instanceof LocusWalker) {
- if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
-
- if (intervals != null && !intervals.isEmpty()) {
- if (readsDataSource == null)
- throw new IllegalArgumentException("readsDataSource is null");
- if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
-
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser(),
- intervals);
- } else
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,getGenomeLocParser());
- } else if (walker instanceof ReadWalker ||
- walker instanceof DuplicateWalker) {
- shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
-
- if (intervals != null && !intervals.isEmpty()) {
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- shardType,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser(),
- intervals);
- } else {
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- shardType,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser());
- }
- } else if (walker instanceof ReadPairWalker) {
- if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
- if(intervals != null && !intervals.isEmpty())
- throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
-
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser());
- } else
- throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
-
- return shardStrategy;
}
protected boolean flashbackData() {
@@ -751,6 +716,8 @@ public class GenomeAnalysisEngine {
return new SAMDataSource(
samReaderIDs,
+ threadAllocation,
+ argCollection.numberOfBAMFileHandles,
genomeLocParser,
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
@@ -763,8 +730,7 @@ public class GenomeAnalysisEngine {
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
- argCollection.defaultBaseQualities,
- !argCollection.disableLowMemorySharding);
+ argCollection.defaultBaseQualities);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 8078a1ea4..64b63dcd2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -194,10 +194,14 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
- @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
- public int numberOfThreads = 1;
+ /** How many threads should be allocated to this analysis. */
+ @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
+ public Integer numberOfThreads = 1;
- @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false)
+ @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
+ public Integer numberOfBAMFileHandles = null;
+
+ @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false)
public List readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
@@ -292,9 +296,6 @@ public class GATKArgumentCollection {
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
- @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
- public boolean disableLowMemorySharding = false;
-
// --------------------------------------------------------------------------------------------------------------
//
// methods
@@ -365,7 +366,11 @@ public class GATKArgumentCollection {
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
return false;
}
- if (other.numberOfThreads != this.numberOfThreads) {
+ if (other.numberOfThreads == null ? this.numberOfThreads != null : !other.numberOfThreads.equals(this.numberOfThreads)) { // null-safe: numberOfThreads is a boxed Integer, same treatment as numberOfBAMFileHandles
+ return false;
+ }
+ if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
+ (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
return false;
}
if (other.intervalMerging != this.intervalMerging) {
@@ -389,9 +394,6 @@ public class GATKArgumentCollection {
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;
- if (disableLowMemorySharding != other.disableLowMemorySharding)
- return false;
-
return true;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java
deleted file mode 100644
index de938e845..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.exceptions.StingException;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.channels.FileChannel;
-import java.util.Iterator;
-
-/**
- * Created by IntelliJ IDEA.
- * User: mhanna
- * Date: Feb 7, 2011
- * Time: 2:46:34 PM
- * To change this template use File | Settings | File Templates.
- */
-public class BAMBlockStartIterator implements Iterator {
- /**
- * How large is a BGZF header?
- */
- private static int BGZF_HEADER_SIZE = 18;
-
- /**
- * Where within the header does the BLOCKSIZE actually live?
- */
- private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
-
- private FileChannel bamInputChannel;
- private ByteBuffer headerByteBuffer;
-
- private long nextLocation = 0;
-
- public BAMBlockStartIterator(File bamFile) {
- try {
- FileInputStream bamInputStream = new FileInputStream(bamFile);
- bamInputChannel = bamInputStream.getChannel();
-
- headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
- headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
-
- }
- catch(IOException ex) {
- throw new StingException("Could not open file",ex);
- }
- }
-
- public boolean hasNext() {
- return nextLocation != -1;
- }
-
- public Long next() {
- long currentLocation = nextLocation;
- advance();
- return currentLocation;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
- }
-
- private void advance() {
- int readStatus;
-
- headerByteBuffer.clear();
- try {
- readStatus = bamInputChannel.read(headerByteBuffer);
- }
- catch(IOException ex) {
- throw new StingException("Could not read header data",ex);
- }
-
- if(readStatus == -1) {
- nextLocation = -1;
- try {
- bamInputChannel.close();
- }
- catch(IOException ex) {
- throw new StingException("Could not close input file",ex);
- }
- return;
- }
-
- headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
- int blockSize = headerByteBuffer.getShort();
-
- try {
- bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
- nextLocation = bamInputChannel.position();
- }
- catch(IOException ex) {
- throw new StingException("Could not reposition input stream",ex);
- }
- }
-
- public static void main(String argv[]) throws IOException {
- BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
- int i = 0;
- while(blockStartIterator.hasNext())
- System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java
deleted file mode 100644
index 4d91fb45f..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.GATKBin;
-import net.sf.samtools.GATKChunk;
-import net.sf.samtools.LinearIndex;
-
-import java.util.*;
-
-/**
- * Represents the contents of a bam index file for one reference.
- * A BAM index (.bai) file contains information for all references in the bam file.
- * This class describes the data present in the index file for one of these references;
- * including the bins, chunks, and linear index.
- */
-class BAMIndexContent {
- /**
- * The reference sequence for the data currently loaded.
- */
- private final int mReferenceSequence;
-
- /**
- * A list of all bins in the above reference sequence.
- */
- private final BinList mBinList;
-
- /**
- * The linear index for the reference sequence above.
- */
- private final LinearIndex mLinearIndex;
-
-
- /**
- * @param referenceSequence Content corresponds to this reference.
- * @param bins Array of bins represented by this content, possibly sparse
- * @param numberOfBins Number of non-null bins
- * @param linearIndex Additional index used to optimize queries
- */
- BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
- this.mReferenceSequence = referenceSequence;
- this.mBinList = new BinList(bins, numberOfBins);
- this.mLinearIndex = linearIndex;
- }
-
- /**
- * Reference for this Content
- */
- public int getReferenceSequence() {
- return mReferenceSequence;
- }
-
- /**
- * Does this content have anything in this bin?
- */
- public boolean containsBin(final GATKBin bin) {
- return mBinList.getBin(bin.getBinNumber()) != null;
- }
-
- /**
- * @return iterable list of bins represented by this content
- */
- public BinList getBins() {
- return mBinList;
- }
-
- /**
- * @return the number of non-null bins represented by this content
- */
- int getNumberOfNonNullBins() {
- return mBinList.getNumberOfNonNullBins();
- }
-
- /**
- * @return all chunks associated with all bins in this content
- */
- public List getAllChunks() {
- List allChunks = new ArrayList();
- for (GATKBin b : mBinList)
- if (b.getChunkList() != null) {
- allChunks.addAll(Arrays.asList(b.getChunkList()));
- }
- return Collections.unmodifiableList(allChunks);
- }
-
- /**
- * @return the linear index represented by this content
- */
- public LinearIndex getLinearIndex() {
- return mLinearIndex;
- }
-
- /**
- * This class is used to encapsulate the list of Bins store in the BAMIndexContent
- * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
- */
- class BinList implements Iterable {
-
- private final GATKBin[] mBinArray;
- public final int numberOfNonNullBins;
- public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based
-
- /**
- * @param binArray a sparse array representation of the bins. The index into the array is the bin number.
- * @param numberOfNonNullBins
- */
- BinList(GATKBin[] binArray, int numberOfNonNullBins) {
- this.mBinArray = binArray;
- this.numberOfNonNullBins = numberOfNonNullBins;
- this.maxBinNumber = mBinArray.length - 1;
- }
-
- GATKBin getBin(int binNumber) {
- if (binNumber > maxBinNumber) return null;
- return mBinArray[binNumber];
- }
-
- int getNumberOfNonNullBins() {
- return numberOfNonNullBins;
- }
-
- /**
- * Gets an iterator over all non-null bins.
- *
- * @return An iterator over all bins.
- */
- public Iterator iterator() {
- return new BinIterator();
- }
-
- private class BinIterator implements Iterator {
- /**
- * Stores the bin # of the Bin currently in use.
- */
- private int nextBin;
-
- public BinIterator() {
- nextBin = 0;
- }
-
- /**
- * Are there more bins in this set, waiting to be returned?
- *
- * @return True if more bins are remaining.
- */
- public boolean hasNext() {
- while (nextBin <= maxBinNumber) {
- if (getBin(nextBin) != null) return true;
- nextBin++;
- }
- return false;
- }
-
- /**
- * Gets the next bin in the provided BinList.
- *
- * @return the next available bin in the BinList.
- */
- public GATKBin next() {
- if (!hasNext())
- throw new NoSuchElementException("This BinIterator is currently empty");
- GATKBin result = getBin(nextBin);
- nextBin++;
- return result;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Unable to remove from a bin iterator");
- }
- }
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java
deleted file mode 100644
index 15a372ca6..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.Bin;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Models a bin at which all BAM files in the merged input stream overlap.
- */
-class BAMOverlap {
- public final int start;
- public final int stop;
-
- private final Map bins = new HashMap();
-
- public BAMOverlap(final int start, final int stop) {
- this.start = start;
- this.stop = stop;
- }
-
- public void addBin(final SAMReaderID id, final Bin bin) {
- bins.put(id,bin);
- }
-
- public Bin getBin(final SAMReaderID id) {
- return bins.get(id);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
index 521bcd5a3..762722fcd 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
@@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator {
/**
* Create a new BAM schedule based on the given index.
- * @param indexFiles Index files.
+ * @param dataSource The SAM data source to use.
* @param intervals List of
*/
- public BAMSchedule(final Map indexFiles, final List intervals) {
+ public BAMSchedule(final SAMDataSource dataSource, final List intervals) {
if(intervals.isEmpty())
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
- referenceSequence = intervals.get(0).getContigIndex();
+ referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
createScheduleFile();
- readerIDs.addAll(indexFiles.keySet());
+ readerIDs.addAll(dataSource.getReaderIDs());
for(final SAMReaderID reader: readerIDs) {
- final GATKBAMIndex index = indexFiles.get(reader);
+ final GATKBAMIndex index = dataSource.getIndex(reader);
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
@@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator {
if(selectedIterators.isEmpty())
return;
+ // Create the target schedule entry
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
+
+ // For each schedule entry with data, load the data into the merged schedule.
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
PeekableIterator scheduleIterator = scheduleIterators.get(reader);
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
@@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator {
scheduleIterator.next();
}
+ // For each schedule entry without data, add a blank entry.
+ for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
+ mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
+ }
+
nextScheduleEntry = mergedScheduleEntry;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index 47eb55b28..dca4cc771 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileSpan;
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*;
@@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator {
private FilePointer nextFilePointer = null;
- private final GenomeLocSortedSet loci;
+ private GenomeLocSortedSet loci;
+ private PeekableIterator locusIterator;
+ private GenomeLoc currentLocus;
- private final PeekableIterator locusIterator;
+    public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
+        final BAMScheduler mappedScheduler = new BAMScheduler(dataSource);
+        final GenomeLocSortedSet wholeContigs = new GenomeLocSortedSet(parser);
+        for(final SAMSequenceRecord referenceContig: referenceSequenceDictionary.getSequences()) {
+            final String contigName = referenceContig.getSequenceName();
+            if(dataSource.getHeader().getSequenceDictionary().getSequence(contigName) != null) // name match only; startup validation is trusted to have checked that the sequences agree
+                wholeContigs.add(parser.createOverEntireContig(contigName));
+        }
+        mappedScheduler.populateFilteredIntervalList(wholeContigs);
+        return mappedScheduler;
+    }
- private GenomeLoc currentLocus;
+    public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
+        final BAMScheduler unboundedScheduler = new BAMScheduler(dataSource); // no interval list: schedule over the whole fileset
+        unboundedScheduler.populateUnfilteredIntervalList(parser);
+        return unboundedScheduler;
+    }
- public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+    public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+        final BAMScheduler intervalScheduler = new BAMScheduler(dataSource); // restricted to the caller-supplied loci
+        intervalScheduler.populateFilteredIntervalList(loci);
+        return intervalScheduler;
+    }
+
+
+ private BAMScheduler(final SAMDataSource dataSource) {
this.dataSource = dataSource;
- for(SAMReaderID reader: dataSource.getReaderIDs())
- indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
+ for(SAMReaderID reader: dataSource.getReaderIDs()) {
+ GATKBAMIndex index = dataSource.getIndex(reader); // look the index up once per reader
+ if(index != null) // readers without an index are left out of indexFiles
+ indexFiles.put(reader,index);
+ }
+ }
+
+ /**
+ * The consumer has asked for a bounded set of locations. With index data, iterate locus by locus; without any, queue one file pointer over the entire fileset tagged with every locus.
+ * @param loci The list of locations to search and iterate over.
+ */
+ private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
this.loci = loci;
- locusIterator = new PeekableIterator(loci.iterator());
- if(locusIterator.hasNext())
- currentLocus = locusIterator.next();
- advance();
+ if(!indexFiles.isEmpty()) {
+ // At least one reader has an index: start up the locus iterator and compute the first file pointer.
+ locusIterator = new PeekableIterator(loci.iterator());
+ if(locusIterator.hasNext())
+ currentLocus = locusIterator.next();
+ advance();
+ }
+ else {
+ // No index at all: seed the scheduler with a single file pointer over the entire fileset and record every requested locus on it.
+ nextFilePointer = generatePointerOverEntireFileset();
+ for(GenomeLoc locus: loci)
+ nextFilePointer.addLocation(locus);
+ locusIterator = new PeekableIterator(Collections.emptyList().iterator()); // nothing further to iterate over
+ }
+ }
+
+ /**
+ * The consumer wants to iterate over all available data rather than a bounded interval list. Seed the scheduler with an empty
+ * locus set built from the given parser, an empty locus iterator, and a single file pointer over the entire fileset.
+ */
+ private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
+ this.loci = new GenomeLocSortedSet(parser);
+ locusIterator = new PeekableIterator(Collections.emptyList().iterator());
+ nextFilePointer = generatePointerOverEntireFileset();
+ }
+
+ /**
+ * Generate, for every reader, a span from the start of the first chunk of the data source's current position to the end of the file (presumably the current position sits just past the BAM header -- confirm against SAMDataSource).
+ * @return A file pointer holding one to-end-of-file span per reader.
+ */
+ private FilePointer generatePointerOverEntireFileset() {
+ FilePointer filePointer = new FilePointer();
+ Map currentPosition = dataSource.getCurrentPosition();
+ for(SAMReaderID reader: dataSource.getReaderIDs())
+ filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
+ return filePointer;
}
public boolean hasNext() {
@@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator {
if(!hasNext())
throw new NoSuchElementException("No next element available in interval sharder");
FilePointer currentFilePointer = nextFilePointer;
+ nextFilePointer = null;
advance();
+
return currentFilePointer;
}
@@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator {
if(loci.isEmpty())
return;
- nextFilePointer = null;
while(nextFilePointer == null && currentLocus != null) {
// special case handling of the unmapped shard.
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
- nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
+ nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
currentLocus = null;
continue;
}
@@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator {
int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null;
- BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
+ BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
// No overlapping data at all.
if(scheduleEntry != null) {
@@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator {
}
else {
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
- //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
for(SAMReaderID reader: indexFiles.keySet())
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
}
@@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator {
// Early exit if no bins were found.
if(coveredRegion == null) {
// for debugging only: maximum split is 16384.
- if(currentLocus.size() > 16384) {
- GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
- nextFilePointer.addLocation(splitContigs[0]);
- currentLocus = splitContigs[1];
- }
- else {
- nextFilePointer.addLocation(currentLocus);
- currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
- }
+ nextFilePointer.addLocation(currentLocus);
+ currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
continue;
}
// Early exit if only part of the first interval was found.
if(currentLocus.startsBefore(coveredRegion)) {
- // for debugging only: maximum split is 16384.
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
nextFilePointer.addLocation(splitContigs[0]);
@@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator {
/**
* Get the next overlapping tree of bins associated with the given BAM file.
- * @param indices BAM indices.
* @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci.
*/
- private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) {
+ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
+ // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
+ // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
+ // we'll be using the correct contig index for the BAMs.
+ // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
+ final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
+
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
- if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
+ if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
if(bamScheduleIterator != null)
bamScheduleIterator.close();
- lastReferenceSequenceLoaded = currentLocus.getContigIndex();
+ lastReferenceSequenceLoaded = currentContigIndex;
// Naive algorithm: find all elements in current contig for proper schedule creation.
List lociInContig = new LinkedList();
for(GenomeLoc locus: loci) {
- if(locus.getContigIndex() == lastReferenceSequenceLoaded)
+ if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
lociInContig.add(locus);
}
- bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig));
+ bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig));
}
if(!bamScheduleIterator.hasNext())
@@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator {
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
}
+ /**
+ * Create a span from the given start point to the end of the file.
+ * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 | block offset).
+ * @return A file span from the given point to the end of the file; the end is expressed by passing Long.MAX_VALUE as the chunk's second bound.
+ */
+ private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
+ return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
+ }
+
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
new file mode 100644
index 000000000..f468d2020
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.LinkedList;
+import java.util.Queue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+/**
+ * Preloads BGZF blocks in preparation for unzipping and data processing.
+ * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
+ * TODO: be replaced with a central thread management strategy.
+ */
+public class BGZFBlockLoadingDispatcher {
+    /**
+     * The file handle cache, used when allocating blocks from the dispatcher.
+     */
+    private final FileHandleCache fileHandleCache;
+    /** Thread pool on which the block loader runs. */
+    private final ExecutorService threadPool;
+    /** Pending block-load requests; the queue object is also the monitor used to wait for and signal new work. */
+    private final Queue<SAMReaderPosition> inputQueue;
+    /** Creates a fixed pool of numThreads threads and a cache of numFileHandles file handles, then submits a single BlockLoader. */
+    public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
+        threadPool = Executors.newFixedThreadPool(numThreads);
+        fileHandleCache = new FileHandleCache(numFileHandles);
+        inputQueue = new LinkedList<SAMReaderPosition>();
+        threadPool.execute(new BlockLoader(this,fileHandleCache,true));
+    }
+
+    /**
+     * Initiates a request for a new block load and wakes one thread waiting in claimNextWorkRequest().
+     * @param readerPosition Position at which to load.
+     */
+    void queueBlockLoad(final SAMReaderPosition readerPosition) {
+        synchronized(inputQueue) {
+            inputQueue.add(readerPosition);
+            inputQueue.notify();
+        }
+    }
+
+    /**
+     * Claims the next work request from the queue, waiting on the queue's monitor until a request has been queued.
+     * @return The next work request; this call blocks rather than returning null when the queue is empty.
+     */
+    SAMReaderPosition claimNextWorkRequest() {
+        synchronized(inputQueue) {
+            while(inputQueue.isEmpty()) {
+                try {
+                    inputQueue.wait();
+                }
+                catch(InterruptedException ex) {
+                    Thread.currentThread().interrupt(); // restore the flag so code further up the stack can still see the interruption
+                    throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item",ex);
+                }
+            }
+            return inputQueue.poll();
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
new file mode 100644
index 000000000..e377f865d
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
+import net.sf.samtools.util.BAMInputStream;
+import net.sf.samtools.util.BlockCompressedFilePointerUtil;
+import net.sf.samtools.util.BlockCompressedInputStream;
+import net.sf.samtools.util.RuntimeEOFException;
+import net.sf.samtools.util.SeekableStream;
+import org.broad.tribble.util.BlockCompressedStreamConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+/**
+ * Presents decompressed blocks to the SAMFileReader.
+ */
+public class BlockInputStream extends SeekableStream implements BAMInputStream {
+ /**
+ * Mechanism for triggering block loads.
+ */
+ private final BGZFBlockLoadingDispatcher dispatcher;
+
+ /**
+ * The reader whose data is supplied by this input stream.
+ */
+ private final SAMReaderID reader;
+
+ /**
+ * Length of the input stream.
+ */
+ private final long length;
+
+ /**
+ * The latest error reported by an asynchronous block load.
+ */
+ private Throwable error;
+
+ /**
+ * Current position.
+ */
+ private SAMReaderPosition position;
+
+ /**
+ * A stream of compressed data blocks.
+ */
+ private final ByteBuffer buffer;
+
+ /**
+ * Offsets of the given blocks in the buffer.
+ */
+ private LinkedList blockOffsets = new LinkedList();
+
+ /**
+ * Source positions of the given blocks in the buffer.
+ */
+ private LinkedList blockPositions = new LinkedList();
+
+ /**
+ * Provides a lock to wait for more data to arrive.
+ */
+ private final Object lock = new Object();
+
+ /**
+ * An input stream to use when comparing data back to what it should look like.
+ */
+ private final BlockCompressedInputStream validatingInputStream;
+
+ /**
+ * Has the buffer been filled since last request?
+ */
+ private boolean bufferFilled = false;
+
+ /**
+ * Create a new block presenting input stream with a dedicated buffer.
+ * @param dispatcher the block loading messenger.
+ * @param reader the reader for which to load data.
+ * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
+ */
+ BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
+ this.reader = reader;
+ this.length = reader.samFile.length();
+
+ buffer = ByteBuffer.wrap(new byte[64*1024]);
+ buffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ // The state of the buffer assumes that the range of data written into the buffer appears in the range
+ // [position,limit), while extra capacity exists in the range [limit,capacity)
+ buffer.limit(0);
+
+ this.dispatcher = dispatcher;
+ // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
+ this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
+
+ try {
+ if(validate) {
+ System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
+ validatingInputStream = new BlockCompressedInputStream(reader.samFile);
+ // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
+ // Poke the stream to start reading data.
+ validatingInputStream.available();
+ }
+ else
+ validatingInputStream = null;
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
+ }
+
+ /**
+  * Reports the length captured from reader.samFile when this stream was constructed.
+  * @return the length of the input stream.
+  */
+ public long length() {
+     return this.length;
+ }
+
+ /**
+  * Computes the current virtual file pointer: a block address shifted left 16 bits, or'd with the
+  * read offset into the buffer when buffered data is available.
+  * @return the virtual file pointer of the next byte to be read.
+  * @throws ReviewedStingException in validation mode, if the pointer disagrees with the validating stream.
+  */
+ public long getFilePointer() {
+     final long filePointer;
+     synchronized(lock) {
+         if(buffer.remaining() <= 0) {
+             // Nothing buffered: report the start of the next block to load.
+             filePointer = position.getBlockAddress() << 16;
+         }
+         else {
+             // Data is buffered: the head of blockPositions says which block it came from.
+             final long blockAddress = blockPositions.isEmpty() ? 0 : blockPositions.get(0);
+             filePointer = blockAddress << 16 | buffer.position();
+         }
+     }
+
+     if(validatingInputStream != null) {
+         final long validatingPointer = validatingInputStream.getFilePointer();
+         if(filePointer != validatingPointer)
+             throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
+                     BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
+                     BlockCompressedFilePointerUtil.getBlockAddress(validatingPointer),BlockCompressedFilePointerUtil.getBlockOffset(validatingPointer)));
+     }
+
+     return filePointer;
+ }
+
+ /**
+  * Repositions the stream at the given virtual file pointer: discards buffered data, advances the
+  * position to the target's block, waits for that block to arrive, then skips to the in-block offset.
+  * @param target virtual file pointer to seek to.
+  */
+ public void seek(long target) {
+     // TODO: Validate the seek point.
+     synchronized(lock) {
+         clearBuffers();
+         position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
+         waitForBufferFill();
+         buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
+
+         if(validatingInputStream == null)
+             return;
+
+         // Keep the validating stream in step with this one.
+         try {
+             validatingInputStream.seek(target);
+         }
+         catch(IOException ex) {
+             throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+         }
+     }
+ }
+
+ /**
+  * Resets the reader position and discards all buffered data along with its block bookkeeping.
+  */
+ private void clearBuffers() {
+     position.reset();
+
+     blockOffsets.clear();
+     blockPositions.clear();
+
+     // Outside of a lock the buffer must always be prepared for reading; limit 0 means nothing to read.
+     buffer.clear();
+     buffer.limit(0);
+ }
+
+ /**
+  * Tests whether the next block to load lies at or beyond the end of the file.
+  * @return true if a position is set and its block address is at least the stream length.
+  */
+ public boolean eof() {
+     synchronized(lock) {
+         // TODO: Handle multiple empty BGZF blocks at end of the file.
+         if(position == null)
+             return false;
+         return position.getBlockAddress() >= length;
+     }
+ }
+
+ /**
+  * Currently a no-op: the requested setting is ignored.
+  * @param check whether CRC checking is requested; unused until this is implemented.
+  */
+ public void setCheckCrcs(final boolean check) {
+ // TODO: Implement
+ }
+
+ /**
+  * Submits a new access plan for the given dataset.
+  * If the plan starts before the current position, it is first advanced to the current block address.
+  * @param position The next seek point for BAM data in this reader.
+  */
+ public void submitAccessPlan(final SAMReaderPosition position) {
+     synchronized(lock) {
+         // Assume that the access plan is going to tell us to start where we are and move forward.
+         // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
+         if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
+             position.advancePosition(this.position.getBlockAddress());
+
+         // Publish the new position while still holding the lock: every other access to the field
+         // happens under this lock, so an unguarded write could be missed by the other threads.
+         this.position = position;
+     }
+ }
+
+ /**
+ * Discards consumed data from the front of the buffer and slides the unread data down,
+ * dropping the block bookkeeping entries that are removed along the way.
+ * On return the buffer is again set up for reading, with unread data in [position,limit).
+ * Callers in this class invoke it while holding the lock.
+ */
+ private void compactBuffer() {
+ // Compact buffer to maximize storage space.
+ int bytesToRemove = 0;
+
+ // Look ahead to see if we can compact away the first block in the series.
+ // NOTE(review): this drops the head block while the read position is still BEFORE the second block's
+ // offset, and accumulates the removed offsets rather than the second block's offset -- confirm intended.
+ while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
+ bytesToRemove += blockOffsets.remove();
+ blockPositions.remove();
+ }
+
+ // If we end up with an empty block at the end of the series, compact this as well.
+ if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
+ bytesToRemove += buffer.position();
+ blockOffsets.remove();
+ blockPositions.remove();
+ }
+
+ // Work out where the unread data will sit once bytesToRemove bytes are dropped from the front.
+ int finalBufferStart = buffer.position() - bytesToRemove;
+ int finalBufferSize = buffer.remaining();
+
+ // compact() copies [position,limit) down to index 0, so start the copy at the first byte to keep.
+ buffer.position(bytesToRemove);
+ buffer.compact();
+
+ // compact() leaves the buffer set up for writing; restore the read window over the surviving data.
+ buffer.position(finalBufferStart);
+ buffer.limit(finalBufferStart+finalBufferSize);
+ }
+
+ /**
+ * Push contents of incomingBuffer into the end of this buffer.
+ * MUST be called from a thread that is NOT the reader thread.
+ * Any exception raised while copying is recorded via reportException rather than propagated.
+ * @param incomingBuffer The data being pushed into this input stream.
+ * @param position target position for the data; becomes this stream's current position.
+ * @param filePosition address passed to position.advancePosition once the data is accepted.
+ */
+ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
+ synchronized(lock) {
+ try {
+ compactBuffer();
+ // Open up the buffer for writing by exposing all capacity beyond the current position.
+ buffer.limit(buffer.capacity());
+
+ // Remember the address of the block being delivered before the position is advanced below.
+ long lastReadPosition = position.getBlockAddress();
+
+ byte[] validBytes = null;
+ if(validatingInputStream != null) {
+ validBytes = new byte[incomingBuffer.remaining()];
+
+ // Copy the incoming bytes out for comparison, then rewind incomingBuffer so it can still be consumed.
+ byte[] currentBytes = new byte[incomingBuffer.remaining()];
+ int pos = incomingBuffer.position();
+ int lim = incomingBuffer.limit();
+ incomingBuffer.get(currentBytes);
+
+ incomingBuffer.limit(lim);
+ incomingBuffer.position(pos);
+
+ // Read the same block through the validating stream, restoring its file pointer afterwards.
+ long currentFilePointer = validatingInputStream.getFilePointer();
+ validatingInputStream.seek(lastReadPosition << 16);
+ validatingInputStream.read(validBytes);
+ validatingInputStream.seek(currentFilePointer);
+
+ if(!Arrays.equals(validBytes,currentBytes))
+ throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
+ }
+
+ this.position = position;
+ position.advancePosition(filePosition);
+
+ // NOTE(review): this waits once, not in a loop, and the buffer is not re-compacted after waking --
+ // confirm the remaining space is guaranteed to be sufficient by the time execution continues.
+ if(buffer.remaining() < incomingBuffer.remaining()) {
+ //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
+ lock.wait();
+ //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
+ }
+
+ // Queue list of block offsets / block positions.
+ blockOffsets.add(buffer.position());
+ blockPositions.add(lastReadPosition);
+
+ // NOTE(review): put() writes at the buffer's current position, so this presumably relies on the
+ // buffer holding no unread data when a block arrives -- verify against the callers.
+ buffer.put(incomingBuffer);
+
+ // Set up the buffer for reading.
+ buffer.flip();
+ bufferFilled = true;
+
+ // Wake a thread waiting on the lock for data.
+ lock.notify();
+ }
+ catch(Exception ex) {
+ // Record the failure for the reader; reportException also notifies, so the notify below repeats it.
+ reportException(ex);
+ lock.notify();
+ }
+ }
+ }
+
+ /**
+  * Records a failure from an asynchronous block load and wakes a thread waiting on this stream's lock.
+  * The recorded failure is rethrown, wrapped, by the next error check on the reading side.
+  * @param t the failure to record.
+  */
+ void reportException(Throwable t) {
+     synchronized(lock) {
+         error = t;
+         lock.notify();
+     }
+ }
+
+ private void checkForErrors() {
+ synchronized(lock) {
+ if(error != null) {
+ ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
+ toThrow.setStackTrace(error.getStackTrace());
+ throw toThrow;
+ }
+ }
+ }
+
+ /**
+  * Reads the next byte of data from the input stream.
+  * @return Next byte of data, from 0->255, as an int, or -1 if no byte could be read.
+  */
+ @Override
+ public int read() {
+     byte[] singleByte = new byte[1];
+     // Nothing read means end of stream; returning the untouched array slot would look like a real 0 byte.
+     if(read(singleByte) < 1)
+         return -1;
+     // Mask off sign extension so bytes >= 0x80 are not returned as negative values (0xFF would read as EOF).
+     return singleByte[0] & 0xFF;
+ }
+
+ /**
+  * Reads as many bytes as possible into the whole of the given array.
+  * @param bytes byte array to be filled.
+  * @return The number of bytes actually read.
+  */
+ @Override
+ public int read(byte[] bytes) {
+     final int requested = bytes.length;
+     return read(bytes,0,requested);
+ }
+
+ /**
+  * Reads up to length bytes into bytes starting at offset, requesting further blocks as the buffer drains.
+  * @param bytes destination array.
+  * @param offset index in bytes at which to store the first byte read.
+  * @param length maximum number of bytes to read.
+  * @return The number of bytes actually read; less than length if no more data could be loaded.
+  * @throws ReviewedStingException if an asynchronous load failed or, in validation mode, the data read
+  *         differs from what the validating stream returns.
+  */
+ @Override
+ public int read(byte[] bytes, final int offset, final int length) {
+     int remaining = length;
+     synchronized(lock) {
+         while(remaining > 0) {
+             // Check for error conditions during last read.
+             checkForErrors();
+
+             // If completely out of space, queue up another buffer fill.
+             waitForBufferFill();
+
+             // Couldn't manage to load any data at all; abort and return what's available.
+             if(buffer.remaining() == 0)
+                 break;
+
+             int numBytesToCopy = Math.min(buffer.remaining(),remaining);
+             buffer.get(bytes,length-remaining+offset,numBytesToCopy);
+             remaining -= numBytesToCopy;
+
+             // TODO: Assert that we don't copy across a block boundary
+         }
+
+         // Notify any waiting threads that some of the contents of the buffer were removed.
+         if(length-remaining > 0)
+             lock.notify();
+     }
+
+     final int bytesRead = length - remaining;
+
+     if(validatingInputStream != null) {
+         // validBytes is its own zero-based array: fill it from index 0 and map index i onto bytes[offset+i].
+         // Using the caller's offset here would run off the end of validBytes whenever offset > 0.
+         byte[] validBytes = new byte[length];
+         try {
+             validatingInputStream.read(validBytes,0,length);
+             // Only the bytes actually delivered are comparable; the tail of the request was never written.
+             for(int i = 0; i < bytesRead; i++) {
+                 if(bytes[offset+i] != validBytes[i]) {
+                     System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
+                     throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,offset+i));
+                 }
+             }
+         }
+         catch(IOException ex) {
+             throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+         }
+     }
+
+     return bytesRead;
+ }
+
+ /**
+  * Closes the validating input stream, if validation mode is active.  Nothing else is released here.
+  */
+ public void close() {
+     if(validatingInputStream == null)
+         return;
+
+     try {
+         validatingInputStream.close();
+     }
+     catch(IOException ex) {
+         throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+     }
+ }
+
+ /**
+  * Identifies where this stream's data comes from.
+  * @return the SAM file path reported by the reader backing this stream.
+  */
+ public String getSource() {
+     final String samFilePath = reader.getSamFilePath();
+     return samFilePath;
+ }
+
+ /**
+  * If the buffer is empty and the stream is not at EOF, queues a load of the next block and blocks
+  * until a loader thread delivers it or reports a failure.  Does nothing if data is already buffered.
+  * @throws RuntimeEOFException if the load completed but delivered no data.
+  * @throws ReviewedStingException if the calling thread is interrupted while waiting.
+  */
+ private void waitForBufferFill() {
+     synchronized(lock) {
+         bufferFilled = false;
+         if(buffer.remaining() == 0 && !eof()) {
+             dispatcher.queueBlockLoad(position);
+             try {
+                 // Always wait at least once, then keep waiting across spurious wakeups until the loader
+                 // has either delivered a block (bufferFilled) or reported a failure (error).
+                 do {
+                     lock.wait();
+                 }
+                 while(!bufferFilled && error == null);
+             }
+             catch(InterruptedException ex) {
+                 // Restore the interrupt status so code further up the stack can still observe it.
+                 Thread.currentThread().interrupt();
+                 throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
+             }
+
+             if(bufferFilled && buffer.remaining() == 0)
+                 throw new RuntimeEOFException("No more data left in InputStream");
+         }
+     }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
new file mode 100644
index 000000000..ab4299802
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import org.broad.tribble.util.BlockCompressedStreamConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.util.zip.DataFormatException;
+import java.util.zip.Inflater;
+
+/**
+ * An engine for loading blocks: repeatedly claims a work request from the dispatcher, reads the
+ * BGZF block at the requested address, optionally inflates it, and pushes it into the requesting stream.
+ */
+class BlockLoader implements Runnable {
+    /**
+     * Coordinates the input queue.
+     */
+    private final BGZFBlockLoadingDispatcher dispatcher;
+
+    /**
+     * A cache from which to retrieve open file handles.
+     */
+    private final FileHandleCache fileHandleCache;
+
+    /**
+     * Whether asynchronous decompression should happen.
+     */
+    private final boolean decompress;
+
+    /**
+     * A direct input buffer for incoming data from disk.
+     */
+    private final ByteBuffer inputBuffer;
+
+    /**
+     * Creates a loader with its own direct input buffer.
+     * @param dispatcher source of work requests.
+     * @param fileHandleCache cache from which file handles are claimed and to which they are released.
+     * @param decompress if true, blocks are inflated before being handed to the input stream.
+     */
+    public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
+        this.dispatcher = dispatcher;
+        this.fileHandleCache = fileHandleCache;
+        this.decompress = decompress;
+
+        this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+        inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
+    }
+
+    /**
+     * Services work requests forever.  Any failure is reported to the requesting input stream when one
+     * is known; otherwise it is dropped and the loop continues.
+     */
+    public void run() {
+        for(;;) {
+            SAMReaderPosition readerPosition = null;
+            try {
+                readerPosition = dispatcher.claimNextWorkRequest();
+
+                final ByteBuffer compressedBlock;
+                final long nextBlockAddress;
+                final FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
+                try {
+                    compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
+                    nextBlockAddress = position(inputStream);
+                }
+                finally {
+                    // Always hand the handle back, even on a failed read, so the cache's count of
+                    // outstanding handles cannot leak and starve later claims.
+                    fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
+                }
+
+                ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
+
+                BlockInputStream bamInputStream = readerPosition.getInputStream();
+                bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
+            }
+            catch(Throwable error) {
+                if(readerPosition != null && readerPosition.getInputStream() != null)
+                    readerPosition.getInputStream().reportException(error);
+            }
+        }
+    }
+
+    /**
+     * Reads one complete BGZF block starting at blockAddress into the shared input buffer, skipping
+     * past blocks whose uncompressed size is zero unless the end of the file has been reached.
+     * @param inputStream open handle on the BAM file.
+     * @param blockAddress file offset at which the block starts.
+     * @return the shared input buffer, flipped so the whole block is readable from position 0.
+     * @throws IOException if the underlying channel fails.
+     * @throws ReviewedStingException if the header or block is incomplete, or the header is not BGZF.
+     */
+    private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
+        FileChannel channel = inputStream.getChannel();
+
+        // Read the block header
+        channel.position(blockAddress);
+
+        int uncompressedDataSize = 0;
+        int bufferSize = 0;
+
+        do {
+            inputBuffer.clear();
+            inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+            readFully(channel,inputBuffer);
+
+            inputBuffer.flip();
+            if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
+                throw new ReviewedStingException("BUG: unable to read a complete block header.");
+
+            // Verify that the file was read at a valid point.
+            if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
+                    unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
+                    unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
+                    unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
+                    unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
+                    unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
+                throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
+            }
+
+            // The header stores the total block size minus one.
+            bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;
+
+            // Adjust buffer limits and finish reading the rest of the block after the header.
+            inputBuffer.limit(bufferSize);
+            inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+            readFully(channel,inputBuffer);
+            // A short read would leave stale bytes where the footer belongs; fail instead of using them.
+            if(inputBuffer.hasRemaining())
+                throw new ReviewedStingException("Unable to read a complete BGZF block; file may be truncated");
+
+            // Check the uncompressed length.  If 0 and not at EOF, we'll want to check the next block.
+            uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
+        }
+        while(uncompressedDataSize == 0 && channel.position() < channel.size());
+
+        // Prepare the buffer for reading.
+        inputBuffer.flip();
+
+        return inputBuffer;
+    }
+
+    /**
+     * Reads from the channel until the target buffer is full or the channel reports end of stream.
+     * @param channel channel to read from, at its current position.
+     * @param target buffer to fill up to its limit.
+     * @throws IOException if the underlying read fails.
+     */
+    private void readFully(final FileChannel channel, final ByteBuffer target) throws IOException {
+        while(target.hasRemaining()) {
+            if(channel.read(target) < 0)
+                break;
+        }
+    }
+
+    /**
+     * Inflates the compressed payload of a BGZF block.
+     * @param bgzfBlock a complete BGZF block, readable from its current position to its limit.
+     * @return a buffer wrapping the uncompressed contents.
+     * @throws DataFormatException if the compressed data is malformed.
+     * @throws ReviewedStingException if the inflated size differs from the size recorded in the block footer.
+     */
+    private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
+        final int compressedBufferSize = bgzfBlock.remaining();
+
+        // Determine the uncompressed buffer size, stored in the last four bytes of the block.
+        bgzfBlock.position(bgzfBlock.limit()-4);
+        int uncompressedBufferSize = bgzfBlock.getInt();
+        byte[] uncompressedContent = new byte[uncompressedBufferSize];
+
+        // Bound the CDATA section of the buffer.
+        bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
+        bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+        byte[] compressedContent = new byte[bgzfBlock.remaining()];
+        ByteBuffer.wrap(compressedContent).put(bgzfBlock);
+
+        // Decompress the buffer.  'true' selects raw deflate data with no zlib wrapper.
+        final Inflater inflater = new Inflater(true);
+        try {
+            inflater.setInput(compressedContent);
+            int bytesUncompressed = inflater.inflate(uncompressedContent);
+            if(bytesUncompressed != uncompressedBufferSize)
+                throw new ReviewedStingException("Error decompressing block");
+        }
+        finally {
+            // Release the inflater's native resources promptly instead of waiting for finalization.
+            inflater.end();
+        }
+
+        return ByteBuffer.wrap(uncompressedContent);
+    }
+
+    /**
+     * Reports the current position of the stream's underlying channel.
+     * @param inputStream stream to query.
+     * @return the channel's current file position.
+     * @throws IOException if the position cannot be determined.
+     */
+    private long position(final FileInputStream inputStream) throws IOException {
+        return inputStream.getChannel().position();
+    }
+
+    /**
+     * Reads the byte at an absolute index as an unsigned value.
+     * @param buffer buffer to read from; its position is not changed.
+     * @param position absolute index of the byte.
+     * @return the byte's value in the range 0..255.
+     */
+    private int unpackUByte8(final ByteBuffer buffer,final int position) {
+        return buffer.get(position) & 0xFF;
+    }
+
+    /**
+     * Reads the two bytes at an absolute index as an unsigned 16-bit value, in the buffer's byte order.
+     * @param buffer buffer to read from; its position is not changed.
+     * @param position absolute index of the first byte.
+     * @return the value in the range 0..65535.
+     */
+    private int unpackUInt16(final ByteBuffer buffer,final int position) {
+        // Read into a two-byte short, then mask into an int so the high bit is not treated as a sign.
+        return buffer.getShort(position) & 0xFFFF;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java
new file mode 100644
index 000000000..29de6eb37
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+
+/**
+ * Caches open file handles so that block loads can reuse them instead of reopening the file.
+ * Idle handles are held in insertion order; once more than cacheSize are held, the eldest is closed.
+ * TODO: Generalize to support arbitrary file handle caches.
+ */
+public class FileHandleCache {
+    /**
+     * The underlying data structure storing file handles.
+     */
+    private final FileHandleStorage fileHandleStorage;
+
+    /**
+     * How many file handles should be kept open at once.
+     */
+    private final int cacheSize;
+
+    /**
+     * A uniquifier: assign a unique ID to every instance of a file handle.
+     * Maps each reader to the most recent uniquifier allocated for it.
+     */
+    private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();
+
+    /**
+     * A shared lock, private so that outside users cannot notify it.
+     */
+    private final Object lock = new Object();
+
+    /**
+     * Indicates how many file handles are outstanding at this point.
+     */
+    private int numOutstandingFileHandles = 0;
+
+    /**
+     * Create a new file handle cache of the given cache size.
+     * @param cacheSize how many readers to hold open at once.
+     */
+    public FileHandleCache(final int cacheSize) {
+        this.cacheSize = cacheSize;
+        fileHandleStorage = new FileHandleStorage();
+    }
+
+    /**
+     * Retrieves or opens a file handle for the given reader ID.  If no cached handle exists and the
+     * number of outstanding handles has reached the cache size, blocks until a handle is released.
+     * @param key The reader whose file should be opened.
+     * @return A file input stream from the cache, if available, or otherwise newly opened.
+     * @throws ReviewedStingException if interrupted while waiting for a handle.
+     */
+    public FileInputStream claimFileInputStream(final SAMReaderID key) {
+        synchronized(lock) {
+            FileInputStream inputStream = findExistingEntry(key);
+            try {
+                // If the cache is maxed out, wait for another file handle to emerge.  Re-test after every
+                // wakeup: it may be spurious, and a handle for this key may have been returned meanwhile.
+                while(inputStream == null && numOutstandingFileHandles >= cacheSize) {
+                    lock.wait();
+                    inputStream = findExistingEntry(key);
+                }
+            }
+            catch(InterruptedException ex) {
+                throw new ReviewedStingException("Interrupted while waiting for a file handle",ex);
+            }
+            if(inputStream == null)
+                inputStream = openInputStream(key);
+            numOutstandingFileHandles++;
+
+            return inputStream;
+        }
+    }
+
+    /**
+     * Releases the current reader and returns it to the cache.
+     * @param key The reader.
+     * @param inputStream The stream being used.
+     */
+    public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
+        synchronized(lock) {
+            numOutstandingFileHandles--;
+            UniqueKey newID = allocateKey(key);
+            fileHandleStorage.put(newID,inputStream);
+            // Let any listeners know that another file handle has become available.
+            lock.notify();
+        }
+    }
+
+    /**
+     * Finds an existing entry in the storage mechanism and removes it from storage.
+     * @param key Reader.
+     * @return a cached stream, if available.  Otherwise, null.
+     */
+    private FileInputStream findExistingEntry(final SAMReaderID key) {
+        final int existingHandles = getMostRecentUniquifier(key);
+
+        // See if any of the keys currently exist in the repository.
+        for(int i = 0; i <= existingHandles; i++) {
+            final FileInputStream cached = fileHandleStorage.remove(new UniqueKey(key,i));
+            if(cached != null)
+                return cached;
+        }
+
+        return null;
+    }
+
+    /**
+     * Gets the most recent uniquifier used for the given reader.
+     * @param reader Reader for which to determine uniqueness.
+     * @return the most recent uniquifier, or -1 if none has been recorded for this reader.
+     */
+    private int getMostRecentUniquifier(final SAMReaderID reader) {
+        final Integer uniquifier = keyCounter.get(reader);
+        return uniquifier != null ? uniquifier : -1;
+    }
+
+    /**
+     * Allocates the next uniquifier for the given reader and records it as the most recent.
+     * @param reader Reader for which to allocate a key.
+     * @return a key combining the reader with the newly allocated uniquifier.
+     */
+    private UniqueKey allocateKey(final SAMReaderID reader) {
+        int uniquifier = getMostRecentUniquifier(reader)+1;
+        keyCounter.put(reader,uniquifier);
+        return new UniqueKey(reader,uniquifier);
+    }
+
+    /**
+     * Opens a new input stream on the reader's SAM file.
+     * @param reader Reader whose file should be opened.
+     * @return the newly opened stream.
+     * @throws StingException if the file cannot be opened.
+     */
+    private FileInputStream openInputStream(final SAMReaderID reader) {
+        try {
+            return new FileInputStream(reader.getSamFilePath());
+        }
+        catch(IOException ex) {
+            // Attach the underlying failure so the reason the open failed is not lost.
+            final StingException failure = new StingException("Unable to open input file " + reader.getSamFilePath());
+            failure.initCause(ex);
+            throw failure;
+        }
+    }
+
+    /**
+     * Closes the given input stream.
+     * @param inputStream stream to close.
+     * @throws StingException if the stream cannot be closed.
+     */
+    private void closeInputStream(final FileInputStream inputStream) {
+        try {
+            inputStream.close();
+        }
+        catch(IOException ex) {
+            final StingException failure = new StingException("Unable to close input file");
+            failure.initCause(ex);
+            throw failure;
+        }
+    }
+
+    /**
+     * Actually contains the file handles, purging them as they get too old.
+     */
+    private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
+        /**
+         * Remove the oldest entry
+         * @param entry Entry to consider removing.
+         * @return True if the cache size has been exceeded.  False otherwise.
+         */
+        @Override
+        protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
+            synchronized (lock) {
+                if(size() > cacheSize) {
+                    // NOTE(review): this decrements the reader's most-recent uniquifier whichever of its
+                    // handles is evicted; confirm higher-numbered cached handles stay reachable afterwards.
+                    final SAMReaderID reader = entry.getKey().key;
+                    keyCounter.put(reader,keyCounter.get(reader)-1);
+                    closeInputStream(entry.getValue());
+
+                    return true;
+                }
+            }
+            return false;
+        }
+    }
+
+    /**
+     * Uniquifies a key by adding a numerical uniquifier.
+     */
+    private static class UniqueKey {
+        /**
+         * The file handle's key.
+         */
+        private final SAMReaderID key;
+
+        /**
+         * A uniquifier, so that multiple of the same reader can exist in the cache.
+         */
+        private final int uniqueID;
+
+        public UniqueKey(final SAMReaderID reader, final int uniqueID) {
+            this.key = reader;
+            this.uniqueID = uniqueID;
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            if(!(other instanceof UniqueKey))
+                return false;
+            UniqueKey otherUniqueKey = (UniqueKey)other;
+            return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
+        }
+
+        @Override
+        public int hashCode() {
+            // Fold in the uniquifier so keys for the same reader do not all share one hash bucket.
+            return 31*key.hashCode() + uniqueID;
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
index e4141f61c..df7827250 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
@@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
@@ -40,28 +41,25 @@ import java.util.*;
*/
public class FilePointer {
protected final SortedMap fileSpans = new TreeMap();
- protected final BAMOverlap overlap;
- protected final List locations;
+ protected final List locations = new ArrayList();
/**
* Does this file pointer point into an unmapped region?
*/
protected final boolean isRegionUnmapped;
- public FilePointer() {
- this((BAMOverlap)null);
- }
-
- public FilePointer(final GenomeLoc location) {
- this.overlap = null;
- this.locations = Collections.singletonList(location);
- this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
- }
-
- public FilePointer(final BAMOverlap overlap) {
- this.overlap = overlap;
- this.locations = new ArrayList();
- this.isRegionUnmapped = false;
+ public FilePointer(final GenomeLoc... locations) {
+ this.locations.addAll(Arrays.asList(locations));
+ boolean foundMapped = false, foundUnmapped = false;
+ for(GenomeLoc location: locations) {
+ if(GenomeLoc.isUnmapped(location))
+ foundUnmapped = true;
+ else
+ foundMapped = true;
+ }
+ if(foundMapped && foundUnmapped)
+ throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
+ this.isRegionUnmapped = foundUnmapped;
}
/**
@@ -217,4 +215,20 @@ public class FilePointer {
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
combined.addFileSpans(initialElement.getKey(),fileSpan);
}
+
+ /** Multi-line, human-readable summary of this pointer's locations and its per-reader file spans. */
+ @Override
+ public String toString() {
+ // StringBuilder.append does not expand "%n"; only String.format turns it into a real line separator.
+ StringBuilder builder = new StringBuilder(String.format("FilePointer:%n"));
+ builder.append("\tlocations = {").append(Utils.join(";",locations));
+ builder.append(String.format("}%n\tregions = %n"));
+ for(Map.Entry entry: fileSpans.entrySet()) {
+ builder.append(entry.getKey());
+ builder.append("= {");
+ builder.append(entry.getValue());
+ builder.append("}");
+ }
+ return builder.toString();
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
index 4ddf28dce..f78693c27 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
@@ -25,419 +25,58 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
-import net.sf.samtools.AbstractBAMFileIndex;
-import net.sf.samtools.Bin;
-import net.sf.samtools.BrowseableBAMIndex;
-import net.sf.samtools.SAMSequenceRecord;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.utils.GenomeLoc;
+import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-import org.broadinstitute.sting.utils.collections.Pair;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import java.util.*;
+import java.util.Iterator;
/**
- * Shard intervals based on position within the BAM file.
- *
- * @author mhanna
- * @version 0.1
+ * Handles the process of aggregating BAM intervals into individual shards.
+ * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
*/
-public class IntervalSharder {
- private static Logger logger = Logger.getLogger(IntervalSharder.class);
+public class IntervalSharder implements Iterator {
+ /**
+ * The iterator actually laying out the data for BAM scheduling.
+ */
+ private final PeekableIterator wrappedIterator;
- public static Iterator shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- return new IntervalSharder.FilePointerIterator(dataSource,loci);
+ /**
+ * The parser, for interval manipulation.
+ */
+ private final GenomeLocParser parser;
+ /** Builds a sharder over the schedule returned by BAMScheduler.createOverAllReads for the given data source and parser. */
+ public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
+ return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
+ }
+ /** Builds a sharder over the schedule returned by BAMScheduler.createOverMappedReads for the given sequence dictionary. */
+ public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
+ return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
+ }
+ /** Builds a sharder over the schedule returned by BAMScheduler.createOverIntervals, reusing the parser attached to loci. */
+ public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+ return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
+ }
+ /** Wraps the scheduler in a PeekableIterator so next() can inspect the upcoming file pointer before consuming it. */
+ private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
+ wrappedIterator = new PeekableIterator(scheduler);
+ this.parser = parser;
+ }
+ /** @return whatever the wrapped scheduler iterator reports for hasNext(). */
+ public boolean hasNext() {
+ return wrappedIterator.hasNext();
}
/**
- * A lazy-loading iterator over file pointers.
+ * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
+ * @return The next file pointer to process.
*/
- private static class FilePointerIterator implements Iterator {
- final SAMDataSource dataSource;
- final GenomeLocSortedSet loci;
- final PeekableIterator locusIterator;
- final Queue cachedFilePointers = new LinkedList();
-
- public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- this.dataSource = dataSource;
- this.loci = loci;
- locusIterator = new PeekableIterator(loci.iterator());
- advance();
- }
-
- public boolean hasNext() {
- return !cachedFilePointers.isEmpty();
- }
-
- public FilePointer next() {
- if(!hasNext())
- throw new NoSuchElementException("FilePointerIterator iteration is complete");
- FilePointer filePointer = cachedFilePointers.remove();
- if(cachedFilePointers.isEmpty())
- advance();
- return filePointer;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
- }
-
- private void advance() {
- GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
- String contig = null;
-
- // If the next section of the BAM to be processed is unmapped, handle this region separately.
- while(locusIterator.hasNext() && nextBatch.isEmpty()) {
- contig = null;
- while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
- GenomeLoc nextLocus = locusIterator.next();
- contig = nextLocus.getContig();
- nextBatch.add(nextLocus);
- }
- }
-
- if(nextBatch.size() > 0) {
- cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
- }
- }
+ public FilePointer next() {
+ // Fail fast per the Iterator contract instead of depending on how the wrapped iterator behaves when exhausted.
+ if(!wrappedIterator.hasNext())
+ throw new java.util.NoSuchElementException("No file pointers remain in the interval sharder.");
+ FilePointer current = wrappedIterator.next();
+ // Absorb the following pointers while they share this pointer's mapped/unmapped state and minus() reports 0.
+ while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
+ current = current.combine(parser,wrappedIterator.next());
+ return current;
}
- /**
- * Merge / split intervals based on an awareness of the structure of the BAM file.
- * @param dataSource
- * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
- * @param loci
- * @return
- */
- private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
- // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
- if(contig == null) {
- FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
- for(SAMReaderID id: dataSource.getReaderIDs())
- filePointer.addFileSpans(id,null);
- return Collections.singletonList(filePointer);
- }
-
- // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
- List filePointers = new ArrayList();
- FilePointer lastFilePointer = null;
- BAMOverlap lastBAMOverlap = null;
-
- Map readerToIndexMap = new HashMap();
- IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
- for(SAMReaderID id: dataSource.getReaderIDs()) {
- final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
- // If this contig can't be found in the reference, skip over it.
- if(referenceSequence == null && contig != null)
- continue;
- final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
- binMerger.addReader(id,
- index,
- referenceSequence.getSequenceIndex(),
- index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
- // Cache the reader for later data lookup.
- readerToIndexMap.put(id,index);
- }
-
- PeekableIterator binIterator = new PeekableIterator(binMerger);
-
- for(GenomeLoc location: loci) {
- if(!location.getContig().equals(contig))
- throw new ReviewedStingException("Location outside bounds of contig");
-
- if(!binIterator.hasNext())
- break;
-
- int locationStart = location.getStart();
- final int locationStop = location.getStop();
-
- // Advance to first bin.
- while(binIterator.peek().stop < locationStart)
- binIterator.next();
-
- // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
- // sure the extending bin is not pruned from the list.
- List bamOverlaps = new ArrayList();
- while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
- bamOverlaps.add(binIterator.next());
- if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
- bamOverlaps.add(binIterator.peek());
-
- // Bins found; try to match bins with locations.
- Iterator bamOverlapIterator = bamOverlaps.iterator();
-
- while(locationStop >= locationStart) {
- int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
- int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
-
- while(binStop < locationStart && bamOverlapIterator.hasNext()) {
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
- filePointers.add(lastFilePointer);
-
- lastBAMOverlap = bamOverlapIterator.next();
- lastFilePointer = new FilePointer(lastBAMOverlap);
- binStart = lastFilePointer.overlap.start;
- binStop = lastFilePointer.overlap.stop;
- }
-
- if(locationStart < binStart) {
- // The region starts before the first bin in the sequence. Add the region occurring before the sequence.
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
- filePointers.add(lastFilePointer);
- lastFilePointer = null;
- lastBAMOverlap = null;
- }
-
- final int regionStop = Math.min(locationStop,binStart-1);
-
- GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
- lastFilePointer = new FilePointer(subset);
-
- locationStart = regionStop + 1;
- }
- else if(locationStart > binStop) {
- // The region starts after the last bin in the sequence. Add the region occurring after the sequence.
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
- filePointers.add(lastFilePointer);
- lastFilePointer = null;
- lastBAMOverlap = null;
- }
-
- GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
- filePointers.add(new FilePointer(subset));
-
- locationStart = locationStop + 1;
- }
- else {
- if(lastFilePointer == null)
- throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
-
- // The start of the region overlaps the bin. Add the overlapping subset.
- final int regionStop = Math.min(locationStop,binStop);
- lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
- locationStart = regionStop + 1;
- }
- }
- }
-
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
- filePointers.add(lastFilePointer);
-
- // Lookup the locations for every file pointer in the index.
- for(SAMReaderID id: readerToIndexMap.keySet()) {
- BrowseableBAMIndex index = readerToIndexMap.get(id);
- for(FilePointer filePointer: filePointers)
- filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
- }
-
- return filePointers;
- }
-
- private static class BinMergingIterator implements Iterator {
- private PriorityQueue binQueue = new PriorityQueue();
- private Queue pendingOverlaps = new LinkedList();
-
- public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator bins) {
- binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
- }
-
- public boolean hasNext() {
- return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
- }
-
- public BAMOverlap next() {
- if(!hasNext())
- throw new NoSuchElementException("No elements left in merging iterator");
- if(pendingOverlaps.isEmpty())
- advance();
- return pendingOverlaps.remove();
- }
-
- public void advance() {
- List bins = new ArrayList();
- int boundsStart, boundsStop;
-
- // Prime the pump
- if(binQueue.isEmpty())
- return;
- bins.add(getNextBin());
- boundsStart = bins.get(0).getStart();
- boundsStop = bins.get(0).getStop();
-
- // Accumulate all the bins that overlap the current bin, in sorted order.
- while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
- ReaderBin bin = getNextBin();
- bins.add(bin);
- boundsStart = Math.min(boundsStart,bin.getStart());
- boundsStop = Math.max(boundsStop,bin.getStop());
- }
-
- List> range = new ArrayList>();
- int start = bins.get(0).getStart();
- int stop = bins.get(0).getStop();
- while(start <= boundsStop) {
- // Find the next stopping point.
- for(ReaderBin bin: bins) {
- stop = Math.min(stop,bin.getStop());
- if(start < bin.getStart())
- stop = Math.min(stop,bin.getStart()-1);
- }
-
- range.add(new Pair(start,stop));
- // If the last entry added included the last element, stop.
- if(stop >= boundsStop)
- break;
-
- // Find the next start.
- start = stop + 1;
- for(ReaderBin bin: bins) {
- if(start >= bin.getStart() && start <= bin.getStop())
- break;
- else if(start < bin.getStart()) {
- start = bin.getStart();
- break;
- }
- }
- }
-
- // Add the next series of BAM overlaps to the window.
- for(Pair window: range) {
- BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
- for(ReaderBin bin: bins)
- bamOverlap.addBin(bin.id,bin.bin);
- pendingOverlaps.add(bamOverlap);
- }
- }
-
- public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
-
- private ReaderBin peekNextBin() {
- if(binQueue.isEmpty())
- throw new NoSuchElementException("No more bins are available");
- BinQueueState current = binQueue.peek();
- return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
- }
-
- private ReaderBin getNextBin() {
- if(binQueue.isEmpty())
- throw new NoSuchElementException("No more bins are available");
- BinQueueState current = binQueue.remove();
- ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
- if(current.hasNextBin())
- binQueue.add(current);
- return readerBin;
- }
-
- }
-
- /**
- * Filters out bins not at the lowest level in the tree.
- */
- private static class LowestLevelBinFilteringIterator implements Iterator {
- private BrowseableBAMIndex index;
- private Iterator wrappedIterator;
-
- private Bin nextBin;
-
- public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator iterator) {
- this.index = index;
- this.wrappedIterator = iterator;
- advance();
- }
-
- public boolean hasNext() {
- return nextBin != null;
- }
-
- public Bin next() {
- Bin bin = nextBin;
- advance();
- return bin;
- }
-
- public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
-
- private void advance() {
- nextBin = null;
- while(wrappedIterator.hasNext() && nextBin == null) {
- Bin bin = wrappedIterator.next();
- if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
- nextBin = bin;
- }
- }
- }
+ public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } // removal is unsupported: this always throws
}
-
-class BinQueueState implements Comparable {
- private final SAMReaderID id;
- private final BrowseableBAMIndex index;
- private final int referenceSequence;
- private final PeekableIterator bins;
-
- private int firstLocusInCurrentBin;
- private int lastLocusInCurrentBin;
-
- public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator bins) {
- this.id = id;
- this.index = index;
- this.referenceSequence = referenceSequence;
- this.bins = new PeekableIterator(bins);
- refreshLocusInBinCache();
- }
-
- public SAMReaderID getReaderID() {
- return id;
- }
-
- public BrowseableBAMIndex getIndex() {
- return index;
- }
-
- public int getReferenceSequence() {
- return referenceSequence;
- }
-
- public boolean hasNextBin() {
- return bins.hasNext();
- }
-
- public Bin peekNextBin() {
- return bins.peek();
- }
-
- public Bin nextBin() {
- Bin nextBin = bins.next();
- refreshLocusInBinCache();
- return nextBin;
- }
-
- public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
- if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
- if(!this.bins.hasNext()) return -1;
- if(!this.bins.hasNext()) return 1;
-
- // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
- if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
- other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
- throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
- }
-
- // Straight integer subtraction works here because lhsStart, rhsStart always positive.
- if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
- return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
-
- // Straight integer subtraction works here because lhsStop, rhsStop always positive.
- return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
- }
-
- private void refreshLocusInBinCache() {
- firstLocusInCurrentBin = -1;
- lastLocusInCurrentBin = -1;
- if(bins.hasNext()) {
- Bin bin = bins.peek();
- firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
- lastLocusInCurrentBin = index.getLastLocusInBin(bin);
- }
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
new file mode 100644
index 000000000..585b63457
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import java.util.Iterator;
+
+/**
+ * Batch granular file pointers into potentially larger shards.
+ */
+ public class LocusShardBalancer extends ShardBalancer {
+ /**
+ * Convert iterators of file pointers into balanced iterators of shards.
+ * @return An iterator over balanced shards; its next() throws NoSuchElementException once the file pointers run out.
+ */
+ public Iterator iterator() {
+ return new Iterator() {
+ public boolean hasNext() {
+ return filePointers.hasNext();
+ }
+ // Merges consecutive pointers while minus() reports 0 for the upcoming one, then wraps the result in a LocusShard.
+ public Shard next() {
+ if(!filePointers.hasNext()) throw new java.util.NoSuchElementException("No file pointers remain to balance into shards");
+ FilePointer current = filePointers.next();
+ while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
+ current = current.combine(parser,filePointers.next());
+ return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
+ }
+ public void remove() {
+ throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
+ }
+ };
+ }
+ }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java
deleted file mode 100755
index a5ca07853..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2010, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.picard.reference.IndexedFastaSequenceFile;
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMFileSpan;
-import net.sf.samtools.SAMSequenceRecord;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A sharding strategy for loci based on reading of the index.
- */
-public class LocusShardStrategy implements ShardStrategy {
- /**
- * The data source to use when performing this sharding.
- */
- private final SAMDataSource reads;
-
- /**
- * the parser for creating shards
- */
- private GenomeLocParser genomeLocParser;
-
- /**
- * An iterator through the available file pointers.
- */
- private final Iterator filePointerIterator;
-
- /**
- * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
- * @param reads Data source from which to load index data.
- * @param locations List of locations for which to load data.
- */
- public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
- this.reads = reads;
- this.genomeLocParser = genomeLocParser;
-
- if(!reads.isEmpty()) {
- GenomeLocSortedSet intervals;
- if(locations == null) {
- // If no locations were passed in, shard the entire BAM file.
- SAMFileHeader header = reads.getHeader();
- intervals = new GenomeLocSortedSet(genomeLocParser);
-
- for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
- // Check this sequence against the reference sequence dictionary.
- // TODO: Do a better job of merging reads + reference.
- SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
- if(refSequenceRecord != null) {
- final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
- intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
- }
- }
- }
- else
- intervals = locations;
-
- if(reads.isLowMemoryShardingEnabled()) {
- /*
- Iterator filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
- List filePointers = new ArrayList();
- while(filePointerIterator.hasNext())
- filePointers.add(filePointerIterator.next());
- this.filePointerIterator = filePointers.iterator();
- */
- this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
- }
- else
- this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
- }
- else {
- final int maxShardSize = 100000;
- List filePointers = new ArrayList();
- if(locations == null) {
- for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
- for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
- final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
- filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
- }
- }
- }
- else {
- for(GenomeLoc interval: locations) {
- while(interval.size() > maxShardSize) {
- filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
- interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
- }
- filePointers.add(new FilePointer(interval));
- }
- }
- filePointerIterator = filePointers.iterator();
- }
-
- }
-
- /**
- * returns true if there are additional shards
- *
- * @return false if we're done processing shards
- */
- public boolean hasNext() {
- return filePointerIterator.hasNext();
- }
-
- public long shardNumber = 0;
-
- /**
- * gets the next Shard
- *
- * @return the next shard
- */
- public LocusShard next() {
- FilePointer nextFilePointer = filePointerIterator.next();
- Map fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
-
- /*
- System.out.printf("Shard %d: interval = {",++shardNumber);
- for(GenomeLoc locus: nextFilePointer.locations)
- System.out.printf("%s;",locus);
- System.out.printf("}; ");
-
- if(fileSpansBounding == null)
- System.out.printf("no shard data%n");
- else {
- SortedMap sortedSpans = new TreeMap(fileSpansBounding);
- for(Map.Entry entry: sortedSpans.entrySet()) {
- System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue());
- }
- }
- */
-
- return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding);
- }
-
- /** we don't support the remove command */
- public void remove() {
- throw new UnsupportedOperationException("ShardStrategies don't support remove()");
- }
-
- /**
- * makes the IntervalShard iterable, i.e. usable in a for loop.
- *
- * @return
- */
- public Iterator iterator() {
- return this;
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
deleted file mode 100644
index bf5f33dc3..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.picard.util.PeekableIterator;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-
-import java.util.Iterator;
-
-/**
- * Handles the process of aggregating BAM intervals into individual shards.
- */
-public class LowMemoryIntervalSharder implements Iterator {
- /**
- * The iterator actually laying out the data for BAM scheduling.
- */
- private final PeekableIterator wrappedIterator;
-
- /**
- * The parser, for interval manipulation.
- */
- private final GenomeLocParser parser;
-
- public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci));
- parser = loci.getGenomeLocParser();
- }
-
- public boolean hasNext() {
- return wrappedIterator.hasNext();
- }
-
- /**
- * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
- * @return The next file pointer to process.
- */
- public FilePointer next() {
- FilePointer current = wrappedIterator.next();
- while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
- current = current.combine(parser,wrappedIterator.next());
- return current;
- }
-
- public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java
deleted file mode 100644
index 278eeb898..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-import java.util.List;
-
-/**
- * A single, monolithic shard bridging all available data.
- * @author mhanna
- * @version 0.1
- */
-public class MonolithicShard extends Shard {
- /**
- * Creates a new monolithic shard of the given type.
- * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
- * @param locs Intervals that this monolithic shard should process.
- */
- public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List locs) {
- super(parser, shardType, locs, readsDataSource, null, false);
- if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
- throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
- }
-
- /**
- * String representation of this shard.
- * @return "entire genome".
- */
- @Override
- public String toString() {
- return "entire genome";
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java
deleted file mode 100644
index 28b737f28..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * Create a giant shard representing all the data in the input BAM(s).
- *
- * @author mhanna
- * @version 0.1
- */
-public class MonolithicShardStrategy implements ShardStrategy {
- /**
- * The single shard associated with this sharding strategy.
- */
- private MonolithicShard shard;
-
- /**
- * Create a new shard strategy for shards of the given type.
- * @param shardType The shard type.
- */
- public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List region) {
- shard = new MonolithicShard(parser,readsDataSource,shardType,region);
- }
-
- /**
- * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator;
- * will only return another copy of the active iterator.
- * @return A copy of this.
- */
- public Iterator iterator() {
- return this;
- }
-
- /**
- * Returns true if the monolithic shard has not yet been consumed, or false otherwise.
- * @return True if shard has been consumed, false otherwise.
- */
- public boolean hasNext() {
- return shard != null;
- }
-
- /**
- * Returns the monolithic shard if it has not already been retrieved.
- * @return The monolithic shard.
- * @throws NoSuchElementException if no such data exists.
- */
- public Shard next() {
- if(shard == null)
- throw new NoSuchElementException("Monolithic shard has already been retrived.");
-
- Shard working = shard;
- shard = null;
- return working;
- }
-
- /**
- * Mandated by the interface, but is unsupported in this context. Will throw an exception always.
- */
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a shard strategy");
- }
-
- /**
- * Mandated by the interface, but is unsupported in this context. Will throw an exception always.
- * @param size adjust the next size to this
- */
- public void adjustNextShardSize( long size ) {
- throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
- }
-
-}
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
index 4d9c9092d..5f40c0ea5 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
@@ -35,10 +35,15 @@ import java.util.Map;
* @version 0.1
*/
public class ReadShard extends Shard {
+ /**
+ * What is the maximum number of reads which should go into a read shard.
+ */
+ public static final int MAX_READS = 10000;
+
/**
* The reads making up this shard.
*/
- private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS);
+ private final Collection reads = new ArrayList(MAX_READS);
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) {
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
@@ -66,7 +71,7 @@ public class ReadShard extends Shard {
* @return True if this shard's buffer is full (and the shard can buffer reads).
*/
public boolean isBufferFull() {
- return reads.size() > ReadShardStrategy.MAX_READS;
+ return reads.size() > ReadShard.MAX_READS;
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
new file mode 100644
index 000000000..fa8a7d454
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.SAMFileSpan;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
+/**
+ * Shard balancer for reads: turns each incoming file pointer into read shards filled by the data source.
+ */
+public class ReadShardBalancer extends ShardBalancer {
+ /**
+ * Convert iterators of file pointers into balanced iterators of shards. The first shard is prefetched eagerly.
+ * @return An iterator over balanced shards; remove() is unsupported.
+ */
+ public Iterator iterator() {
+ return new Iterator() {
+ /**
+ * The cached shard to be returned next, prefetched by advance(). Null once no further shard is available.
+ */
+ private Shard nextShard = null;
+
+ /**
+ * The file pointer currently being processed. Null when the file pointers are exhausted.
+ */
+ private FilePointer currentFilePointer;
+
+ /**
+ * Current position as reported by readsDataSource.getCurrentPosition(); refreshed at the end of advance().
+ */
+ private Map position = readsDataSource.getCurrentPosition();
+ // Instance initializer: pick up the first file pointer (if any) and prefetch the first shard.
+ {
+ if(filePointers.hasNext())
+ currentFilePointer = filePointers.next();
+ advance();
+ }
+
+ public boolean hasNext() {
+ return nextShard != null;
+ }
+
+ public Shard next() {
+ if(!hasNext())
+ throw new NoSuchElementException("No next read shard available");
+ Shard currentShard = nextShard;
+ advance();
+ return currentShard;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
+ }
+ // Prefetches the next shard with a non-empty buffer into nextShard, or leaves it null if none remains.
+ private void advance() {
+ Map shardPosition;
+ nextShard = null;
+ // Loop until a file pointer yields a shard with a non-empty buffer, or the file pointers run out.
+ Map selectedReaders = new HashMap();
+ while(selectedReaders.size() == 0 && currentFilePointer != null) {
+ shardPosition = currentFilePointer.fileSpans;
+ // Trim each reader's span to the part not before the recorded position; keep only non-empty remainders.
+ for(SAMReaderID id: shardPosition.keySet()) {
+ SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
+ if(!fileSpan.isEmpty())
+ selectedReaders.put(id,fileSpan);
+ }
+ // Build a shard from the surviving spans and hand it to the data source's fillShard().
+ if(selectedReaders.size() > 0) {
+ Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
+ readsDataSource.fillShard(shard);
+ // A shard whose buffer is non-empty becomes the prefetched result; stop searching.
+ if(!shard.isBufferEmpty()) {
+ nextShard = shard;
+ break;
+ }
+ }
+ // Nothing usable from this file pointer: reset the selection and move on to the next pointer.
+ selectedReaders.clear();
+ currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
+ }
+ // Re-read the data source's current position so the next call trims spans against it.
+ position = readsDataSource.getCurrentPosition();
+ }
+ };
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
deleted file mode 100755
index 5ea75dbb0..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2010, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.SAMFileSpan;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-
-import java.util.*;
-
-/**
- * The sharding strategy for reads using a simple counting mechanism. Each read shard
- * has a specific number of reads (default to 10K) which is configured in the constructor.
- * @author aaron
- * @version 1.0
- * @date Apr 14, 2009
- */
-public class ReadShardStrategy implements ShardStrategy {
- /**
- * What is the maximum number of reads which should go into a read shard.
- */
- protected static final int MAX_READS = 10000;
-
- /**
- * The data source used to shard.
- */
- private final SAMDataSource dataSource;
-
- /**
- * The intervals to be processed.
- */
- private final GenomeLocSortedSet locations;
-
- /**
- * The cached shard to be returned next. Prefetched in the peekable iterator style.
- */
- private Shard nextShard = null;
-
- /** our storage of the genomic locations they'd like to shard over */
- private final List filePointers = new ArrayList();
-
- /**
- * Iterator over the list of file pointers.
- */
- private final Iterator filePointerIterator;
-
- /**
- * The file pointer currently being processed.
- */
- private FilePointer currentFilePointer;
-
- /**
- * Ending position of the last shard in the file.
- */
- private Map position;
-
- /**
- * An indicator whether the strategy has sharded into the unmapped region.
- */
- private boolean isIntoUnmappedRegion = false;
-
- private final GenomeLocParser parser;
-
- /**
- * Create a new read shard strategy, loading read shards from the given BAM file.
- * @param dataSource Data source from which to load shards.
- * @param locations intervals to use for sharding.
- */
- public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
- this.dataSource = dataSource;
- this.parser = parser;
- this.position = this.dataSource.getCurrentPosition();
- this.locations = locations;
-
- if(locations != null)
- filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
- else
- filePointerIterator = filePointers.iterator();
-
- if(filePointerIterator.hasNext())
- currentFilePointer = filePointerIterator.next();
-
- advance();
- }
-
- /**
- * do we have another read shard?
- * @return True if any more data is available. False otherwise.
- */
- public boolean hasNext() {
- return nextShard != null;
- }
-
- /**
- * Retrieves the next shard, if available.
- * @return The next shard, if available.
- * @throws java.util.NoSuchElementException if no such shard is available.
- */
- public Shard next() {
- if(!hasNext())
- throw new NoSuchElementException("No next read shard available");
- Shard currentShard = nextShard;
- advance();
- return currentShard;
- }
-
- public void advance() {
- Map shardPosition = new HashMap();
- nextShard = null;
-
- if(locations != null) {
- Map selectedReaders = new HashMap();
- while(selectedReaders.size() == 0 && currentFilePointer != null) {
- shardPosition = currentFilePointer.fileSpans;
-
- for(SAMReaderID id: shardPosition.keySet()) {
- SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
- if(!fileSpan.isEmpty())
- selectedReaders.put(id,fileSpan);
- }
-
- if(selectedReaders.size() > 0) {
- Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
- dataSource.fillShard(shard);
-
- if(!shard.isBufferEmpty()) {
- nextShard = shard;
- break;
- }
- }
-
- selectedReaders.clear();
- currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
- }
- }
- else {
- // todo -- this nulling of intervals is a bit annoying since readwalkers without
- // todo -- any -L values need to be special cased throughout the code.
- Shard shard = new ReadShard(parser,dataSource,position,null,false);
- dataSource.fillShard(shard);
- nextShard = !shard.isBufferEmpty() ? shard : null;
- }
-
- this.position = dataSource.getCurrentPosition();
- }
-
- /**
- * @throws UnsupportedOperationException always.
- */
- public void remove() {
- throw new UnsupportedOperationException("Remove not supported");
- }
-
- /**
- * Convenience method for using ShardStrategy in an foreach loop.
- * @return A iterator over shards.
- */
- public Iterator iterator() {
- return this;
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java
deleted file mode 100644
index c76c1d8ae..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.Bin;
-import net.sf.samtools.BrowseableBAMIndex;
-
-/**
- * Created by IntelliJ IDEA.
- * User: mhanna
- * Date: Feb 2, 2011
- * Time: 4:36:40 PM
- * To change this template use File | Settings | File Templates.
- */
-class ReaderBin {
- public final SAMReaderID id;
- public final BrowseableBAMIndex index;
- public final int referenceSequence;
- public final Bin bin;
-
- public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
- this.id = id;
- this.index = index;
- this.referenceSequence = referenceSequence;
- this.bin = bin;
- }
-
- public int getStart() {
- return index.getFirstLocusInBin(bin);
- }
-
- public int getStop() {
- return index.getLastLocusInBin(bin);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index 8452aadfd..0a1eb0563 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -71,7 +73,7 @@ public class SAMDataSource {
/**
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
*/
- private final GenomeLocParser genomeLocParser;
+ protected final GenomeLocParser genomeLocParser;
/**
* Identifiers for the readers driving this data source.
@@ -91,13 +93,18 @@ public class SAMDataSource {
/**
* How far along is each reader?
*/
- private final Map readerPositions = new HashMap();
+ private final Map readerPositions = new HashMap();
/**
* The merged header.
*/
private final SAMFileHeader mergedHeader;
+ /**
+ * The constituent headers of the unmerged files.
+ */
+ private final Map headers = new HashMap();
+
/**
* The sort order of the BAM files. Files without a sort order tag are assumed to be
* in coordinate order.
@@ -131,17 +138,24 @@ public class SAMDataSource {
private final SAMResourcePool resourcePool;
/**
- * Whether to enable the new low-memory sharding mechanism.
+ * Asynchronously loads BGZF blocks.
*/
- private boolean enableLowMemorySharding = false;
+ private final BGZFBlockLoadingDispatcher dispatcher;
+
+ /**
+ * How are threads allocated.
+ */
+ private final ThreadAllocation threadAllocation;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
*/
- public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocParser) {
+ public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
this(
samFiles,
+ threadAllocation,
+ numFileHandles,
genomeLocParser,
false,
SAMFileReader.ValidationStringency.STRICT,
@@ -150,8 +164,7 @@ public class SAMDataSource {
new ValidationExclusion(),
new ArrayList(),
false,
- false,
- true);
+ false);
}
/**
@@ -159,6 +172,8 @@ public class SAMDataSource {
*/
public SAMDataSource(
Collection samFiles,
+ ThreadAllocation threadAllocation,
+ Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness,
@@ -167,9 +182,10 @@ public class SAMDataSource {
ValidationExclusion exclusionList,
Collection supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
- boolean generateExtendedEvents,
- boolean enableLowMemorySharding) {
+ boolean generateExtendedEvents) {
this( samFiles,
+ threadAllocation,
+ numFileHandles,
genomeLocParser,
useOriginalBaseQualities,
strictness,
@@ -182,8 +198,7 @@ public class SAMDataSource {
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
- (byte) -1,
- enableLowMemorySharding);
+ (byte) -1);
}
/**
@@ -205,6 +220,8 @@ public class SAMDataSource {
*/
public SAMDataSource(
Collection samFiles,
+ ThreadAllocation threadAllocation,
+ Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness,
@@ -217,13 +234,19 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
- byte defaultBaseQualities,
- boolean enableLowMemorySharding) {
- this.enableLowMemorySharding(enableLowMemorySharding);
+ byte defaultBaseQualities) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
readerIDs = samFiles;
+
+ this.threadAllocation = threadAllocation;
+ // TODO: Consider a borrowed-thread dispatcher implementation.
+ if(this.threadAllocation.getNumIOThreads() > 0)
+ dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
+ else
+ dispatcher = null;
+
validationStringency = strictness;
for (SAMReaderID readerID : samFiles) {
if (!readerID.samFile.canRead())
@@ -235,10 +258,13 @@ public class SAMDataSource {
SAMReaders readers = resourcePool.getAvailableReaders();
// Determine the sort order.
- for(SAMFileReader reader: readers.values()) {
+ for(SAMReaderID readerID: readerIDs) {
// Get the sort order, forcing it to coordinate if unsorted.
+ SAMFileReader reader = readers.getReader(readerID);
SAMFileHeader header = reader.getFileHeader();
+ headers.put(readerID,header);
+
if ( header.getReadGroups().isEmpty() ) {
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
@@ -275,7 +301,7 @@ public class SAMDataSource {
qmode,
refReader,
defaultBaseQualities);
-
+
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
@@ -296,12 +322,10 @@ public class SAMDataSource {
originalToMergedReadGroupMappings.put(id,mappingToMerged);
}
- if(enableLowMemorySharding) {
- for(SAMReaderID id: readerIDs) {
- File indexFile = findIndexFile(id.samFile);
- if(indexFile != null)
- bamIndices.put(id,new GATKBAMIndex(indexFile));
- }
+ for(SAMReaderID id: readerIDs) {
+ File indexFile = findIndexFile(id.samFile);
+ if(indexFile != null)
+ bamIndices.put(id,new GATKBAMIndex(indexFile));
}
resourcePool.releaseReaders(readers);
@@ -314,22 +338,6 @@ public class SAMDataSource {
*/
public ReadProperties getReadsInfo() { return readProperties; }
- /**
- * Enable experimental low-memory sharding.
- * @param enable True to enable sharding. False otherwise.
- */
- public void enableLowMemorySharding(final boolean enable) {
- enableLowMemorySharding = enable;
- }
-
- /**
- * Returns whether low-memory sharding is enabled.
- * @return True if enabled, false otherwise.
- */
- public boolean isLowMemoryShardingEnabled() {
- return enableLowMemorySharding;
- }
-
/**
* Checks to see whether any reads files are supplying data.
* @return True if no reads files are supplying data to the traversal; false otherwise.
@@ -368,7 +376,7 @@ public class SAMDataSource {
* Retrieves the current position within the BAM file.
* @return A mapping of reader to current position.
*/
- public Map getCurrentPosition() {
+ public Map getCurrentPosition() {
return readerPositions;
}
@@ -381,7 +389,7 @@ public class SAMDataSource {
}
public SAMFileHeader getHeader(SAMReaderID id) {
- return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader();
+ return headers.get(id);
}
/**
@@ -404,45 +412,21 @@ public class SAMDataSource {
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
}
- /**
- * No read group collisions at this time because only one SAM file is currently supported.
- * @return False always.
- */
- public boolean hasReadGroupCollisions() {
- return hasReadGroupCollisions;
- }
-
/**
* True if all readers have an index.
* @return True if all readers have an index.
*/
public boolean hasIndex() {
- if(enableLowMemorySharding)
- return readerIDs.size() == bamIndices.size();
- else {
- for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
- if(!reader.hasIndex())
- return false;
- }
- return true;
- }
+ return readerIDs.size() == bamIndices.size();
}
/**
* Gets the index for a particular reader. Always preloaded.
- * TODO: Should return object of type GATKBAMIndex, but cannot because there
- * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
- * TODO: sharding system goes live.
* @param id Id of the reader.
* @return The index. Will preload the index if necessary.
*/
- public Object getIndex(final SAMReaderID id) {
- if(enableLowMemorySharding)
- return bamIndices.get(id);
- else {
- SAMReaders readers = resourcePool.getReadersWithoutLocking();
- return readers.getReader(id).getBrowseableIndex();
- }
+ public GATKBAMIndex getIndex(final SAMReaderID id) {
+ return bamIndices.get(id);
}
/**
@@ -454,7 +438,7 @@ public class SAMDataSource {
}
/**
- * Gets the cumulative read metrics for shards already processed.
+ * Gets the cumulative read metrics for shards already processed.
* @return Cumulative read metrics.
*/
public ReadMetrics getCumulativeReadMetrics() {
@@ -507,10 +491,6 @@ public class SAMDataSource {
}
public StingSAMIterator seek(Shard shard) {
- // todo: refresh monolithic sharding implementation
- if(shard instanceof MonolithicShard)
- return seekMonolithic(shard);
-
if(shard.buffersReads()) {
return shard.iterator();
}
@@ -540,7 +520,7 @@ public class SAMDataSource {
*/
private void initializeReaderPositions(SAMReaders readers) {
for(SAMReaderID id: getReaderIDs())
- readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads());
+ readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
}
/**
@@ -548,7 +528,6 @@ public class SAMDataSource {
* @param readers Readers from which to load data.
* @param shard The shard specifying the data limits.
* @param enableVerification True to verify. For compatibility with old sharding strategy.
- * TODO: Collapse this flag when the two sharding systems are merged.
* @return An iterator over the selected data.
*/
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
@@ -559,14 +538,20 @@ public class SAMDataSource {
for(SAMReaderID id: getReaderIDs()) {
CloseableIterator iterator = null;
- if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
- continue;
- iterator = shard.getFileSpans().get(id) != null ?
- readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
- readers.getReader(id).queryUnmapped();
+
+ // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
+ // TODO: Kill this check once we've proven that the design elements are gone.
+ if(shard.getFileSpans().get(id) == null)
+ throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
+
+ if(threadAllocation.getNumIOThreads() > 0) {
+ BlockInputStream inputStream = readers.getInputStream(id);
+ inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
+ }
+ iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
- if(shard.getGenomeLocs() != null)
+ if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
mergingIterator.addIterator(readers.getReader(id),iterator);
}
@@ -584,33 +569,6 @@ public class SAMDataSource {
readProperties.defaultBaseQualities());
}
- /**
- * A stopgap measure to handle monolithic sharding
- * @param shard the (monolithic) shard.
- * @return An iterator over the monolithic shard.
- */
- private StingSAMIterator seekMonolithic(Shard shard) {
- SAMReaders readers = resourcePool.getAvailableReaders();
-
- // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
- SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
- MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
- for(SAMReaderID id: getReaderIDs())
- mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
-
- return applyDecoratingIterators(shard.getReadMetrics(),
- shard instanceof ReadShard,
- readProperties.useOriginalBaseQualities(),
- new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
- readProperties.getDownsamplingMethod().toFraction,
- readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
- readProperties.getSupplementalFilters(),
- readProperties.getBAQCalculationMode(),
- readProperties.getBAQQualityMode(),
- readProperties.getRefReader(),
- readProperties.defaultBaseQualities());
- }
-
/**
* Adds this read to the given shard.
* @param shard The shard to which to add the read.
@@ -618,7 +576,7 @@ public class SAMDataSource {
* @param read The read to add to the shard.
*/
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
- SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
+ GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
shard.addRead(read);
readerPositions.put(id,endChunk);
}
@@ -689,19 +647,6 @@ public class SAMDataSource {
this.maxEntries = maxEntries;
}
- /**
- * Dangerous internal method; retrieves any set of readers, whether in iteration or not.
- * Used to handle non-exclusive, stateless operations, such as index queries.
- * @return Any collection of SAMReaders, whether in iteration or not.
- */
- protected SAMReaders getReadersWithoutLocking() {
- synchronized(this) {
- if(allResources.size() == 0)
- createNewResource();
- }
- return allResources.get(0);
- }
-
/**
* Choose a set of readers from the pool to use for this query. When complete,
* @return
@@ -753,6 +698,11 @@ public class SAMDataSource {
*/
private final Map readers = new LinkedHashMap();
+ /**
+ * The input streams backing the readers, keyed by reader ID.
+ */
+ private final Map inputStreams = new LinkedHashMap();
+
/**
* Derive a new set of readers from the Reads metadata.
* @param readerIDs reads to load.
@@ -760,12 +710,20 @@ public class SAMDataSource {
*/
public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) {
for(SAMReaderID readerID: readerIDs) {
- SAMFileReader reader = new SAMFileReader(readerID.samFile);
+ File indexFile = findIndexFile(readerID.samFile);
+
+ SAMFileReader reader = null;
+
+ if(threadAllocation.getNumIOThreads() > 0) {
+ BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
+ reader = new SAMFileReader(blockInputStream,indexFile,false);
+ inputStreams.put(readerID,blockInputStream);
+ }
+ else
+ reader = new SAMFileReader(readerID.samFile,indexFile,false);
reader.setSAMRecordFactory(factory);
+
reader.enableFileSource(true);
- reader.enableIndexMemoryMapping(false);
- if(!enableLowMemorySharding)
- reader.enableIndexCaching(true);
reader.setValidationStringency(validationStringency);
final SAMFileHeader header = reader.getFileHeader();
@@ -786,6 +744,15 @@ public class SAMDataSource {
return readers.get(id);
}
+ /**
+ * Retrieve the input stream backing a reader.
+ * @param id The ID of the reader whose input stream should be retrieved.
+ * @return The input stream backing the reader with the given id, or null if none was registered for it.
+ */
+ public BlockInputStream getInputStream(final SAMReaderID id) {
+ return inputStreams.get(id);
+ }
+
/**
* Searches for the reader id of this reader.
* @param reader Reader for which to search.
@@ -883,7 +850,7 @@ public class SAMDataSource {
* Filters out reads that do not overlap the current GenomeLoc.
* Note the custom implementation: BAM index querying returns all reads that could
* possibly overlap the given region (and quite a few extras). In order not to drag
- * down performance, this implementation is highly customized to its task.
+ * down performance, this implementation is highly customized to its task.
*/
private class IntervalOverlapFilteringIterator implements CloseableIterator {
/**
@@ -903,7 +870,7 @@ public class SAMDataSource {
/**
* Custom representation of interval bounds.
- * Makes it simpler to track current position.
+ * Makes it simpler to track current position.
*/
private int[] intervalContigIndices;
private int[] intervalStarts;
@@ -941,7 +908,7 @@ public class SAMDataSource {
i++;
}
}
-
+
advance();
}
@@ -1070,6 +1037,40 @@ public class SAMDataSource {
return indexFile;
}
+
+ /**
+ * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped, as granular as our current knowledge of the best ways to split up BAM files allows.
+ * @param shardBalancer Balancer to initialize with this data source, the file pointers from IntervalSharder.shardOverAllReads and the genome loc parser.
+ * @return The supplied shardBalancer, now initialized; iterate over it to span all reads in all BAM files.
+ */
+ public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
+ shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
+ return shardBalancer;
+ }
+
+ /** Creates a BAM schedule over all mapped reads in the BAM file. NOTE(review): the original definition of a 'mapped' read was cut off mid-sentence ("any read that has been assigned"); confirm the intended wording.
+ * @param sequenceDictionary Sequence dictionary passed through to IntervalSharder.shardOverMappedReads.
+ * @param shardBalancer Balancer to initialize with this data source, the file pointers produced for the mapped reads and the genome loc parser.
+ * @return The supplied shardBalancer, now initialized.
+ */
+ public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
+ shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
+ return shardBalancer;
+ }
+
+ /**
+ * Create a schedule for processing the initialized BAM file using the given interval list; the returned schedule should be as granular as possible.
+ * @param intervals The list of intervals for which to create the schedule; a ReviewedStingException is thrown if this is null.
+ * @param shardBalancer Balancer to initialize with this data source, the file pointers from IntervalSharder.shardOverIntervals and the genome loc parser.
+ * @return The supplied shardBalancer, now initialized.
+ */
+ public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
+ if(intervals == null)
+ throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
+ shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
+ return shardBalancer;
+ }
}
+
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java
new file mode 100644
index 000000000..f9f6539a7
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.picard.util.PeekableIterator;
+import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.List;
+
+/**
+ * Tracks which block of a BAM file should be read next on behalf of a single reader.
+ * Holds the reader's ID, the block input stream supplied for it, and the chunks taken
+ * from the file span given at construction.  The position starts at the block start of
+ * the first chunk (or -1 when the span has no chunks) and is moved forward through the
+ * chunk list by advancePosition(long); reset() returns it to the starting state.
+ */
+class SAMReaderPosition {
+ private final SAMReaderID reader;
+ private final BlockInputStream inputStream;
+
+ private final List<GATKChunk> positions;
+ private PeekableIterator<GATKChunk> positionIterator;
+
+ /**
+ * Stores the next block address to read, or -1 if no such block is available.
+ */
+ private long nextBlockAddress;
+
+
+ SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
+ this.reader = reader;
+ this.inputStream = inputStream;
+
+ this.positions = fileSpan.getGATKChunks();
+ initialize();
+ }
+
+ public SAMReaderID getReader() { // ID of the reader this position belongs to
+ return reader;
+ }
+
+ public BlockInputStream getInputStream() { // stream handed in at construction
+ return inputStream;
+ }
+
+ /**
+ * Retrieves the next block address to be read.
+ * @return Next block address to be read, or -1 if the file span contained no chunks.
+ */
+ public long getBlockAddress() {
+ return nextBlockAddress;
+ }
+
+ public void reset() { // rewinds to the first chunk of the span by re-running initialize()
+ initialize();
+ }
+
+ /**
+ * Resets the SAM reader position to its original state: a fresh iterator over the chunks, positioned at the first chunk's block start.
+ */
+ private void initialize() {
+ this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
+ if(positionIterator.hasNext())
+ nextBlockAddress = positionIterator.peek().getBlockStart();
+ else
+ nextBlockAddress = -1;
+ }
+
+ /**
+ * Advances the current position to the next block to read, given the current position in the file. The next block address becomes filePosition, unless chunks ending at or before it are skipped and the following chunk starts beyond it, in which case it becomes that chunk's block start.
+ * @param filePosition The current position within the file.
+ */
+ void advancePosition(final long filePosition) {
+ nextBlockAddress = filePosition;
+
+ // Check the current file position against the iterator; if the iterator is before the current file position,
+ // draw the iterator forward. Remember when performing the check that coordinates are half-open!
+ try {
+ while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
+ positionIterator.next();
+ // If another chunk remains and it starts beyond the file position, jump straight to its first block.
+ if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
+ nextBlockAddress = positionIterator.peek().getBlockStart();
+ break;
+ }
+ }
+ }
+ catch(Exception ex) { // rethrown with context; the original failure is kept as the cause
+ throw new ReviewedStingException("Unable to advance reader position to file position " + filePosition, ex);
+ }
+ }
+
+ private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { // true when beyond the chunk's end block, or exactly at it with an end offset of 0
+ return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
new file mode 100644
index 000000000..962208086
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
@@ -0,0 +1,21 @@
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.picard.util.PeekableIterator;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+
+import java.util.Iterator;
+
+/**
+ * Balances maximally granular file pointers into shards of reasonable size. This base class only stores what initialize() is given; it declares no iterator() itself, so concrete subclasses must supply one.
+ */
+public abstract class ShardBalancer implements Iterable {
+ protected SAMDataSource readsDataSource; // data source passed to initialize(); unset until then
+ protected PeekableIterator filePointers; // peekable wrapper around the iterator passed to initialize()
+ protected GenomeLocParser parser; // parser passed to initialize()
+
+ public void initialize(final SAMDataSource readsDataSource, final Iterator filePointers, final GenomeLocParser parser) { // records the inputs for subclasses; no balancing is done here
+ this.readsDataSource = readsDataSource;
+ this.filePointers = new PeekableIterator(filePointers);
+ this.parser = parser;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java
deleted file mode 100644
index 989cf9fce..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import java.util.Iterator;
-/**
- *
- * User: aaron
- * Date: Apr 10, 2009
- * Time: 4:55:37 PM
- *
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2009 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever. Neither
- * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
- *
- */
-
-/**
- * @author aaron
- * @version 1.0
- * @date Apr 10, 2009
- *
- * Interface ShardStrategy
- *
- * The base interface for the sharding strategy; before we had a base abstract
- * class, but not this will be an interface to accomidate read based sharding
- */
-public interface ShardStrategy extends Iterator, Iterable {
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java
deleted file mode 100644
index 780b41ef7..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.picard.reference.IndexedFastaSequenceFile;
-import net.sf.samtools.SAMSequenceDictionary;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-/**
- *
- * User: aaron
- * Date: Apr 6, 2009
- * Time: 7:09:22 PM
- *
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2009 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever. Neither
- * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
- *
- */
-
-
-/**
- * @author aaron
- * @version 1.0
- * @date Apr 6, 2009
- *
- * Class ShardStrategyFactory
- *
- * The Shard Strategy Factory, use this class to create and transfer shard strategies
- * between different approaches.
- */
-public class ShardStrategyFactory {
- public enum SHATTER_STRATEGY {
- MONOLITHIC, // Put all of the available data into one shard.
- LOCUS_EXPERIMENTAL,
- READS_EXPERIMENTAL
- }
-
- /**
- * get a new shatter strategy
- *
- * @param readsDataSource File pointer to BAM.
- * @param referenceDataSource File pointer to reference.
- * @param strat what's our strategy - SHATTER_STRATEGY type
- * @param dic the seq dictionary
- * @param startingSize the starting size
- * @return a shard strategy capable of dividing input data into shards.
- */
- static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
- return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L);
- }
-
- /**
- * get a new shatter strategy
- *
- * @param readsDataSource File pointer to BAM.
- * @param referenceDataSource File pointer to reference.
- * @param strat what's our strategy - SHATTER_STRATEGY type
- * @param dic the seq dictionary
- * @param startingSize the starting size
- * @return a shard strategy capable of dividing input data into shards.
- */
- static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
- switch (strat) {
- case LOCUS_EXPERIMENTAL:
- return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null);
- case READS_EXPERIMENTAL:
- return new ReadShardStrategy(genomeLocParser,readsDataSource,null);
- default:
- throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
- }
-
- }
-
-
- /**
- * get a new shatter strategy
- *
- * @param readsDataSource File pointer to BAM.
- * @param referenceDataSource File pointer to reference.
- * @param strat what's our strategy - SHATTER_STRATEGY type
- * @param dic the seq dictionary
- * @param startingSize the starting size
- * @return a shard strategy capable of dividing input data into shards.
- */
- static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
- return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l);
-
- }
-
- /**
- * get a new shatter strategy
- *
- * @param readsDataSource The reads used to shatter this file.
- * @param referenceDataSource The reference used to shatter this file.
- * @param strat what's our strategy - SHATTER_STRATEGY type
- * @param dic the seq dictionary
- * @param startingSize the starting size
- * @return A strategy for shattering this data.
- */
- static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
- switch (strat) {
- case LOCUS_EXPERIMENTAL:
- return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst);
- case READS_EXPERIMENTAL:
- return new ReadShardStrategy(genomeLocParser, readsDataSource,lst);
- default:
- throw new ReviewedStingException("Strategy: " + strat + " isn't implemented");
- }
-
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
index 673df6dfa..577db0965 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
@@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
-import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
+import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
// initialize reads
List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
- SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
+ SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
// intervals
GenomeLocSortedSet intervalSortedSet = null;
@@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
- LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
+ IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
@@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
- sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
+ sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
index c8c79bb14..2c33a19b8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
@@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex;
import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
+import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
+import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
+import org.broadinstitute.sting.gatk.datasources.reads.Shard;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
@@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
/**
* Loads reference data from fasta file
* Looks for fai and dict files, and tries to create them if they don't exist
*/
public class ReferenceDataSource {
- private IndexedFastaSequenceFile index;
+ private IndexedFastaSequenceFile reference;
/** our log, which we want to capture anything from this class */
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
@@ -173,7 +185,7 @@ public class ReferenceDataSource {
logger.info("Treating existing index file as complete.");
}
- index = new CachingIndexedFastaSequenceFile(fastaFile);
+ reference = new CachingIndexedFastaSequenceFile(fastaFile);
} catch (IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
@@ -192,6 +204,52 @@ public class ReferenceDataSource {
* @return IndexedFastaSequenceFile that was created from file
*/
public IndexedFastaSequenceFile getReference() {
- return this.index;
+ return this.reference;
+ }
+
+ /**
+ * Creates the locus shards for processing the entire reference: every sequence in the reference's dictionary is split into consecutive windows of at most maxShardSize positions, one shard per window.
+ * @param readsDataSource the reads datasource to embed in the locus shard.
+ * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
+ * @param maxShardSize The maximum shard size which can be used to create this list. Must be positive: the inner loop advances shardStart by this amount, so 0 would never terminate.
+ * @return A list of locus shards, in the order the dictionary returns its sequences, forming a schedule for a traversal over the entire reference.
+ */
+ public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
+ List shards = new ArrayList();
+ for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
+ for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { // windows start at position 1
+ final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); // inclusive stop, clipped to the sequence length
+ shards.add(new LocusShard(parser,
+ readsDataSource,
+ Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
+ null));
+ }
+ }
+ return shards;
+ }
+
+ /**
+ * Creates the locus shards for processing the given intervals: an interval larger than maxShardSize is cut into consecutive pieces of maxShardSize positions, and whatever remains becomes a final shard.
+ * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
+ * @param intervals the list of intervals to use when processing the reference; its genome loc parser is used to build the split intervals.
+ * @param maxShardSize The maximum shard size which can be used to create this list.
+ * @return A list of locus shards, one or more per interval, in the iteration order of intervals.
+ */
+ public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
+ List shards = new ArrayList();
+ for(GenomeLoc interval: intervals) {
+ while(interval.size() > maxShardSize) { // peel full-size pieces off the front until the rest fits
+ shards.add(new LocusShard(intervals.getGenomeLocParser(),
+ readsDataSource,
+ Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
+ null));
+ interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); // remainder after the piece just added
+ }
+ shards.add(new LocusShard(intervals.getGenomeLocParser(),
+ readsDataSource,
+ Collections.singletonList(interval),
+ null));
+ }
+ return shards;
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index 162baed00..b0043e68c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
@@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
}
- public Object execute( Walker walker, ShardStrategy shardStrategy ) {
+ public Object execute( Walker walker, Iterable shardStrategy ) {
// Fast fail for walkers not supporting TreeReducible interface.
if (!( walker instanceof TreeReducible ))
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index deafcd0cc..ff5e1064b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
@@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param walker Computation to perform over dataset.
* @param shardStrategy A strategy for sharding the data.
*/
- public Object execute(Walker walker, ShardStrategy shardStrategy) {
+ public Object execute(Walker walker, Iterable shardStrategy) {
walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index e731b9864..d013db7e8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @param reads the informations associated with the reads
* @param reference the reference file
* @param rods the rods to include in the traversal
- * @param nThreadsToUse Number of threads to utilize.
+ * @param threadAllocation Allocation of threads between CPU and IO; its CPU thread count determines whether a parallel or a linear scheduler is created.
*
* @return The best-fit microscheduler.
*/
- public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) {
- if (walker instanceof TreeReducible && nThreadsToUse > 1) {
+ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) {
+ if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
if(walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
if(walker instanceof ReadWalker)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
- logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse));
- return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse);
+ logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
+ return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
} else {
- if(nThreadsToUse > 1)
+ if(threadAllocation.getNumCPUThreads() > 1)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
}
@@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
*
* @return the return type of the walker
*/
- public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
+ public abstract Object execute(Walker walker, Iterable shardStrategy);
/**
* Retrieves the object responsible for tracking and managing output.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java
new file mode 100644
index 000000000..0c81af07b
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.resourcemanagement;
+
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+/**
+ * Models how threads are distributed between various components of the GATK. Instances are immutable: both counts are final and fixed by the constructor.
+ */
+public class ThreadAllocation {
+ /**
+ * The number of CPU threads to be used by the GATK.
+ */
+ private final int numCPUThreads;
+
+ /**
+ * Number of threads to devote exclusively to IO. Default is 0.
+ */
+ private final int numIOThreads;
+
+ public int getNumCPUThreads() { // number of threads allocated to CPU work
+ return numCPUThreads;
+ }
+
+ public int getNumIOThreads() { // number of threads devoted exclusively to IO; 0 means none
+ return numIOThreads;
+ }
+
+ /**
+ * Construct the default thread allocation: one thread in total with no explicit split, which yields 1 CPU thread and 0 IO threads.
+ */
+ public ThreadAllocation() {
+ this(1,null,null);
+ }
+
+ /**
+ * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
+ * @param totalThreads Complete number of threads to allocate.
+ * @param numCPUThreads Total number of threads allocated to the traversal, or null to derive it from the other arguments.
+ * @param numIOThreads Total number of threads allocated exclusively to IO, or null to derive it from the other arguments.
+ * @throws UserException if a requested count exceeds totalThreads, or if both counts are given and do not sum to totalThreads. NOTE(review): zero or negative counts are not rejected here.
+ */
+ public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
+ // If no allocation information is present, allocate all threads to CPU
+ if(numCPUThreads == null && numIOThreads == null) {
+ this.numCPUThreads = totalThreads;
+ this.numIOThreads = 0;
+ }
+ // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
+ else if(numIOThreads == null) {
+ if(numCPUThreads > totalThreads)
+ throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
+ this.numCPUThreads = numCPUThreads;
+ this.numIOThreads = totalThreads - numCPUThreads;
+ }
+ // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). When numIOThreads equals totalThreads the floor of 1 makes CPU + IO exceed totalThreads by one.
+ else if(numCPUThreads == null) {
+ if(numIOThreads > totalThreads)
+ throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
+ this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
+ this.numIOThreads = numIOThreads;
+ }
+ else { // both counts given explicitly: they must add up to exactly totalThreads
+ if(numCPUThreads + numIOThreads != totalThreads)
+ throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
+ this.numCPUThreads = numCPUThreads;
+ this.numIOThreads = numIOThreads;
+ }
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
index b39fdd79d..a14d999ea 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
@@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial
return infoDB.getSample(paternalID);
}
+ /**
+  * Collects the parents of this sample, as returned by getMother() and getFather().
+  *
+  * @return a newly allocated list holding the mother (when non-null) followed by the
+  *         father (when non-null); empty when both lookups return null
+  */
+ public ArrayList<Sample> getParents(){
+     // At most two entries can ever be added, hence the initial capacity of 2.
+     final ArrayList<Sample> parents = new ArrayList<Sample>(2);
+     final Sample mother = getMother();
+     if ( mother != null )
+         parents.add(mother);
+     final Sample father = getFather();
+     if ( father != null )
+         parents.add(father);
+     return parents;
+ }
+
/**
* Get gender of the sample
* @return property of key "gender" - must be of type Gender
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
index 8098de5b1..ab38b69cd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
@@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
- public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); }
+ public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index 85977bf8e..1956dac6c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
// We refuse to parse SnpEff output files generated by unsupported versions, or
// lacking a SnpEff version number in the VCF header:
- public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
+ public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" };
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
@@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
public enum InfoFieldKey {
EFFECT_KEY ("SNPEFF_EFFECT", -1),
IMPACT_KEY ("SNPEFF_IMPACT", 0),
- CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
- AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
- GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
- GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
- TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
- EXON_ID_KEY ("SNPEFF_EXON_ID", 7),
- FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1);
+ FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1),
+ CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2),
+ AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3),
+ GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4),
+ GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5),
+ TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7),
+ EXON_ID_KEY ("SNPEFF_EXON_ID", 8);
// Actual text of the key
private final String keyName;
@@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
// are validated against this list.
public enum EffectType {
// High-impact effects:
- FRAME_SHIFT (EffectFunctionalClass.NONE, false),
- STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
- START_LOST (EffectFunctionalClass.NONE, false),
- SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
- SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
- EXON_DELETED (EffectFunctionalClass.NONE, false),
- STOP_LOST (EffectFunctionalClass.NONE, false),
+ SPLICE_SITE_ACCEPTOR,
+ SPLICE_SITE_DONOR,
+ START_LOST,
+ EXON_DELETED,
+ FRAME_SHIFT,
+ STOP_GAINED,
+ STOP_LOST,
// Moderate-impact effects:
- NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
- CODON_CHANGE (EffectFunctionalClass.NONE, false),
- CODON_INSERTION (EffectFunctionalClass.NONE, false),
- CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
- CODON_DELETION (EffectFunctionalClass.NONE, false),
- CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
- UTR_5_DELETED (EffectFunctionalClass.NONE, false),
- UTR_3_DELETED (EffectFunctionalClass.NONE, false),
+ NON_SYNONYMOUS_CODING,
+ CODON_CHANGE,
+ CODON_INSERTION,
+ CODON_CHANGE_PLUS_CODON_INSERTION,
+ CODON_DELETION,
+ CODON_CHANGE_PLUS_CODON_DELETION,
+ UTR_5_DELETED,
+ UTR_3_DELETED,
// Low-impact effects:
- SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
- SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
- NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
- SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
- NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
- START_GAINED (EffectFunctionalClass.NONE, false),
+ SYNONYMOUS_START,
+ NON_SYNONYMOUS_START,
+ START_GAINED,
+ SYNONYMOUS_CODING,
+ SYNONYMOUS_STOP,
+ NON_SYNONYMOUS_STOP,
// Modifiers:
- NONE (EffectFunctionalClass.NONE, true),
- CHROMOSOME (EffectFunctionalClass.NONE, true),
- INTERGENIC (EffectFunctionalClass.NONE, true),
- UPSTREAM (EffectFunctionalClass.NONE, true),
- UTR_5_PRIME (EffectFunctionalClass.NONE, true),
- CDS (EffectFunctionalClass.NONE, true),
- GENE (EffectFunctionalClass.NONE, true),
- TRANSCRIPT (EffectFunctionalClass.NONE, true),
- EXON (EffectFunctionalClass.NONE, true),
- INTRON (EffectFunctionalClass.NONE, true),
- UTR_3_PRIME (EffectFunctionalClass.NONE, true),
- DOWNSTREAM (EffectFunctionalClass.NONE, true),
- INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
- INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
- REGULATION (EffectFunctionalClass.NONE, true),
- CUSTOM (EffectFunctionalClass.NONE, true),
- WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);
-
- private final EffectFunctionalClass functionalClass;
- private final boolean isModifier;
-
- EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
- this.functionalClass = functionalClass;
- this.isModifier = isModifier;
- }
-
- public EffectFunctionalClass getFunctionalClass() {
- return functionalClass;
- }
-
- public boolean isModifier() {
- return isModifier;
- }
+ NONE,
+ CHROMOSOME,
+ CUSTOM,
+ CDS,
+ GENE,
+ TRANSCRIPT,
+ EXON,
+ INTRON_CONSERVED,
+ UTR_5_PRIME,
+ UTR_3_PRIME,
+ DOWNSTREAM,
+ INTRAGENIC,
+ INTERGENIC,
+ INTERGENIC_CONSERVED,
+ UPSTREAM,
+ REGULATION,
+ INTRON
}
- // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
- // classifying some of the LOW impact effects as MODIFIERs.
+ // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER.
public enum EffectImpact {
MODIFIER (0),
LOW (1),
@@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
UNKNOWN
}
- // We assign a functional class to each SnpEff effect.
+ // SnpEff assigns a functional class to each effect.
public enum EffectFunctionalClass {
NONE (0),
SILENT (1),
@@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
public List getKeyNames() {
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
InfoFieldKey.IMPACT_KEY.getKeyName(),
+ InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(),
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
- InfoFieldKey.EXON_ID_KEY.getKeyName(),
- InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
+ InfoFieldKey.EXON_ID_KEY.getKeyName()
);
}
@@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return Arrays.asList(
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+ new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())),
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
+ new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
@@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
protected static class SnpEffEffect {
private EffectType effect;
private EffectImpact impact;
+ private EffectFunctionalClass functionalClass;
private String codonChange;
private String aminoAcidChange;
private String geneName;
@@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
private String parseError = null;
private boolean isWellFormed = true;
- private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
- private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
- private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
+ private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11;
- // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
- // errors come after warnings, not vice versa:
- private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
- private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
+ // If there is either a warning OR an error, it will be in the last field. If there is both
+ // a warning AND an error, the warning will be in the second-to-last field, and the error will
+ // be in the last field.
+ private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1;
+ private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2;
+ private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1;
- private static final int SNPEFF_CODING_FIELD_INDEX = 5;
+ // Position of the field indicating whether the effect is coding or non-coding. This field is used
+ // in selecting the most significant effect, but is not included in the annotations we return
+ // since it can be deduced from the SNPEFF_GENE_BIOTYPE field.
+ private static final int SNPEFF_CODING_FIELD_INDEX = 6;
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
parseEffectName(effectName);
@@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
private void parseEffectMetadata ( String[] effectMetadata ) {
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
- if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
- parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
+ if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) {
+ parseError(String.format("SnpEff issued the following warning or error: \"%s\"",
+ effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR]));
}
- else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
- parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
+ else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) {
+ parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"",
+ effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR],
+ effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR]));
}
else {
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
@@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return;
}
- if ( effect != null && effect.isModifier() ) {
- impact = EffectImpact.MODIFIER;
+ // The impact field will never be empty, and should always contain one of the enumerated values:
+ try {
+ impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
}
- else {
+ catch ( IllegalArgumentException e ) {
+ parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
+ }
+
+ // The functional class field will be empty when the effect has no functional class associated with it:
+ if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) {
try {
- impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
+ functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]);
}
catch ( IllegalArgumentException e ) {
- parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
+ parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]));
}
}
+ else {
+ functionalClass = EffectFunctionalClass.NONE;
+ }
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
+ // The coding field will be empty when SnpEff has no coding info for the effect:
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
try {
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
@@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return true;
}
else if ( impact.isSameImpactAs(other.impact) ) {
- return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
+ return functionalClass.isHigherPriorityThan(other.functionalClass);
}
return false;
@@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
+ addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString());
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
- addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());
return annotations;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java
index b5987963f..106bb1982 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java
@@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
* Time: 6:46:09 PM
* To change this template use File | Settings | File Templates.
*/
-enum DiploidGenotype {
+public enum DiploidGenotype {
AA ('A', 'A'),
AC ('A', 'C'),
AG ('A', 'G'),
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
index 666fe88a3..295cf8688 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
-import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
byte obsBase = elt.getBase();
+ byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
if ( elt.isReducedRead() ) {
// reduced read representation
- byte qual = elt.getQual();
- if ( BaseUtils.isRegularBase( elt.getBase() )) {
+ if ( BaseUtils.isRegularBase( obsBase )) {
add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
return elt.getRepresentativeCount(); // we added nObs bases here
- } else // odd bases or deletions => don't use them
- return 0;
- } else {
- byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
- return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
+ }
+
+ // odd bases or deletions => don't use them
+ return 0;
}
+
+ return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
}
public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
@@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
* @return
*/
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
- if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) {
+ if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
return 0;
- } else {
- byte qual = p.getQual();
- if ( qual > SAMUtils.MAX_PHRED_SCORE )
- throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
- if ( capBaseQualsAtMappingQual )
- qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
- if ( (int)qual < minBaseQual )
- qual = (byte)0;
+ byte qual = p.getQual();
- return qual;
- }
+ if ( qual > SAMUtils.MAX_PHRED_SCORE )
+ throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
+ if ( capBaseQualsAtMappingQual )
+ qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
+ if ( (int)qual < minBaseQual )
+ qual = (byte)0;
+
+ return qual;
}
// -----------------------------------------------------------------------------------------------------------------
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
index 489e963e8..74c55dbfe 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
-import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Map;
@@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param priors priors to use for GLs
* @param GLs hash of sample->GL to fill in
* @param alternateAlleleToUse the alternate allele to use, null if not set
- *
- * @param useBAQedPileup
+ * @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @return genotype likelihoods per sample for AA, AB, BB
*/
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
@@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map GLs,
- Allele alternateAlleleToUse, boolean useBAQedPileup);
+ Allele alternateAlleleToUse,
+ boolean useBAQedPileup);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if ( BaseUtils.isRegularBase( p.getBase() ) )
- count++;
+ count += p.getRepresentativeCount();
}
return count;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index bdd4e2c65..369c2d0c6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -258,7 +258,7 @@ public class UnifiedGenotyper extends LocusWalker result = new HashSet();
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
- result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
+ result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"));
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
return result;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 2d71ea8a8..8585104d5 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -7,35 +7,80 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
-import java.io.File;
-import java.io.FileNotFoundException;
+import java.io.PrintStream;
import java.util.*;
/**
- * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio,
- * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype
- * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents
- * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the
- * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to
- * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally
- * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or
- * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
- * begin.
+ * Computes the most likely genotype combination and phases trios and parent/child pairs
+ *
+ *
+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases
+ * all sites where parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability.
+ * Ambiguous sites are:
+ *
+ * - Sites where all individuals are heterozygous
+ * - Sites where there is a Mendelian violation
+ *
+ * Missing genotypes are handled as follows:
+ *
+ * - In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
+ * - In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
+ * - In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous, and a phasing probability is emitted.
+ * - In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
+ *
+ *
+ * Input
+ *
+ *
+ * - A VCF variant set containing trio(s) and/or parent/child pair(s).
+ * - A PED pedigree file containing the description of the individuals' relationships.
+ *
+ *
+ *
+ * Options
+ *
+ *
+ * - MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype
+ * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability,
+ * and each individual genotype, depth, allelic depth and likelihoods.
+ * - DeNovoPrior: Mutation prior; default is 1e-8
+ *
+ *
+ *
+ * Output
+ *
+ * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where unambiguous.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PhaseByTransmission \
+ * -V input.vcf \
+ * -ped input.ped \
+ * -o output.vcf
+ *
+ *
*/
-public class PhaseByTransmission extends RodWalker {
+public class PhaseByTransmission extends RodWalker, HashMap> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
- @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.")
- public ArrayList familySpecs = null;
+ @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
+ private PrintStream mvFile = null;
+
+ @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8")
+ private double deNovoPrior=1e-8;
@Output
protected VCFWriter vcfWriter = null;
@@ -43,241 +88,633 @@ public class PhaseByTransmission extends RodWalker {
private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP";
private final String SOURCE_NAME = "PhaseByTransmission";
- private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
+ public final double NO_TRANSMISSION_PROB = -1.0;
- private class Trio {
- private String mother;
- private String father;
- private String child;
+ private ArrayList trios = new ArrayList();
- public Trio(String mother, String father, String child) {
- this.mother = mother;
- this.father = father;
- this.child = child;
- }
+ //Matrix of Mendelian violation counts for all genotype combinations
+ private EnumMap>> mvCountMatrix;
- public Trio(String familySpec) {
- String[] pieces = familySpec.split("[\\+\\=]");
+ //Matrix of allele transmission
+ private EnumMap>> transmissionMatrix;
- this.mother = pieces[0];
- this.father = pieces[1];
- this.child = pieces[2];
- }
+ //Metrics counters hash keys
+ private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
+ private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
+ private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
+ private final Byte NUM_TRIO_HET_HET_HET = 3;
+ private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10;
+ private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
+ private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
+ private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
+ private final Byte NUM_PAIR_HET_HET = 8;
+ private final Byte NUM_PAIR_VIOLATIONS = 9;
+ private final Byte NUM_GENOTYPES_MODIFIED = 11;
- public String getMother() { return mother; }
- public String getFather() { return father; }
- public String getChild() { return child; }
+ //Random number generator
+ private Random rand = new Random();
+
+ private enum FamilyMember {
+ MOTHER,
+ FATHER,
+ CHILD
}
- private ArrayList trios = new ArrayList();
+ //Stores a conceptual trio or parent/child pair genotype combination along with its phasing.
+ //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method.
+ private class TrioPhase {
- public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) {
- if (familySpecs != null) {
- // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
- // spec list set, and treat the entries as if they had been specified on the command line.
- ArrayList specs = new ArrayList();
- for (String familySpec : familySpecs) {
- File specFile = new File(familySpec);
+ //Create 2 fake alleles
+ //The actual bases will never be used but the Genotypes created using the alleles will be.
+ private final Allele REF = Allele.create("A",true);
+ private final Allele VAR = Allele.create("A",false);
+ private final Allele NO_CALL = Allele.create(".",false);
+ private final String DUMMY_NAME = "DummySample";
- try {
- XReadLines reader = new XReadLines(specFile);
+ private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
- List lines = reader.readLines();
- for (String line : lines) {
- specs.add(new Trio(line));
- }
- } catch (FileNotFoundException e) {
- specs.add(new Trio(familySpec)); // not a file, so must be a family spec
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(REF);
+ alleles.add(REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(REF);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else{
+ return null;
+ }
+ return alleles;
+ }
+
+ private boolean isPhasable(Genotype.Type genotype){
+ return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR;
+ }
+
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true));
+ }
+ else
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false));
+ }
+
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+
+ //Special case for Het/Het as it is ambiguous
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
+ return;
+ }
+
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+
+ //If there is a possible phasing between the parent and child => phase
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
+ }
+ //This is a Mendelian Violation => Do not phase
+ else{
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
+ }
+ }
+
+ //Phases a family by transmission
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
+
+ //Build all possible child genotypes for the given parent's genotypes
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
}
}
- return specs;
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Create mother's genotype
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
+ else
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
+
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
+ else
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
+
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false));
}
- return new ArrayList();
+ /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
+ If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
+ or single individual.
+ */
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Take care of cases where one or more family members are no call
+ if(!isPhasable(child)){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ else if(!isPhasable(mother)){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ if(!isPhasable(father)){
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ else
+ phasePairAlleles(father, child, FamilyMember.FATHER);
+ }
+ else if(!isPhasable(father)){
+ phasePairAlleles(mother, child, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ }
+ //Special case for Het/Het/Het as it is ambiguous
+ else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ //All family members have genotypes and at least one of them is not Het
+ else{
+ phaseFamilyAlleles(mother, father, child);
+ }
+ }
+
+ /**
+ * Applies the trio genotype combination to the given trio.
+ * @param ref: Reference allele
+ * @param alt: Alternate allele
+ * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
+ * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
+ * @param childGenotype: Genotype of the child to phase using this trio genotype combination
+ * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
+ * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
+ */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
+ }
+
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+
+ int phredScoreTransmission = -1;
+ if(transmissionProb != NO_TRANSMISSION_PROB)
+ phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
+
+ //Handle null, missing and unavailable genotypes
+ //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
+ //genotype so it is safe to return the original genotype in this case.
+ //In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
+ if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType()))
+ return genotype;
+
+ //Add the transmission probability
+ Map genotypeAttributes = new HashMap();
+ genotypeAttributes.putAll(genotype.getAttributes());
+ if(transmissionProb>NO_TRANSMISSION_PROB)
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
+
+ ArrayList phasedAlleles = new ArrayList(2);
+ for(Allele allele : phasedGenotype.getAlleles()){
+ if(allele.isReference())
+ phasedAlleles.add(refAllele);
+ else if(allele.isNonReference())
+ phasedAlleles.add(altAllele);
+ //At this point there should not be any other alleles left
+ else
+ throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
+
+ }
+
+ //Compute the new Log10Error if the genotype is different from the original genotype
+ double log10Error;
+ if(genotype.getType() == phasedGenotype.getType())
+ log10Error = genotype.getLog10PError();
+ else
+ log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
+
+ return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased());
+ }
+
+
}
/**
- * Parse the familial relationship specification, and initialize VCF writer
+ * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer
*/
public void initialize() {
- trios = getFamilySpecsFromCommandLineInput(familySpecs);
-
ArrayList rodNames = new ArrayList();
rodNames.add(variantCollection.variants.getName());
-
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ //Get the trios from the families passed as ped
+ setTrios();
+ if(trios.size()<1)
+ throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted.");
+
+
Set headerLines = new HashSet();
headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct"));
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct"));
headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
+
+ buildMatrices();
+
+ if(mvFile != null)
+ mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL");
+
}
- private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) {
- double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector());
- double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector());
- double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector());
+ /**
+ * Select trios and parent/child pairs only
+ */
+ private void setTrios(){
- int momIndex = mom.getType().ordinal() - 1;
- int dadIndex = dad.getType().ordinal() - 1;
- int childIndex = child.getType().ordinal() - 1;
-
- return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex];
- }
-
- private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) {
- List homRefAlleles = new ArrayList();
- homRefAlleles.add(refAllele);
- homRefAlleles.add(refAllele);
- Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getLog10PError(), null, g.getAttributes(), false);
-
- List hetAlleles = new ArrayList();
- hetAlleles.add(refAllele);
- hetAlleles.add(altAllele);
- Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getLog10PError(), null, g.getAttributes(), false);
-
- List homVarAlleles = new ArrayList();
- homVarAlleles.add(altAllele);
- homVarAlleles.add(altAllele);
- Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getLog10PError(), null, g.getAttributes(), false);
-
- ArrayList genotypes = new ArrayList();
- genotypes.add(homRef);
- genotypes.add(het);
- genotypes.add(homVar);
-
- return genotypes;
- }
-
- private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) {
- List alleles = g.getAlleles();
- int matchingAlleles = 0;
-
- for (Allele a : alleles) {
- if (!alleleToMatch.equals(a)) {
- matchingAlleles++;
+ Map> families = this.getSampleDB().getFamilies();
+ Set family;
+ ArrayList parents;
+ for(String familyID : families.keySet()){
+ family = families.get(familyID);
+ if(family.size()<2 || family.size()>3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size()));
}
- }
-
- return matchingAlleles;
- }
-
- private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) {
- int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0;
- int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0;
-
- int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0;
- int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0;
-
- int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child);
- int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child);
-
- return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles);
- }
-
- private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) {
- Set possiblePhasedChildGenotypes = new HashSet();
-
- for (Allele momAllele : mom.getAlleles()) {
- for (Allele dadAllele : dad.getAlleles()) {
- ArrayList possiblePhasedChildAlleles = new ArrayList();
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(dadAllele);
-
- Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getLog10PError(), child.getFilters(), child.getAttributes(), true);
-
- possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype);
- }
- }
-
- ArrayList finalGenotypes = new ArrayList();
-
- for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) {
- if (child.sameGenotype(phasedChildGenotype, true)) {
- Allele momTransmittedAllele = phasedChildGenotype.getAllele(0);
- Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1);
-
- ArrayList phasedMomAlleles = new ArrayList();
- phasedMomAlleles.add(momTransmittedAllele);
- phasedMomAlleles.add(momUntransmittedAllele);
-
- Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getLog10PError(), mom.getFilters(), mom.getAttributes(), true);
-
- Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1);
- Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1);
-
- ArrayList phasedDadAlleles = new ArrayList();
- phasedDadAlleles.add(dadTransmittedAllele);
- phasedDadAlleles.add(dadUntransmittedAllele);
-
- Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getLog10PError(), dad.getFilters(), dad.getAttributes(), true);
-
- finalGenotypes.add(phasedMomGenotype);
- finalGenotypes.add(phasedDadGenotype);
- finalGenotypes.add(phasedChildGenotype);
-
- return finalGenotypes;
- }
- }
-
- finalGenotypes.add(mom);
- finalGenotypes.add(dad);
- finalGenotypes.add(child);
-
- return finalGenotypes;
- }
-
- private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) {
- ArrayList finalGenotypes = new ArrayList();
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
-
- if (mother.isCalled() && father.isCalled() && child.isCalled()) {
- ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother);
- ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father);
- ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child);
-
- double bestConfigurationLikelihood = 0.0;
- double bestPrior = 0.0;
- Genotype bestMotherGenotype = mother;
- Genotype bestFatherGenotype = father;
- Genotype bestChildGenotype = child;
-
- double norm = 0.0;
-
- for (Genotype motherGenotype : possibleMotherGenotypes) {
- for (Genotype fatherGenotype : possibleFatherGenotypes) {
- for (Genotype childGenotype : possibleChildGenotypes) {
- double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
- double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype);
- norm += prior*configurationLikelihood;
-
- if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestPrior = prior;
- bestMotherGenotype = motherGenotype;
- bestFatherGenotype = fatherGenotype;
- bestChildGenotype = childGenotype;
- }
+ else{
+ for(Sample familyMember : family){
+ parents = familyMember.getParents();
+ if(parents.size()>0){
+ if(family.containsAll(parents))
+ this.trios.add(familyMember);
+ else
+ logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
+ break;
}
}
}
- if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) {
- Map attributes = new HashMap();
- attributes.putAll(bestChildGenotype.getAttributes());
- attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
- bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
+ }
- finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
+
+
+ }
+
+ //Fills mvCountMatrix and transmissionMatrix for every mother/father/child combination of Genotype.Type values
+ private void buildMatrices(){
+ mvCountMatrix = new EnumMap>>(Genotype.Type.class);
+ transmissionMatrix = new EnumMap>>(Genotype.Type.class);
+ for(Genotype.Type mother : Genotype.Type.values()){
+ mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ for(Genotype.Type father : Genotype.Type.values()){
+ mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ for(Genotype.Type child : Genotype.Type.values()){
+ mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
+ transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
+ }
+ }
+ }
+ }
+
+ //Returns the number of Mendelian Violations for a given genotype combination.
+ //If one of the parents genotype is missing, it will consider it as a parent/child pair
+ //If the child genotype or both parents genotypes are missing, 0 is returned.
+ private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return 0;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return 0;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
}
}
- return finalGenotypes;
+ //Case Child is HomRef
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return 0;
+ else return (parents.size()-parentsNumRefAlleles);
+ }
+
+ //Case child is HomVar
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return 0;
+ else return parents.size()-parentsNumAltAlleles;
+ }
+
+ //Case child is Het
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return 0;
+
+ //MV
+ return 1;
+ }
+
+ //Returns how many of the three family members (mother, father, child) have a different genotype type in the new combination than in the original one (0 to 3).
+ private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
+ int count = 0;
+ if(motherOriginal!=motherNew)
+ count++;
+ if(fatherOriginal!=fatherNew)
+ count++;
+ if(childOriginal!=childNew)
+ count++;
+ return count;
+ }
+
+ //Returns the genotype likelihoods as a map keyed by Genotype.Type.
+ //A null or uncalled genotype yields a flat map (HOM_REF, HET and HOM_VAR each get 1/3); otherwise getLikelihoods().getAsMap(true) is returned.
+ private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
+ if(genotype == null || !genotype.isCalled()){
+ EnumMap likelihoods = new EnumMap(Genotype.Type.class);
+ likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0);
+ likelihoods.put(Genotype.Type.HET,1.0/3.0);
+ likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0);
+ return likelihoods;
+ }
+ return genotype.getLikelihoods().getAsMap(true);
+ }
+
+ //Returns the Genotype.Type of the given genotype; returns UNAVAILABLE if given null
+ private Genotype.Type getTypeSafeNull(Genotype genotype){
+ if(genotype == null)
+ return Genotype.Type.UNAVAILABLE;
+ return genotype.getType();
+ }
+
+
+ /**
+ * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair.
+ * @param ref: Reference allele
+ * @param alt: Alternative allele
+ * @param mother: Mother's genotype
+ * @param father: Father's genotype
+ * @param child: Child's genotype
+ * @param finalGenotypes: An ArrayList to which the genotypes phased by transmission are added in the following order: Mother, Father, Child
+ * @return The Mendelian violation count recorded for the selected genotype combination (0 when no combination was evaluated)
+ */
+ private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
+ //Check whether it is a pair or trio
+ //Always assign the first parent as the parent having genotype information in pairs
+ //Always assign the mother as the first parent in trios
+ int parentsCalled = 0;
+ Map firstParentLikelihoods;
+ Map secondParentLikelihoods;
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
+ ArrayList bestChildGenotype = new ArrayList();
+ Genotype.Type pairSecondParentGenotype = null;
+ if(mother == null || !mother.isCalled()){
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ bestFirstParentGenotype.add(getTypeSafeNull(father));
+ bestSecondParentGenotype.add(getTypeSafeNull(mother));
+ pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
+ if(father != null && father.isCalled())
+ parentsCalled = 1;
+ }
+ else{
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
+ if(father == null || !father.isCalled()){
+ parentsCalled = 1;
+ pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
+ }else{
+ parentsCalled = 2;
+ }
+ }
+ Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
+ bestChildGenotype.add(getTypeSafeNull(child));
+
+ //Prior vars
+ double bestConfigurationLikelihood = 0.0;
+ double norm = 0.0;
+ int configuration_index =0;
+ ArrayList bestMVCount = new ArrayList();
+ bestMVCount.add(0);
+
+ //Get the most likely combination
+ //Only check for most likely combination if at least a parent and the child have genotypes
+ if(child != null && child.isCalled() && parentsCalled > 0){
+ int mvCount;
+ int cumulativeMVCount = 0;
+ double configurationLikelihood = 0;
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){
+ for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
+ //For parent/child pairs, sum over the possible genotype configurations of the missing parent
+ if(parentsCalled<2){
+ cumulativeMVCount += mvCount;
+ configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ }
+ //Evaluate configurations of trios
+ else{
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ //Keep this combination if it has a better likelihood than the best one so far.
+ //Combinations that are exactly as likely as the best one are remembered as well;
+ //one of them is drawn at random once all combinations have been evaluated.
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(mvCount);
+ bestFirstParentGenotype.clear();
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.clear();
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(mvCount);
+ }
+ }
+ }
+ //Evaluate configurations of parent/child pairs
+ if(parentsCalled<2){
+ norm += configurationLikelihood;
+ //Keep this combination if it has a better likelihood than the best one so far.
+ //Combinations that are exactly as likely as the best one are remembered as well;
+ //one of them is drawn at random once all combinations have been evaluated.
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(cumulativeMVCount/3);
+ bestChildGenotype.clear();
+ bestFirstParentGenotype.clear();
+ bestSecondParentGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(cumulativeMVCount/3);
+ }
+ //Reset both accumulators so the next parent/child combination starts from zero
+ configurationLikelihood = 0;
+ cumulativeMVCount = 0;
+ }
+ }
+ }
+
+ //normalize the best configuration probability
+ bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
+
+ //In case of multiple equally likely combinations, take a random one (the bound of nextInt is exclusive)
+ if(bestFirstParentGenotype.size()>1){
+ configuration_index = rand.nextInt(bestFirstParentGenotype.size());
+ }
+ }
+ else{
+ bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
+ }
+
+ TrioPhase phasedTrioGenotypes;
+ //transmissionMatrix is indexed mother/father/child: when the mother is missing the father was used as first parent, so swap back
+ if(mother == null || !mother.isCalled())
+ phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ else
+ phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
+ return bestMVCount.get(configuration_index);
+ }
+
+
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){
+ //Updates the parent/child pair counters in the given map for one site; each NUM_PAIR_* key that is touched is read with get() first, so it must already be present.
+ //Both genotypes called => CALLED is incremented; then PHASED if the parent genotype is phased, otherwise mvCount is added to VIOLATIONS and het/het pairs are tallied. Anything else => NOCALL.
+ if(parent.isCalled() && child.isCalled()){
+ counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
+ if(parent.isPhased())
+ counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
+ else{
+ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
+ if(parent.isHet() && child.isHet())
+ counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
+ }
+ }else{
+ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
+ }
+
+ }
+
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){
+ //Updates the trio counters in the given map for one site; each NUM_TRIO_* key that is touched is read with get() first, so it must already be present.
+ //All three genotypes called => CALLED is incremented; then PHASED if the mother genotype is phased, otherwise the site is tallied as a double violation (mvCount > 1), a single violation (mvCount == 1) or het/het/het. Anything else => NOCALL.
+ if(mother.isCalled() && father.isCalled() && child.isCalled()){
+ counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
+ if(mother.isPhased())
+ counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
+
+ else{
+ if(mvCount > 0){
+ if(mvCount >1)
+ counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
+ else
+ counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+ }
+ else if(mother.isHet() && father.isHet() && child.isHet())
+ counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
+
+ }
+ }else{
+ counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
+ }
}
/**
@@ -289,53 +726,153 @@ public class PhaseByTransmission extends RodWalker {
* @return null
*/
@Override
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ //Per-site counters: all start at 0 here and are added to the running totals in reduce()
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
+ String mvfLine;
+ //NOTE(review): vc is dereferenced below without a null check - confirm getFirstValue() cannot return null at this locus
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
- GenotypesContext genotypesContext = GenotypesContext.create(vc.getGenotypes().size());
+ GenotypesContext genotypeMap = vc.getGenotypes();
- for (Trio trio : trios) {
- Genotype mother = vc.getGenotype(trio.getMother());
- Genotype father = vc.getGenotype(trio.getFather());
- Genotype child = vc.getGenotype(trio.getChild());
+ int mvCount;
- ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child);
+ for (Sample sample : trios) {
+ Genotype mother = vc.getGenotype(sample.getMaternalID());
+ Genotype father = vc.getGenotype(sample.getPaternalID());
+ Genotype child = vc.getGenotype(sample.getID());
+
+ //Keep only trios and parent/child pairs: skip when the child is absent or when both parents are absent
+ if(mother == null && father == null || child == null)
+ continue;
+ //phaseTrioGenotypes is handed the empty list and is expected to fill it with mother, father, child (read back by index below); its return value is kept as mvCount
+ ArrayList trioGenotypes = new ArrayList(3);
+ mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
Genotype phasedMother = trioGenotypes.get(0);
Genotype phasedFather = trioGenotypes.get(1);
Genotype phasedChild = trioGenotypes.get(2);
- genotypesContext.add(phasedMother, phasedFather, phasedChild);
+ //Fill the genotype map with the new genotypes and increment metrics counters
+ genotypeMap.add(phasedChild);
+ if(mother != null){
+ genotypeMap.add(phasedMother);
+ if(father != null){
+ genotypeMap.add(phasedFather);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ }
+ else{
+ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters);
+ if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
+ }
+ else{
+ genotypeMap.add(phasedFather);
+ updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
+ if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
+
+ //Write the report line (built above for trios and for pairs) when mvCount is positive and mvFile is set
+ //TODO: ADAPT FOR PAIRS TOO!!
+ if(mvCount>0 && mvFile != null)
+ mvFile.println(mvfLine);
+
}
- VariantContext newvc = new VariantContextBuilder(vc).genotypes(genotypesContext).make();
-
- vcfWriter.add(newvc);
+ vcfWriter.add(new VariantContextBuilder(vc).genotypes(genotypeMap).make());
}
-
- return null;
+ return metricsCounters;
}
/**
- * Provide an initial value for reduce computations.
+ * Initializes the reporting counters: builds the map that serves as the initial accumulator for reduce.
*
- * @return Initial value of reduce.
+ * @return A new map holding all twelve counters, each initialized to 0
*/
@Override
- public Integer reduceInit() {
- return null;
+ public HashMap reduceInit() {
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
+ return metricsCounters;
}
/**
- * Reduces a single map with the accumulator provided as the ReduceType.
+ * Adds the counters of the site just phased to the reporting counters, key by key.
*
- * @param value result of the map.
- * @param sum accumulator for the reduce.
+ * @param value Counters for a single site, as returned by map
+ * @param sum accumulator for the reporting counters; it is updated in place and returned
* @return accumulator with result of the map taken into account.
*/
@Override
- public Integer reduce(Integer value, Integer sum) {
- return null;
+ public HashMap reduce(HashMap value, HashMap sum) {
+ sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
+ sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
+ sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
+ sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET));
+ sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS));
+ sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED));
+ sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL));
+ sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
+ sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
+ sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
+ sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS));
+ sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED));
+
+ return sum;
+ }
+
+
+ /**
+ * Reports statistics on the phasing by transmission process by logging each counter at info level.
+ * @param result Accumulator with all counters (trio, pair and modified-genotype counts).
+ */
+ @Override
+ public void onTraversalDone(HashMap result) {
+ logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
+ logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
+ logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
+ logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET));
+ logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS));
+ logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED));
+ logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL));
+ logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
+ logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET));
+ logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS));
+ logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED));
+
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
deleted file mode 100644
index e770418c1..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2010.
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.qc;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.gatk.walkers.ReadPairWalker;
-import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
-
-import java.io.PrintStream;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * Counts the number of read pairs encountered in a file sorted in
- * query name order. Breaks counts down by total pairs and number
- * of paired reads.
- *
- *
- * Input
- *
- * One or more bam files.
- *
- *
- * Output
- *
- * Number of pairs seen.
- *
- *
- * Examples
- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- * -R ref.fasta \
- * -T CountPairs \
- * -o output.txt \
- * -I input.bam
- *
- *
- * @author mhanna
- */
-public class CountPairsWalker extends ReadPairWalker {
- @Output
- private PrintStream out;
-
- /**
- * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec.
- */
- private long firstOfPair = 0;
-
- /**
- * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec.
- */
- private long secondOfPair = 0;
-
- /**
- * A breakdown of the total number of reads seen with exactly the same read name.
- */
- private List pairCountsByType = new ExpandingArrayList();
-
- /**
- * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser.
- * @param reads Collection of reads having the same name.
- * @return Semantics defined by implementer.
- */
- @Override
- public Integer map(Collection reads) {
- if(pairCountsByType.get(reads.size()) != null)
- pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1);
- else
- pairCountsByType.set(reads.size(),1L);
-
- for(SAMRecord read: reads) {
- if(read.getFirstOfPairFlag()) firstOfPair++;
- if(read.getSecondOfPairFlag()) secondOfPair++;
- }
-
- return 1;
- }
-
- /**
- * No pairs at the beginning of a traversal.
- * @return 0 always.
- */
- @Override
- public Long reduceInit() {
- return 0L;
- }
-
- /**
- * Combine number of pairs seen in this iteration (always 1) with total number of pairs
- * seen in previous iterations.
- * @param value Pairs in this iteration (1), from the map function.
- * @param sum Count of all pairs in prior iterations.
- * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1).
- */
- @Override
- public Long reduce(Integer value, Long sum) {
- return value + sum;
- }
-
- /**
- * Print summary statistics over the entire traversal.
- * @param sum A count of all read pairs viewed.
- */
- @Override
- public void onTraversalDone(Long sum) {
- out.printf("Total number of pairs : %d%n",sum);
- out.printf("Total number of first reads in pair : %d%n",firstOfPair);
- out.printf("Total number of second reads in pair: %d%n",secondOfPair);
- for(int i = 1; i < pairCountsByType.size(); i++) {
- if(pairCountsByType.get(i) == null)
- continue;
- out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i));
- }
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 9c24360c5..babc88966 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -270,8 +270,8 @@ public class SelectVariants extends RodWalker {
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
- * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable
- * number of variants. Use --select_random_fraction for larger numbers of variants.
+ * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
+ * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large number of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
private int numRandom = 0;
@@ -527,7 +527,7 @@ public class SelectVariants extends RodWalker {
}
}
if (SELECT_RANDOM_NUMBER) {
- randomlyAddVariant(++variantNumber, sub, ref.getBase());
+ randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub);
@@ -691,7 +691,7 @@ public class SelectVariants extends RodWalker {
return new VariantContextBuilder(builder.make()).attributes(attributes).make();
}
- private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) {
+ private void randomlyAddVariant(int rank, VariantContext vc) {
if (nVariantsAdded < numRandom)
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
index e10bcbaa0..8cba183da 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
@@ -554,4 +554,54 @@ public class GenomeLocParser {
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
}
+ /**
+  * Builds the flank immediately to the left of a loc: up to maxBasePairs bases ending at the loc start - 1.
+  * The flank is truncated so that it never starts before position 1 of the contig.
+  * @param loc The original loc
+  * @param maxBasePairs The maximum number of basePairs
+  * @return The contiguous loc of up to maxBasePairs length, or null if the loc is unmapped or already at the start of the contig.
+  */
+ @Requires({"loc != null", "maxBasePairs > 0"})
+ public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
+     if (GenomeLoc.isUnmapped(loc))
+         return null;
+
+     final String name = loc.getContig();
+     final int index = contigInfo.getSequence(name).getSequenceIndex();
+
+     // Last base of the flank; a value below 1 means the loc begins at the first base, so nothing lies to its left
+     final int flankStop = loc.getStart() - 1;
+     if (flankStop < 1)
+         return null;
+
+     // First base of the flank, clamped to the first position of the contig
+     final int flankStart = Math.max(1, loc.getStart() - maxBasePairs);
+     return createGenomeLoc(name, index, flankStart, flankStop, true);
+ }
+
+ /**
+  * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
+  * The flank is truncated at the end of the contig; its length is capped before it is added to the stop, so a very large maxBasePairs cannot overflow int.
+  * @param loc The original loc
+  * @param maxBasePairs The maximum number of basePairs
+  * @return The contiguous loc of up to maxBasePairs length, or null if the loc is unmapped or already at the end of the contig.
+  */
+ @Requires({"loc != null", "maxBasePairs > 0"})
+ public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
+     if (GenomeLoc.isUnmapped(loc))
+         return null;
+     String contigName = loc.getContig();
+     SAMSequenceRecord contig = contigInfo.getSequence(contigName);
+     int contigIndex = contig.getSequenceIndex();
+     int contigLength = contig.getSequenceLength();
+
+     int start = loc.getStop() + 1;
+     if (start > contigLength)
+         return null;
+
+     // start <= contigLength here, so at least one base remains to the right of the loc
+     int basesToContigEnd = contigLength - loc.getStop();
+     int stop = loc.getStop() + Math.min(maxBasePairs, basesToContigEnd);
+     return createGenomeLoc(contigName, contigIndex, start, stop, true);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
index 6e4ddddc4..8c1061494 100644
--- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
+++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
@@ -171,6 +171,9 @@ public class ReadClipper {
clippedRead = op.apply(algorithm, clippedRead);
}
wasClipped = true;
+ ops.clear();
+ if ( clippedRead.isEmpty() )
+ return new GATKSAMRecord( clippedRead.getHeader() );
return clippedRead;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e); // this should never happen
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
index 1aafafc27..92c8840fb 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
@@ -353,7 +353,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
// some exceptions
if ( key.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
- if ( Math.abs(g.getLog10PError() + Genotype.NO_LOG10_PERROR) < 1e-6)
+ if ( ! g.hasLog10PError() )
val = VCFConstants.MISSING_VALUE_v4;
else {
val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
index f0e164c87..159b145a0 100644
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@@ -233,8 +233,12 @@ public class IntervalUtils {
*
* Returns a null string if there are no differences, otherwise returns a string describing the difference
* (useful for UnitTests). Assumes both lists are sorted
+ *
+ * @param masterArg sorted master genome locs
+ * @param testArg sorted test genome locs
+ * @return null string if there are no differences, otherwise a string describing the difference
*/
- public static final String equateIntervals(List masterArg, List testArg) {
+ public static String equateIntervals(List masterArg, List testArg) {
LinkedList master = new LinkedList(masterArg);
LinkedList test = new LinkedList(testArg);
@@ -317,23 +321,6 @@ public class IntervalUtils {
return lengths;
}
- /**
- * Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
- * @param locs The genome locs.
- * @return The maximum number of parts the intervals can be split into.
- */
- public static int countContigIntervals(List locs) {
- int maxFiles = 0;
- String contig = null;
- for (GenomeLoc loc: locs) {
- if (contig == null || !contig.equals(loc.getContig())) {
- maxFiles++;
- contig = loc.getContig();
- }
- }
- return maxFiles;
- }
-
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
@@ -373,7 +360,6 @@ public class IntervalUtils {
* @return A list of lists of genome locs, split according to splits
*/
public static List> splitIntervalsToSubLists(List locs, List splits) {
- int locIndex = 1;
int start = 0;
List> sublists = new ArrayList>(splits.size());
for (Integer stop: splits) {
@@ -465,7 +451,7 @@ public class IntervalUtils {
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
@Ensures({"result != null"})
- final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) {
+ static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) {
final List split = new ArrayList();
long size = 0;
@@ -579,10 +565,101 @@ public class IntervalUtils {
}
}
- public static final long intervalSize(final List locs) {
+ public static long intervalSize(final List locs) {
long size = 0;
for ( final GenomeLoc loc : locs )
size += loc.size();
return size;
}
+
+ /** Writes the flanks (up to basePairs on each side) of the intervals in inputIntervals to flankingIntervals; throws UserException.MalformedFile when the input has no intervals or no flank can be produced. */
+ public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
+     ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
+     GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
+     List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
+     if (originalList.isEmpty())
+         throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
+
+     List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);
+     if (flankingList.isEmpty())
+         throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
+
+     // The output interval list carries a header with the reference's sequence dictionary; flanks are numbered from 1
+     SAMFileHeader samFileHeader = new SAMFileHeader();
+     samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
+     IntervalList intervalList = new IntervalList(samFileHeader);
+     int i = 0;
+     for (GenomeLoc loc: flankingList)
+         intervalList.add(toInterval(loc, ++i));
+     intervalList.write(flankingIntervals);
+ }
+
+ /**
+  * Returns the flanking intervals around the passed in locs. Does not extend UNMAPPED locs.
+  * The locs are sorted and merged first. Where the right flank of one loc touches or overlaps the left
+  * flank of the next loc on the same contig, a single interval covering only the gap between them is emitted.
+  * @param parser A genome loc parser for creating the new intervals
+  * @param locs Original genome locs
+  * @param basePairs Number of base pairs on each side of loc
+  * @return The list of flanking intervals, following the order of the sorted locs; empty when there are no locs
+  */
+ public static List<GenomeLoc> getFlankingIntervals(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
+     List<GenomeLoc> sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
+
+     if (sorted.isEmpty())
+         return Collections.emptyList();
+
+     List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
+     // splitByContig returns a LinkedHashMap, so the per-contig lists come back in insertion order
+     for (List<GenomeLoc> contigLocs: splitByContig(sorted).values()) {
+         int lastIndex = contigLocs.size() - 1;
+
+         // Flank to the left of the first loc on the contig (null when that loc starts at base 1)
+         GenomeLoc leftFlank = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
+         if (leftFlank != null)
+             expanded.add(leftFlank);
+
+         // Flanks between each loc[i] and loc[i+1]
+         for (int i = 0; i < lastIndex; i++) {
+             GenomeLoc afterCurrent = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
+             GenomeLoc beforeNext = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
+             if (afterCurrent.getStop() + 1 < beforeNext.getStart()) {
+                 // At least one base separates the two flanks: keep both
+                 expanded.add(afterCurrent);
+                 expanded.add(beforeNext);
+             } else {
+                 // NOTE: This is different than GenomeLoc.merge()
+                 // merge() returns a loc which covers the entire range of both flanks,
+                 // possibly returning positions inside loc(i) or loc(i+1).
+                 // Use the start of the flank after loc(i) and the stop of the flank before loc(i+1) instead.
+                 expanded.add(parser.createGenomeLoc(
+                         afterCurrent.getContig(), afterCurrent.getStart(), beforeNext.getStop()));
+             }
+         }
+
+         // Flank to the right of the last loc on the contig (null when that loc ends at the contig end)
+         GenomeLoc rightFlank = parser.createGenomeLocAtStop(contigLocs.get(lastIndex), basePairs);
+         if (rightFlank != null)
+             expanded.add(rightFlank);
+     }
+
+     return expanded;
+ }
+
+ private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
+     // Groups the mapped locs by contig name, keeping contigs in the order they are first seen; expects sorted input
+     LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
+     GenomeLoc last = null;
+     List<GenomeLoc> contigLocs = null;
+     for (GenomeLoc loc: sorted) {
+         if (GenomeLoc.isUnmapped(loc)) continue; // unmapped locs are skipped
+         if (last == null || !last.onSameContig(loc)) { // contig changed since the previous mapped loc: start a new list
+             contigLocs = new ArrayList<GenomeLoc>();
+             splits.put(loc.getContig(), contigLocs);
+         }
+         contigLocs.add(loc);
+         last = loc;
+     }
+     return splits;
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
index 6d7c8dad9..d3a52167a 100755
--- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
@@ -261,7 +261,7 @@ public class GATKSAMRecord extends BAMRecord {
* @return true if the read has no bases
*/
public boolean isEmpty() {
- return this.getReadLength() == 0;
+ return super.getReadBases() == null || super.getReadLength() == 0;
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java
index d3cc7d6a5..98032f94d 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java
@@ -25,8 +25,7 @@ final class CommonInfo {
public CommonInfo(String name, double log10PError, Set filters, Map attributes) {
this.name = name;
setLog10PError(log10PError);
- if ( filters != null && ! filters.isEmpty() )
- this.filters = filters;
+ this.filters = filters;
if ( attributes != null && ! attributes.isEmpty() ) {
this.attributes = attributes;
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index dba16cf86..bbe5308a9 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,7 +25,13 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.jgrapht.util.MathUtil;
+
+import java.util.EnumMap;
+import java.util.Map;
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
@@ -94,6 +100,47 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ //Return genotype likelihoods as an EnumMap with Genotypes (HOM_REF, HET, HOM_VAR) as keys and likelihoods as values, normalized from log10 when requested
+ //Returns null in case of missing likelihoods; the null check runs before normalization so a missing vector is never handed to MathUtils
+ public EnumMap<Genotype.Type,Double> getAsMap(boolean normalizeFromLog10){
+     double[] likelihoods = getAsVector();
+     if(likelihoods == null)
+         return null;
+     if(normalizeFromLog10) { likelihoods = MathUtils.normalizeFromLog10(likelihoods); }
+     EnumMap<Genotype.Type,Double> likelihoodsMap = new EnumMap<Genotype.Type,Double>(Genotype.Type.class);
+     likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); //NOTE(review): ordinal()-1 presumably relies on one enum constant preceding HOM_REF - confirm against Genotype.Type
+     likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
+     likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
+     return likelihoodsMap;
+ }
+
+ //Return the neg log10 Genotype Quality (GQ) for the given genotype
+ //Returns Double.NEGATIVE_INFINITY when the likelihoods are missing or the genotype has no entry in the likelihoods map (getAsMap only fills HOM_REF, HET, HOM_VAR)
+ public double getLog10GQ(Genotype.Type genotype){
+     EnumMap<Genotype.Type,Double> likelihoods = getAsMap(false);
+     if(likelihoods == null || !likelihoods.containsKey(genotype))
+         return Double.NEGATIVE_INFINITY;
+
+     //Best likelihood among the other genotypes
+     double qual = Double.NEGATIVE_INFINITY;
+     for(Map.Entry<Genotype.Type,Double> likelihood : likelihoods.entrySet()){
+         if(likelihood.getKey() != genotype && likelihood.getValue() > qual)
+             qual = likelihood.getValue();
+     }
+
+     //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
+     qual = likelihoods.get(genotype) - qual;
+
+     //Quality of other genotypes 1-P(G)
+     if (qual < 0) {
+         //The given genotype is not the most likely one: use log10(1 - P(genotype)) from the normalized likelihoods
+         double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+         double chosenGenotype = normalized[genotype.ordinal()-1];
+         qual = Math.log10(1.0 - chosenGenotype);
+     }
+     return -1 * qual;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 5ad734b79..34131b9c4 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -318,7 +318,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
public VariantContext subContextFromSamples(Set sampleNames, Collection alleles) {
loadGenotypes();
VariantContextBuilder builder = new VariantContextBuilder(this);
- return builder.genotypes(genotypes.subsetToSamples(sampleNames)).make();
+ return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make();
}
public VariantContext subContextFromSamples(Set sampleNames) {
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java
index 8d7dd82ac..17a7d1974 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java
@@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest {
SAMRecordIterator iterator = new SAMRecordIterator();
GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5);
- Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap());
+ Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap());
WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs());
WindowMaker.WindowMakerIterator window = windowMaker.next();
LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null);
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java
index dc3a6cafe..62c93bddd 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java
@@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.utils.GenomeLocParser;
@@ -42,7 +43,7 @@ import java.util.Collections;
public class MockLocusShard extends LocusShard {
public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) {
super( genomeLocParser,
- new SAMDataSource(Collections.emptyList(),genomeLocParser),
+ new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),
intervals,
null);
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java
deleted file mode 100755
index e41a6b3b7..000000000
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java
+++ /dev/null
@@ -1,223 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import static org.testng.Assert.fail;
-import net.sf.picard.reference.IndexedFastaSequenceFile;
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.BaseTest;
-import org.broadinstitute.sting.commandline.Tags;
-import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
-import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
-import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
-import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.testng.annotations.AfterMethod;
-import org.testng.annotations.BeforeMethod;
-
-import org.testng.annotations.Test;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- *
- * User: aaron
- * Date: Apr 8, 2009
- * Time: 8:14:23 PM
- *
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2009 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever. Neither
- * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
- *
- */
-
-
-/**
- * @author aaron
- * @version 1.0
- * @date Apr 8, 2009
- *
- * Class SAMBAMDataSourceUnitTest
- *
- * The test of the SAMBAM simple data source.
- */
-public class SAMBAMDataSourceUnitTest extends BaseTest {
-
- private List readers;
- private IndexedFastaSequenceFile seq;
- private GenomeLocParser genomeLocParser;
-
- /**
- * This function does the setup of our parser, before each method call.
- *
- * Called before every test case method.
- */
- @BeforeMethod
- public void doForEachTest() throws FileNotFoundException {
- readers = new ArrayList();
-
- // sequence
- seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
- genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
- }
-
- /**
- * Tears down the test fixture after each call.
- *
- * Called after every test case method.
- */
- @AfterMethod
- public void undoForEachTest() {
- seq = null;
- readers.clear();
- }
-
-
- /** Test out that we can shard the file and iterate over every read */
- @Test
- public void testLinearBreakIterateAll() {
- logger.warn("Executing testLinearBreakIterateAll");
-
- // setup the data
- readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
-
- // the sharding strat.
- SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
- ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);
- int count = 0;
-
- try {
- for (Shard sh : strat) {
- int readCount = 0;
- count++;
-
- GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
- logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
- logger.debug("count = " + count);
- StingSAMIterator datum = data.seek(sh);
-
- // for the first couple of shards make sure we can see the reads
- if (count < 5) {
- for (SAMRecord r : datum) {
- }
- readCount++;
- }
- datum.close();
-
- // if we're over 100 shards, break out
- if (count > 100) {
- break;
- }
- }
- }
- catch (UserException.CouldNotReadInputFile e) {
- e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
- }
- }
-
-
- /** Test out that we can shard the file and iterate over every read */
- @Test
- public void testMergingTwoBAMFiles() {
- logger.warn("Executing testMergingTwoBAMFiles");
-
- // setup the test files
- readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
-
- // the sharding strat.
- SAMDataSource data = new SAMDataSource(readers,genomeLocParser);
- ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser);
-
- ArrayList readcountPerShard = new ArrayList();
- ArrayList readcountPerShard2 = new ArrayList();
-
- // count up the first hundred shards
- int shardsToCount = 100;
- int count = 0;
-
- try {
- for (Shard sh : strat) {
- int readCount = 0;
- count++;
- if (count > shardsToCount) {
- break;
- }
-
- StingSAMIterator datum = data.seek(sh);
-
- for (SAMRecord r : datum) {
- readCount++;
-
- }
- readcountPerShard.add(readCount);
- logger.debug("read count = " + readCount);
- datum.close();
- }
- }
- catch (UserException.CouldNotReadInputFile e) {
- e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
- }
-
-
- // setup the data and the counter before our second run
- readers.clear();
- readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
- readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
-
- count = 0;
- // the sharding strat.
- data = new SAMDataSource(readers,genomeLocParser);
- strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser);
-
- logger.debug("Pile two:");
- try {
- for (Shard sh : strat) {
- int readCount = 0;
- count++;
-
- // can we leave?
- if (count > shardsToCount) {
- break;
- }
-
- StingSAMIterator datum = data.seek(sh);
-
- for (SAMRecord r : datum) {
- readCount++;
- }
-
- readcountPerShard2.add(readCount);
- logger.debug("read count = " + readCount);
- datum.close();
- }
- }
- catch (UserException.CouldNotReadInputFile e) {
- e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
- }
-
- /*int pos = 0;
- for (; pos < 100; pos++) {
- if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
- fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos));
- }
- } */
-
- }
-
-
-
-
-}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java
new file mode 100755
index 000000000..ba2d68ec9
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import static org.testng.Assert.fail;
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.commandline.Tags;
+import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
+import org.broadinstitute.sting.gatk.filters.ReadFilter;
+import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Apr 8, 2009
+ *
+ * Class SAMDataSourceUnitTest
+ *
+ * The test of the SAMBAM simple data source.
+ */
+public class SAMDataSourceUnitTest extends BaseTest {
+
+ private List readers;
+ private IndexedFastaSequenceFile seq;
+ private GenomeLocParser genomeLocParser;
+
+ /**
+ * This function does the setup of our parser, before each method call.
+ *
+ * Called before every test case method.
+ */
+ @BeforeMethod
+ public void doForEachTest() throws FileNotFoundException {
+ readers = new ArrayList();
+
+ // sequence
+ seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
+ genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
+ }
+
+ /**
+ * Tears down the test fixture after each call.
+ *
+ * Called after every test case method.
+ */
+ @AfterMethod
+ public void undoForEachTest() {
+ seq = null;
+ readers.clear();
+ }
+
+
+ /** Test out that we can shard the file and iterate over every read */
+ @Test
+ public void testLinearBreakIterateAll() {
+ logger.warn("Executing testLinearBreakIterateAll");
+
+ // setup the data
+ readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags()));
+
+ // the sharding strat.
+ SAMDataSource data = new SAMDataSource(readers,
+ new ThreadAllocation(),
+ null,
+ genomeLocParser,
+ false,
+ SAMFileReader.ValidationStringency.SILENT,
+ null,
+ null,
+ new ValidationExclusion(),
+ new ArrayList(),
+ false,
+ false);
+
+ Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer());
+ int count = 0;
+
+ try {
+ for (Shard sh : strat) {
+ int readCount = 0;
+ count++;
+
+ GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1);
+ logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig());
+ logger.debug("count = " + count);
+ StingSAMIterator datum = data.seek(sh);
+
+ // for the first couple of shards make sure we can see the reads
+ if (count < 5) {
+ for (SAMRecord r : datum) {
+ }
+ readCount++;
+ }
+ datum.close();
+
+ // if we're over 100 shards, break out
+ if (count > 100) {
+ break;
+ }
+ }
+ }
+ catch (UserException.CouldNotReadInputFile e) {
+ e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
+ fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
+ }
+ }
+}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
index 9de4d5c04..91c18078e 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
@@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.*;
import java.util.*;
@@ -66,9 +67,9 @@ public class RefMetaDataTrackerUnitTest {
C = Allele.create("C");
G = Allele.create("G");
T = Allele.create("T");
- AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C).make());
- AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G).make());
- AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T).make());
+ AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make();
+ AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make();
+ AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make();
span10_10 = makeSpan(10, 10);
span1_20 = makeSpan(1, 20);
span10_20 = makeSpan(10, 20);
diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java
index 7f4d96add..9226f97e2 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java
@@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
+import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
-import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser;
@@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest {
private List bamList;
private Walker countReadWalker;
private File output;
- private long readSize = 100000;
private TraverseReads traversalEngine = null;
private IndexedFastaSequenceFile ref = null;
@@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */
@Test
public void testUnmappedReadCount() {
- SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser);
- ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
- ref.getSequenceDictionary(),
- readSize,
- genomeLocParser);
+ SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser);
+ Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
countReadWalker.initialize();
Object accumulator = countReadWalker.reduceInit();
- while (shardStrategy.hasNext()) {
+ for(Shard shard: shardStrategy) {
traversalEngine.startTimersIfNecessary();
- Shard shard = shardStrategy.next();
if (shard == null) {
fail("Shard == null");
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
index 462abeba1..5c8fa32a8 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
@@ -33,7 +33,7 @@ public class SnpEffUnitTest {
@Test
public void testParseWellFormedEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertTrue( effect.isWellFormed() && effect.isCoding() );
@@ -42,7 +42,7 @@ public class SnpEffUnitTest {
@Test
public void testParseInvalidEffectNameEffect() {
String effectName = "MADE_UP_EFFECT";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
@@ -51,7 +51,7 @@ public class SnpEffUnitTest {
@Test
public void testParseInvalidEffectImpactEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
@@ -60,27 +60,27 @@ public class SnpEffUnitTest {
@Test
public void testParseWrongNumberOfMetadataFieldsEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
}
@Test
- public void testParseSnpEffWarningEffect() {
+ public void testParseSnpEffOneWarningOrErrorEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
- Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") );
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") );
}
@Test
- public void testParseSnpEffErrorEffect() {
+ public void testParseSnpEffBothWarningAndErrorEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
- Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") );
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") );
}
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index b2786117f..3bfb81dd0 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
- Arrays.asList("a6687f0d3830fa6e518b7874857f6f70"));
+ Arrays.asList("9beb795536e95954f810835c6058f2ad"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
- Arrays.asList("64b6804cb1e27826e3a47089349be581"));
+ Arrays.asList("2977bb30c8b84a5f4094fe6090658561"));
executeTest("test file has annotations, asking for annotations, #2", spec);
}
@@ -64,7 +64,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
- Arrays.asList("b59508cf66da6b2de280a79b3b7d85b1"));
+ Arrays.asList("49d989f467b8d6d8f98f7c1b67cd4a05"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}
@@ -72,7 +72,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
- Arrays.asList("09f8e840770a9411ff77508e0ed0837f"));
+ Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92"));
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
}
@@ -80,7 +80,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testExcludeAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
- Arrays.asList("b8e18b23568e4d2381f51d4430213040"));
+ Arrays.asList("33062eccd6eb73bc49440365430454c4"));
executeTest("test exclude annotations", spec);
}
@@ -88,7 +88,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testOverwritingHeader() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1,
- Arrays.asList("78d2c19f8107d865970dbaf3e12edd92"));
+ Arrays.asList("062155edec46a8c52243475fbf3a2943"));
executeTest("test overwriting header", spec);
}
@@ -96,7 +96,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoReads() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
- Arrays.asList("16e3a1403fc376320d7c69492cad9345"));
+ Arrays.asList("06635f2dd91b539bfbce9bf7914d8e43"));
executeTest("not passing it any reads", spec);
}
@@ -104,7 +104,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testDBTagWithDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
- Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d"));
+ Arrays.asList("820eeba1f6e3a0758a69d937c524a38e"));
executeTest("getting DB tag with dbSNP", spec);
}
@@ -112,7 +112,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testDBTagWithHapMap() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
- Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688"));
+ Arrays.asList("31cc2ce157dd20771418c08d6b3be1fa"));
executeTest("getting DB tag with HM3", spec);
}
@@ -120,7 +120,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testUsingExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
- Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31"));
+ Arrays.asList("074865f8f8c0ca7bfd58681f396c49e9"));
executeTest("using expression", spec);
}
@@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testUsingExpressionWithID() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
- Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5"));
+ Arrays.asList("97b26db8135d083566fb585a677fbe8a"));
executeTest("using expression with ID", spec);
}
@@ -148,9 +148,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
- "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
+ "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
1,
- Arrays.asList("122321a85e448f21679f6ca15c5e22ad")
+ Arrays.asList("51258f5c880bd1ca3eb45a1711335c66")
);
executeTest("Testing SnpEff annotations", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index 1c01fbdd4..6d4a971a5 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
-import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
index c663c1dd7..2cd76e7a5 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
@@ -6,23 +6,131 @@ import org.testng.annotations.Test;
import java.util.Arrays;
public class PhaseByTransmissionIntegrationTest extends WalkerTest {
- private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission";
- private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf";
+ private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; // root now ends with the separator, so the file names below are appended directly
+ private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; // pedigree file passed to every test via -ped
+ private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; // input VCF for testTrueNegativeMV
+ private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; // input VCF for testTruePositiveMV
+ private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; // input VCF for testFalsePositiveMV, testPriorOption and testMVFileOption
+ private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; // input VCF for testSpecialCases
+ // Tests running PhaseByTransmission (PbT) on all genotypes with default parameters
+ // and with all reporting options enabled (both the -mvf and the -o outputs are requested).
@Test
- public void testBasicFunctionality() {
+ public void testTrueNegativeMV() { // runs the walker on the TN input VCF
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T PhaseByTransmission",
"-NO_HEADER",
"-R " + b37KGReference,
- "--variant " + fundamentalTestVCF,
- "-f NA12892+NA12891=NA12878",
+ "--variant " + TNTest,
+ "-ped "+ goodFamilyFile, // family structure now comes from a pedigree file instead of the old -f argument
+ "-L 1:10109-10315",
+ "-mvf %s", // first output placeholder: the MV report file
+ "-o %s" // second output placeholder
+ ),
+ 2, // matches the two %s placeholders above
+ Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") // expected checksums, presumably in placeholder order (-mvf, then -o)
+ );
+ executeTest("testTrueNegativeMV", spec);
+ }
+
+ @Test
+ public void testTruePositiveMV() { // same command line as testTrueNegativeMV, but on the TP input VCF
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + TPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s", // first output placeholder: the MV report file
+ "-o %s" // second output placeholder
+ ),
+ 2, // matches the two %s placeholders above
+ Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") // expected checksums, presumably in placeholder order (-mvf, then -o)
+ );
+ executeTest("testTruePositiveMV", spec);
+ }
+
+ @Test
+ public void testFalsePositiveMV() { // same command line as testTrueNegativeMV, but on the FP input VCF
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s", // first output placeholder: the MV report file
+ "-o %s" // second output placeholder
+ ),
+ 2, // matches the two %s placeholders above
+ Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") // the second checksum is the one testMVFileOption expects for its lone -o output
+ );
+ executeTest("testFalsePositiveMV", spec);
+ }
+
+ @Test
+ public void testSpecialCases() { // same command line as testTrueNegativeMV, but on the "Special" input VCF
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + SpecialTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s", // first output placeholder: the MV report file
+ "-o %s" // second output placeholder
+ ),
+ 2, // matches the two %s placeholders above
+ Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") // expected checksums, presumably in placeholder order (-mvf, then -o)
+ );
+ executeTest("testSpecialCases", spec);
+ }
+
+ // Test using a non-default prior (-prior 1e-4).
+ // The FP input file is reused here, but because the prior is lowered, 3 of its calls turn into TP.
+ @Test
+ public void testPriorOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-prior 1e-4", // the only argument that differs from testFalsePositiveMV
+ "-mvf %s", // first output placeholder: the MV report file
+ "-o %s" // second output placeholder
+ ),
+ 2, // matches the two %s placeholders above
+ Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") // both differ from the testFalsePositiveMV checksums
+ );
+ executeTest("testPriorOption", spec);
+ }
+
+ // Test running without the MV reporting option (no -mvf argument).
+ // This uses exactly the same input file as the FP test, but should not generate a .mvf file.
+ @Test
+ public void testMVFileOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315", // the -o placeholder that follows is the only output requested
"-o %s"
),
1,
- Arrays.asList("")
+ Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") // same checksum as the second entry expected by testFalsePositiveMV
);
- executeTest("testBasicFunctionality", spec);
+ executeTest("testMVFileOption", spec);
}
+
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index 3dceb9bd2..102d4715e 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
- "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-noST",
"-ST FunctionalClass",
- "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"-o %s"
),
1,
- Arrays.asList("d9dcb352c53106f54fcc981f15d35a90")
+ Arrays.asList("a36414421621b377d6146d58d2fcecd0")
);
executeTest("testFunctionClassWithSnpeff", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
index f1f849bf5..e9f138a0e 100644
--- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
@@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
@@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigIndexValid() {
- SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
}
@@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigInfoKnownContig() {
- SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
}
@@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
}
+
+ private static class FlankingGenomeLocTestData extends TestDataProvider { // one flanking case: an interval plus the flanks expected at its start and at its stop
+ final GenomeLocParser parser;
+ final int basePairs; // flank size handed to createGenomeLocAtStart / createGenomeLocAtStop
+ final GenomeLoc original, flankStart, flankStop; // flankStart / flankStop stay null when null was passed for that side
+
+ private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
+ super(FlankingGenomeLocTestData.class, name);
+ this.parser = parser;
+ this.basePairs = basePairs;
+ this.original = parse(parser, original);
+ this.flankStart = flankStart == null ? null : parse(parser, flankStart); // null is kept as null rather than parsed
+ this.flankStop = flankStop == null ? null : parse(parser, flankStop);
+ }
+
+ private static GenomeLoc parse(GenomeLocParser parser, String str) {
+ return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); // the literal "unmapped" maps to GenomeLoc.UNMAPPED; everything else goes through the parser
+ }
+ }
+
+ @DataProvider(name = "flankingGenomeLocs")
+ public Object[][] getFlankingGenomeLocs() { // supplies the cases consumed by testCreateGenomeLocAtStart and testCreateGenomeLocAtStop
+ int contigLength = 10000; // passed to the artificial header; the cases below treat it as the last position of chr1
+ SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
+ GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
+
+ new FlankingGenomeLocTestData("atStartBase1", parser, 1, // result of new is discarded; presumably the TestDataProvider constructor registers it for getTests() below
+ "chr1:1", null, "chr1:2");
+
+ new FlankingGenomeLocTestData("atStartBase50", parser, 50,
+ "chr1:1", null, "chr1:2-51");
+
+ new FlankingGenomeLocTestData("atStartRange50", parser, 50,
+ "chr1:1-10", null, "chr1:11-60");
+
+ new FlankingGenomeLocTestData("atEndBase1", parser, 1,
+ "chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
+
+ new FlankingGenomeLocTestData("atEndBase50", parser, 50,
+ "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
+
+ new FlankingGenomeLocTestData("atEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 10, contigLength),
+ String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
+ null);
+
+ new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
+ "chr1:2", "chr1:1", "chr1:3");
+
+ new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
+ "chr1:21-30", "chr1:1-20", "chr1:31-80"); // only 20 bases precede the interval, so the expected start flank is shorter than 50
+
+ new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
+ "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
+
+ new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
+ String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
+ String.format("chr1:%d-%d", contigLength - 20, contigLength)); // expected stop flank ends at contigLength, so it is shorter than 50
+
+ new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
+ "chr1:3", "chr1:2", "chr1:4");
+
+ new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
+ "chr1:101-200", "chr1:51-100", "chr1:201-250");
+
+ new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
+ "chr1:" + (contigLength - 3),
+ "chr1:" + (contigLength - 4),
+ "chr1:" + (contigLength - 2));
+
+ new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
+ String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
+ String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
+
+ new FlankingGenomeLocTestData("unmapped", parser, 50,
+ "unmapped", null, null); // no flank expected on either side
+
+ new FlankingGenomeLocTestData("fullContig", parser, 50,
+ "chr1", null, null); // no flank expected on either side
+
+ return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
+ }
+
+ @Test(dataProvider = "flankingGenomeLocs")
+ public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { // checks the flank created at the start of data.original
+ GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", // failure message listing the case name and all three locations
+ data.toString(), data.original, actual, data.flankStart);
+ assertEquals(actual, data.flankStart, description); // data.flankStart is null for cases built with a null start flank
+ }
+
+ @Test(dataProvider = "flankingGenomeLocs")
+ public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { // checks the flank created at the stop of data.original
+ GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", // failure message listing the case name and all three locations
+ data.toString(), data.original, actual, data.flankStop);
+ assertEquals(actual, data.flankStop, description); // data.flankStop is null for cases built with a null stop flank
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
index 3f5d05e66..7a2696b7b 100755
--- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
@@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest {
double t6 = t.getElapsedTime();
Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart");
Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer");
-
- t.stop().start();
- Assert.assertTrue(t.isRunning(), "second started timer isn't running");
- Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset");
- Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time
}
private final static void idleLoop() {
diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
index f625af23c..ecb5a6d33 100644
--- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
@@ -30,8 +30,10 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
-import org.testng.annotations.BeforeClass;
-import org.testng.annotations.Test;
+import org.testng.annotations.*;
+
+import java.util.LinkedList;
+import java.util.List;
/**
* Created by IntelliJ IDEA.
@@ -44,180 +46,214 @@ public class ReadClipperUnitTest extends BaseTest {
// TODO: Add error messages on failed tests
+ //int debug = 0;
+
GATKSAMRecord read, expected;
ReadClipper readClipper;
final static String BASES = "ACTG";
final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63
- @BeforeClass
+
+ public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) { // shared assertion helper: compares a read's bases, quality string and CIGAR with the expected values
+ Assert.assertEquals(read.getReadBases(), readBases);
+ Assert.assertEquals(read.getBaseQualityString(), baseQuals);
+ Assert.assertEquals(read.getCigarString(), cigar);
+ }
+
+ public class testParameter { // one clipping case: arguments for the clip call plus the expected result
+ int inputStart; // start argument for the clip method under test
+ int inputStop; // stop argument for the clip method under test
+ int substringStart; // begin index into BASES / QUALS of the part expected to remain
+ int substringStop; // end index (as passed to String.substring) of the part expected to remain
+ String cigar; // expected CIGAR string of the clipped read
+
+ public testParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) {
+ inputStart = InputStart;
+ inputStop = InputStop;
+ substringStart = SubstringStart;
+ substringStop = SubstringStop;
+ cigar = Cigar;
+ }
+ }
+
+ // What the test read looks like
+ // Ref: 1 2 3 4 5 6 7 8
+ // Read: 0 1 2 3 - - - -
+ // -----------------------------
+ // Bases: A C T G - - - -
+ // Quals: ! + 5 ? - - - -
+
+ @BeforeMethod // replaces @BeforeClass: read and readClipper are rebuilt before every test method
public void init() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
- read.setReadUnmappedFlag(true);
read.setReadBases(new String(BASES).getBytes());
read.setBaseQualityString(new String(QUALS));
readClipper = new ReadClipper(read);
+ // debug aid, disabled: logger.warn(read.getCigarString());
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipBothEndsByReferenceCoordinates() {
- logger.warn("Executing testHardClipBothEndsByReferenceCoordinates");
+ logger.warn("Executing testHardClipBothEndsByReferenceCoordinates");
+ // debug aid, disabled: int debug = 1;
//Clip whole read
- Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader()));
+ Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader())); // arguments moved from 0-based to the 1-based reference positions the test read occupies (1..4)
+
//clip 1 base
- expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3);
+ expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4); // one base hard-clipped from each end, per the "1H2M1H" expectation below
Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes());
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3));
Assert.assertEquals(expected.getCigarString(), "1H2M1H");
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReadCoordinates() {
+
logger.warn("Executing testHardClipByReadCoordinates");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipByReadCoordinates(0,0);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
+ List<testParameter> testList = new LinkedList<testParameter>(); // typed list: the for-each below iterates testParameter elements
+ testList.add(new testParameter(0,0,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(3,3,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(0,1,2,4,"2H2M"));//clip 2 bases at start
+ testList.add(new testParameter(2,3,0,2,"2M2H"));//clip 2 bases at end
- //clip 1 base at end
- expected = readClipper.hardClipByReadCoordinates(3,3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
-
- //clip 2 bases at start
- expected = readClipper.hardClipByReadCoordinates(0,1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
-
- //clip 2 bases at end
- expected = readClipper.hardClipByReadCoordinates(2,3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init(); // rebuild read and readClipper so every case starts from a fresh read
+ // debug aid, disabled: logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinates() {
logger.warn("Executing testHardClipByReferenceCoordinates");
-
+ // debug aid, disabled: logger.warn(debug);
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipByReferenceCoordinates(-1,1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
+ List<testParameter> testList = new LinkedList<testParameter>(); // typed list: the for-each below iterates testParameter elements
+ testList.add(new testParameter(-1,1,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(4,-1,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(-1,2,2,4,"2H2M"));//clip 2 bases at start
+ testList.add(new testParameter(3,-1,0,2,"2M2H"));//clip 2 bases at end
- //clip 1 base at end
- expected = readClipper.hardClipByReferenceCoordinates(3,-1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
-
- //clip 2 bases at start
- expected = readClipper.hardClipByReferenceCoordinates(-1,2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
-
- //clip 2 bases at end
- expected = readClipper.hardClipByReferenceCoordinates(2,-1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init(); // rebuild read and readClipper so every case starts from a fresh read
+ // debug aid, disabled: logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinates(p.inputStart,p.inputStop),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinatesLeftTail() {
+ init();
logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
+ List<testParameter> testList = new LinkedList<testParameter>(); // typed list: the for-each below iterates testParameter elements
+ testList.add(new testParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start
+ testList.add(new testParameter(2, -1, 2, 4, "2H2M"));//clip 2 bases at start
- //clip 2 bases at start
- expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
+ for ( testParameter p : testList ) {
+ init(); // rebuild read and readClipper so every case starts from a fresh read
+ // debug aid, disabled: logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), // only inputStart is used here; inputStop is not read
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinatesRightTail() {
+ init();
logger.warn("Executing testHardClipByReferenceCoordinatesRightTail");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at end
- expected = readClipper.hardClipByReferenceCoordinatesRightTail(3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
+ List<testParameter> testList = new LinkedList<testParameter>(); // typed list: the for-each below iterates testParameter elements
+ testList.add(new testParameter(-1, 4, 0, 3, "3M1H"));//clip 1 base at end
+ testList.add(new testParameter(-1, 3, 0, 2, "2M2H"));//clip 2 bases at end
- //clip 2 bases at end
- expected = readClipper.hardClipByReferenceCoordinatesRightTail(2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init(); // rebuild read and readClipper so every case starts from a fresh read
+ // debug aid, disabled: logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), // only inputStop is used here; inputStart is not read
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true ) // TODO This function is returning null reads
public void testHardClipLowQualEnds() {
- logger.warn("Executing testHardClipByReferenceCoordinates");
+ logger.warn("Executing testHardClipLowQualEnds");
//Clip whole read
Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipLowQualEnds((byte)34);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
-
- //clip 2 bases at start
- expected = readClipper.hardClipLowQualEnds((byte)44);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
+ List<testParameter> testList = new LinkedList<testParameter>(); // typed list: the for-each below iterates testParameter elements
+ testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(11,-1,2,4,"2H2M"));//clip 2 bases at start
+ for ( testParameter p : testList ) {
+ init(); // rebuild read and readClipper so every case starts from a fresh read
+ // debug aid, disabled: logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), // inputStart carries the quality threshold in this test, not a coordinate
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
+ /* todo find a better way to test lowqual tail clipping on both sides
// Reverse Quals sequence
readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
- //clip 1 base at end
- expected = readClipper.hardClipLowQualEnds((byte)34);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
+ testList = new LinkedList<testParameter>();
+ testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end
- //clip 2 bases at end
- expected = readClipper.hardClipLowQualEnds((byte)44);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
-
- // revert Qual sequence
- readClipper.getRead().setBaseQualityString(QUALS);
+ for ( testParameter p : testList ) {
+ init();
+ readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
+ */
}
-}
+
+ public class CigarReadMaker { // placeholder: declared with no members yet
+
+ }
+
+ @Test ( enabled = false )
+ public void testHardClipSoftClippedBases() { // disabled stub: the body holds only the plan below, no assertions yet
+
+ // Plan: generate a list of cigars to test.
+ // testParameter will be reused in the following way:
+ // right tail, left tail, (remaining fields not specified yet)
+ }
+}
\ No newline at end of file
diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
index 9c3b905c2..03d33d2c5 100644
--- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
@@ -1,8 +1,8 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.ReferenceSequenceFile;
-import net.sf.picard.util.IntervalUtil;
import net.sf.samtools.SAMFileHeader;
+import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
Assert.assertEquals(merged.size(), 1);
}
+
+ /*
+ Test data is split into intervals that can be written to files and tested by writeFlankingIntervals,
+ and lists that cannot be written to files but are still handled by getFlankingIntervals.
+ */
+ private static abstract class FlankingIntervalsTestData extends TestDataProvider {
+ final public File referenceFile;
+ final public GenomeLocParser parser;
+ final int basePairs;
+ final List original;
+ final List expected;
+
+ protected FlankingIntervalsTestData(Class> clazz, String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(clazz, name);
+ this.referenceFile = referenceFile;
+ this.parser = parser;
+ this.basePairs = basePairs;
+ this.original = parse(parser, original);
+ this.expected = parse(parser, expected);
+ }
+
+ private static List parse(GenomeLocParser parser, List locs) {
+ List parsed = new ArrayList();
+ for (String loc: locs)
+ parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
+ return parsed;
+ }
+ }
+
+ private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
+ public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
+ }
+ }
+
+ private static class FlankingIntervalsList extends FlankingIntervalsTestData {
+ public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
+ }
+ }
+
+ /* Intervals where the original and the flanks can be written to files. */
+ @DataProvider(name = "flankingIntervalsFiles")
+ public Object[][] getFlankingIntervalsFiles() {
+ File hg19ReferenceFile = new File(BaseTest.hg19Reference);
+ int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
+
+ new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:1"),
+ Arrays.asList("1:2"));
+
+ new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:1"),
+ Arrays.asList("1:2-51"));
+
+ new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:1-10"),
+ Arrays.asList("1:11-60"));
+
+ new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + hg19Length1),
+ Arrays.asList("1:" + (hg19Length1 - 1)));
+
+ new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:" + hg19Length1),
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
+
+ new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
+
+ new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:2"),
+ Arrays.asList("1:1", "1:3"));
+
+ new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:21-30"),
+ Arrays.asList("1:1-20", "1:31-80"));
+
+ new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + (hg19Length1 - 1)),
+ Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
+
+ new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
+ Arrays.asList(
+ String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
+ String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
+
+ new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:3"),
+ Arrays.asList("1:2", "1:4"));
+
+ new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200"),
+ Arrays.asList("1:51-100", "1:201-250"));
+
+ new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + (hg19Length1 - 3)),
+ Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
+
+ new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
+ Arrays.asList(
+ String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
+ String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
+
+ new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:401-500"),
+ Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
+
+ new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:301-400"),
+ Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:271-400"),
+ Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:221-400"),
+ Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:121-400"),
+ Arrays.asList("1:51-100", "1:401-450"));
+
+ new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "2:301-400"),
+ Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
+
+ // Explicitly test a problematic Agilent target pair
+ new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
+ // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
+ Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
+
+ return TestDataProvider.getTests(FlankingIntervalsFile.class);
+ }
+
+ /* Intervals where the original and/or the flanks cannot be written to a file. */
+ @DataProvider(name = "flankingIntervalsLists")
+ public Object[][] getFlankingIntervalsLists() {
+ File hg19ReferenceFile = new File(BaseTest.hg19Reference);
+ List empty = Collections.emptyList();
+
+ new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ empty,
+ empty);
+
+ new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("unmapped"),
+ empty);
+
+ new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1"),
+ empty);
+
+ new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1", "2", "3"),
+ empty);
+
+ new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:301-400", "unmapped"),
+ Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
+
+ return TestDataProvider.getTests(FlankingIntervalsList.class);
+ }
+
+ @Test(dataProvider = "flankingIntervalsFiles")
+ public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
+ File originalFile = createTempFile("original.", ".intervals");
+ File flankingFile = createTempFile("flanking.", ".intervals");
+ try {
+ List lines = new ArrayList();
+ for (GenomeLoc loc: data.original)
+ lines.add(loc.toString());
+ FileUtils.writeLines(originalFile, lines);
+
+ IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
+
+ List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
+
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.expected);
+ Assert.assertEquals(actual, data.expected, description);
+ } finally {
+ FileUtils.deleteQuietly(originalFile);
+ FileUtils.deleteQuietly(flankingFile);
+ }
+ }
+
+ @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
+ public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
+ File originalFile = createTempFile("original.", ".intervals");
+ File flankingFile = createTempFile("flanking.", ".intervals");
+ try {
+ List lines = new ArrayList();
+ for (GenomeLoc loc: data.original)
+ lines.add(loc.toString());
+ FileUtils.writeLines(originalFile, lines);
+
+ // Should throw a UserException on bad input, i.e. when either the original
+ // intervals or the resulting flanking intervals are empty
+ IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
+ } finally {
+ FileUtils.deleteQuietly(originalFile);
+ FileUtils.deleteQuietly(flankingFile);
+ }
+ }
+
+ @Test(dataProvider = "flankingIntervalsLists")
+ public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
+ List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.expected);
+ Assert.assertEquals(actual, data.expected, description);
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
index 9243588ab..70a18856f 100755
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
@@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
+import org.broadinstitute.sting.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import java.util.EnumMap;
+
/**
* Basic unit test for Genotype likelihoods objects
@@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest {
gl.getAsVector();
}
+ @Test
+ public void testGetAsMap(){
+ GenotypeLikelihoods gl = new GenotypeLikelihoods(v);
+ //Log scale
+ EnumMap glMap = gl.getAsMap(false);
+ Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF));
+ Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET));
+ Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR));
+
+ //Linear scale
+ glMap = gl.getAsMap(true);
+ double [] vl = MathUtils.normalizeFromLog10(v);
+ Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF));
+ Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET));
+ Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR));
+
+ //Test missing likelihoods
+ gl = new GenotypeLikelihoods(".");
+ glMap = gl.getAsMap(false);
+ Assert.assertNull(glMap);
+
+ }
+
+ @Test
+ public void testGetLog10GQ(){
+ GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString);
+
+ //GQ for the best guess genotype
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),-3.9);
+
+ double[] test = MathUtils.normalizeFromLog10(gl.getAsVector());
+
+ //GQ for the other genotypes
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1]));
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1]));
+
+ //Test missing likelihoods
+ gl = new GenotypeLikelihoods(".");
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY);
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY);
+ Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY);
+
+ }
+
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) {
diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java
index 4c0d22f70..5f2dacdfb 100755
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java
@@ -8,6 +8,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.testng.annotations.BeforeSuite;
+import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
@@ -55,7 +56,10 @@ public class VariantContextUnitTest extends BaseTest {
ATC = Allele.create("ATC");
ATCref = Allele.create("ATC", true);
+ }
+ @BeforeTest
+ public void beforeTest() {
basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A');
snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A');
insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC)).referenceBaseForIndel((byte)'A');
@@ -75,16 +79,16 @@ public class VariantContextUnitTest extends BaseTest {
// test REF
List alleles = Arrays.asList(Tref);
- VariantContext vc = snpBuilder.alleles(alleles).make();
+ VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION);
// test SNPs
alleles = Arrays.asList(Tref, A);
- vc = snpBuilder.alleles(alleles).make();
+ vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);
alleles = Arrays.asList(Tref, A, C);
- vc = snpBuilder.alleles(alleles).make();
+ vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);
// test MNPs
@@ -98,7 +102,7 @@ public class VariantContextUnitTest extends BaseTest {
// test INDELs
alleles = Arrays.asList(Aref, ATC);
- vc = basicBuilder.alleles(alleles).make();
+ vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
alleles = Arrays.asList(ATCref, A);
@@ -106,7 +110,7 @@ public class VariantContextUnitTest extends BaseTest {
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
alleles = Arrays.asList(Tref, TA, TC);
- vc = basicBuilder.alleles(alleles).make();
+ vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
alleles = Arrays.asList(ATCref, A, AC);
@@ -131,12 +135,12 @@ public class VariantContextUnitTest extends BaseTest {
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
alleles = Arrays.asList(Aref, T, symbolic);
- vc = basicBuilder.alleles(alleles).make();
+ vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
// test SYMBOLIC
alleles = Arrays.asList(Tref, symbolic);
- vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
+ vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC);
}
@@ -280,50 +284,50 @@ public class VariantContextUnitTest extends BaseTest {
Assert.assertEquals(vc.getGenotype("foo").getType(), Genotype.Type.MIXED);
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadConstructorArgs1() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadConstructorArgs2() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadConstructorArgs3() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Throwable.class)
public void testBadConstructorArgs4() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.emptyList()).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadConstructorArgsDuplicateAlleles1() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadConstructorArgsDuplicateAlleles2() {
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make();
}
- @Test (expectedExceptions = IllegalStateException.class)
+ @Test (expectedExceptions = Throwable.class)
public void testBadLoc1() {
List alleles = Arrays.asList(Aref, T, del);
new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Throwable.class)
public void testBadID1() {
new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(null).make();
}
- @Test (expectedExceptions = IllegalArgumentException.class)
+ @Test (expectedExceptions = Exception.class)
public void testBadID2() {
- new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("");
+ new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make();
}
@Test
@@ -557,7 +561,7 @@ public class VariantContextUnitTest extends BaseTest {
@Test(dataProvider = "getAlleles")
public void testMergeAlleles(GetAllelesTest cfg) {
final List altAlleles = cfg.alleles.subList(1, cfg.alleles.size());
- final VariantContext vc = snpBuilder.alleles(cfg.alleles).make();
+ final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).referenceBaseForIndel((byte)'A').make();
Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles");
Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size");
diff --git a/public/java/src/net/sf/samtools/GATKBinList.java b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala
similarity index 57%
rename from public/java/src/net/sf/samtools/GATKBinList.java
rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala
index b53062aaf..d90db0de4 100644
--- a/public/java/src/net/sf/samtools/GATKBinList.java
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala
@@ -22,30 +22,27 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package net.sf.samtools;
+package org.broadinstitute.sting.queue.extensions.gatk
-import java.util.BitSet;
+import org.broadinstitute.sting.queue.function.InProcessFunction
+import org.broadinstitute.sting.commandline.{Output, Argument, Input}
+import java.io.File
+import org.broadinstitute.sting.utils.interval.IntervalUtils
-/**
- * A temporary solution to work around Java access rights issues:
- * override chunk and make it public.
- * TODO: Eliminate once we determine the final fate of the BAM index reading code.
- */
-public class GATKBinList extends BinList {
- /**
- * Create a new BinList over sequenceCount sequences, consisting of the given bins.
- * @param referenceSequence Reference sequence to which these bins are relevant.
- * @param bins The given bins to include.
- */
- public GATKBinList(final int referenceSequence, final BitSet bins) {
- super(referenceSequence,bins);
- }
+class WriteFlankingIntervalsFunction extends InProcessFunction {
+ @Input(doc="The reference sequence")
+ var reference : File = _
- /**
- * Retrieves the bins stored in this list.
- * @return A bitset where a bin is present in the list if the bit is true.
- */
- public BitSet getBins() {
- return super.getBins();
- }
+ @Input(doc="The interval list to flank")
+ var inputIntervals : File = _
+
+ @Output(doc="The output intervals file to write to")
+ var outputIntervals: File = _
+
+ @Argument(doc="Number of base pair to flank the input intervals")
+ var flankSize : Int = _
+
+ def run() {
+ IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
+ }
}
diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
deleted file mode 100755
index 77eb3ccbc..000000000
--- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.broadinstitute.sting.queue.library.ipf.intervals
-
-import org.broadinstitute.sting.queue.function.InProcessFunction
-import org.broadinstitute.sting.commandline._
-import java.io.{PrintStream, File}
-import collection.JavaConversions._
-import org.broadinstitute.sting.utils.text.XReadLines
-import net.sf.picard.reference.FastaSequenceFile
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
-import collection.immutable.TreeSet
-
-// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
-class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
- @Input(doc="The interval list to expand") val inList : File = in
- @Input(doc="The reference sequence") val refDict : File = ref
- @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
- @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
- @Output(doc="The output intervals file to write to") val outList : File = out
- @Argument(doc="The output format for the intervals") val outTypeStr = opType
- @Argument(doc="The input format for the intervals") val inTypeStr = ipType
-
- var output : PrintStream = _
- var parser : GenomeLocParser = _
- var xrl : XReadLines = _
- val outType = IntervalFormatType.convert(outTypeStr)
- val inType = IntervalFormatType.convert(inTypeStr)
-
- var offsetIn : Int = 0
- var offsetOut : Int = 0
-
- var first : Boolean = true
- var lastTwo : (GenomeLoc,GenomeLoc) = _
-
- var intervalCache : TreeSet[GenomeLoc] = _
- val LINES_TO_CACHE : Int = 1000
-
- def run = {
- output = new PrintStream(outList)
- intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
- def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
- })
- parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
- xrl = new XReadLines(inList)
- offsetIn = if (isBed(inType)) 1 else 0
- offsetOut = if( isBed(outType)) 1 else 0
- var line : String = xrl.next
- while ( line.startsWith("@") ) {
- line = xrl.next
- }
- var prevLoc: GenomeLoc = null
- var curLoc: GenomeLoc = null
- var nextLoc : GenomeLoc = parseGenomeInterval(line)
- var linesProcessed : Int = 1
- while ( prevLoc != null || curLoc != null || nextLoc != null ) {
- prevLoc = curLoc
- curLoc = nextLoc
- nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
- if ( curLoc != null ) {
- val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
- val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
- if ( left != null ) {
- intervalCache += left
- }
- if ( right != null ) {
- intervalCache += right
- }
- }
- linesProcessed += 1
- if ( linesProcessed % LINES_TO_CACHE == 0 ) {
- val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
- intervalCache = intervalCache -- toPrint
- toPrint.foreach(u => output.print("%s%n".format(repr(u))))
- }
- //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
- }
-
- intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
-
- output.close()
- }
-
- def expandLeft(g: GenomeLoc) : GenomeLoc = {
- parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
- }
-
- def expandRight(g: GenomeLoc) : GenomeLoc = {
- parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
- }
-
- def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
- if ( borderG == null || ! newG.overlapsP(borderG) ) {
- return newG
- } else {
- if ( newG.getStart < borderG.getStart ) {
- if ( borderG.getStart - startInt > newG.getStart ) {
- return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
- }
- } else {
- if ( borderG.getStop + startInt < newG.getStop ){
- return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
- }
- }
- }
-
- null
- }
-
- def repr(loc : GenomeLoc) : String = {
- if ( loc == null ) return "null"
- if ( outType == IntervalFormatType.INTERVALS ) {
- return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
- } else {
- return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
- }
- }
-
- def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
- t == IntervalFormatType.BED
- }
-
- def parseGenomeInterval( s : String ) : GenomeLoc = {
- val sp = s.split("\\s+")
- // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
- if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
- }
-
- object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
- type IntervalFormatType = Value
- val INTERVALS,BED,TDF = Value
-
- def convert(s : String) : IntervalFormatType = {
- if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
- }
- }
-}
\ No newline at end of file
diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
deleted file mode 100755
index e929477a1..000000000
--- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-package org.broadinstitute.sting.queue.library.ipf.intervals
-
-import org.broadinstitute.sting.queue.function.InProcessFunction
-import collection.JavaConversions._
-import org.broadinstitute.sting.commandline._
-import java.io.{PrintStream, File}
-import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
-import org.broadinstitute.sting.utils.text.XReadLines
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
-
-class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
- @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
- @Output(doc="Output interval file to which to write") val output : File = outFile
- @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
- @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed
-
-
- var outStream : PrintStream = _
- var contigs : List[String] = Nil
- var dict : SAMSequenceDictionary = _
- var parser : GenomeLocParser = _
-
- def run = {
- outStream = new PrintStream(output)
- dict = new SAMSequenceDictionary
- // note: memory hog
- val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
- sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
- parser = new GenomeLocParser(dict)
- sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
- outStream.close()
- }
-
- def writeOut(g : GenomeLoc) : Unit = {
- outStream.print("%s%n".format(g.toString))
- }
-
- def parse(s : String) : (String,Int,Int) = {
- if ( s.contains(":") ) {
- val split1 = s.split(":")
- val split2 = split1(1).split("-")
- return (split1(0),split2(0).toInt,split2(1).toInt)
- } else {
- val split = s.split("\\s+")
- return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
- }
- }
-
- def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
- parser.createGenomeLoc(coords._1,coords._2,coords._3)
- }
-
- def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
- if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
- a.init :+ (a.last :+ b)
- } else {
- a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
- }
- }
-
- def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
- if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for
- null
- }
- u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
- }
-
- def range(a: Int, b: Int) : Range = new Range(a,b+1,1)
-
-}
\ No newline at end of file
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
old mode 100755
new mode 100644
similarity index 88%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
index bfd06f97f..ee5d02367
Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
similarity index 77%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
index f0568def4..5417641d3 100644
--- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
+++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
@@ -1,3 +1,3 @@
-
+