diff --git a/ivy.xml b/ivy.xml index 96c1de844..ee24bc367 100644 --- a/ivy.xml +++ b/ivy.xml @@ -76,7 +76,7 @@ - + diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java new file mode 100644 index 000000000..5005b6265 --- /dev/null +++ b/public/java/src/net/sf/samtools/BAMFileReader.java @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Internal class for reading and querying BAM files. + */ +class BAMFileReader extends SAMFileReader.ReaderImplementation { + // True if reading from a File rather than an InputStream + private boolean mIsSeekable = false; + + // For converting bytes into other primitive types + private BinaryCodec mStream = null; + + // Underlying compressed data stream. + private final BAMInputStream mInputStream; + private SAMFileHeader mFileHeader = null; + + // Populated if the file is seekable and an index exists + private File mIndexFile; + private BAMIndex mIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + + // If true, all SAMRecords are fully decoded as they are read. + private final boolean eagerDecode; + + // For error-checking. + private ValidationStringency mValidationStringency; + + // For creating BAMRecords + private SAMRecordFactory samRecordFactory; + + /** + * Use the caching index reader implementation rather than the disk-hit-per-file model. + */ + private boolean mEnableIndexCaching = false; + + /** + * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. + */ + private boolean mEnableIndexMemoryMapping = true; + + /** + * Add information about the origin (reader and position) to SAM records. + */ + private SAMFileReader mFileReader = null; + + /** + * Prepare to read BAM from a stream (not seekable) + * @param stream source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final InputStream stream, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = false; + mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(null); + } + + /** + * Prepare to read BAM from a file (seekable) + * @param file source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final File file, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); + if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { + System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + + " is older than BAM " + file.getAbsolutePath()); + } + } + + BAMFileReader(final SeekableStream strm, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm), + indexFile, + eagerDecode, + strm.getSource(), + validationStringency, + factory); + } + + private BAMFileReader(final BAMInputStream inputStream, + final File indexFile, + final boolean eagerDecode, + final String source, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = true; + mInputStream = inputStream; + mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(source); + mFirstRecordPointer = inputStream.getFilePointer(); + } + + /** + * If true, writes the source of every read into the source SAMRecords. + * @param enabled true to write source information into each SAMRecord. + */ + void enableFileSource(final SAMFileReader reader, final boolean enabled) { + this.mFileReader = enabled ? reader : null; + } + + /** + * If true, uses the caching version of the index reader. + * @param enabled true to write source information into each SAMRecord. + */ + public void enableIndexCaching(final boolean enabled) { + if(mIndex != null) + throw new SAMException("Unable to turn on index caching; index file has already been loaded."); + this.mEnableIndexCaching = enabled; + } + + /** + * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). + * This is slower but more scalable when accessing large numbers of BAM files sequentially. + * @param enabled True to use memory mapping, false to use regular I/O. + */ + public void enableIndexMemoryMapping(final boolean enabled) { + if (mIndex != null) { + throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); + } + this.mEnableIndexMemoryMapping = enabled; + } + + @Override void enableCrcChecking(final boolean enabled) { + this.mInputStream.setCheckCrcs(enabled); + } + + @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } + + /** + * @return true if ths is a BAM file, and has an index + */ + public boolean hasIndex() { + return (mIndexFile != null); + } + + /** + * Retrieves the index for the given file type. Ensure that the index is of the specified type. + * @return An index of the given type. + */ + public BAMIndex getIndex() { + if(mIndexFile == null) + throw new SAMException("No index is available for this BAM file."); + if(mIndex == null) + mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) + : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); + return mIndex; + } + + void close() { + if (mStream != null) { + mStream.close(); + } + if (mIndex != null) { + mIndex.close(); + } + mStream = null; + mFileHeader = null; + mIndex = null; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Set error-checking level for subsequent SAMRecord reads. + */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + SAMFileReader.ValidationStringency getValidationStringency() { + return this.mValidationStringency; + } + + /** + * Prepare to iterate through the SAMRecords in file order. + * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, + * that iterator must be closed before getIterator() can be called again. + * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to + * getIterator() begins its iteration where the last one left off. That is the best that can be + * done in that situation. + */ + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + @Override + CloseableIterator getIterator(final SAMFileSpan chunks) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!(chunks instanceof BAMFileSpan)) { + throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); + } + + // Create an iterator over the given chunk boundaries. + mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); + return mCurrentIterator; + } + + /** + * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know + * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, + * the rightmost bound is guaranteed to be after the last read in the file. + * @return An unbounded pointer to the first record in the BAM file. + */ + @Override + SAMFileSpan getFilePointerSpanningReads() { + return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); + } + + /** + * Prepare to iterate through the SAMRecords that match the given interval. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. + * A value of zero implies the start of the reference sequence. + * @param end A value of zero implies the end of the reference sequence. + * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval + * specified by start and end. If false, the SAMRecords need only overlap the interval. + * @return Iterator for the matching SAMRecords + */ + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords with the given alignment start. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Alignment start sought. + * @return Iterator for the matching SAMRecords. + */ + CloseableIterator queryAlignmentStart(final String sequence, final int start) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); + return mCurrentIterator; + } + + public CloseableIterator queryUnmapped() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + try { + final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); + if (startOfLastLinearBin != -1) { + mInputStream.seek(startOfLastLinearBin); + } else { + // No mapped reads in file, just start at the first read in file. + mInputStream.seek(mFirstRecordPointer); + } + mCurrentIterator = new BAMFileIndexUnmappedIterator(); + return mCurrentIterator; + } catch (IOException e) { + throw new RuntimeException("IOException seeking to unmapped reads", e); + } + } + + /** + * Reads the header from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private void readHeader(final String source) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(mValidationStringency); + mFileHeader = headerCodec.decode(new StringLineReader(textHeader), + source); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + mFileHeader.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + source); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + source); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(source)); + } + mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } + } + + /** + * Reads a single binary sequence record from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private SAMSequenceRecord readSequenceRecord(final String source) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); + } + + /** + * Iterator for non-indexed sequential iteration through all SAMRecords in file. + * Starting point of iteration is wherever current file position is when the iterator is constructed. + */ + private class BAMFileIterator implements CloseableIterator { + private SAMRecord mNextRecord = null; + private final BAMRecordCodec bamRecordCodec; + private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file + + BAMFileIterator() { + this(true); + } + + /** + * @param advance Trick to enable subclass to do more setup before advancing + */ + BAMFileIterator(final boolean advance) { + this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); + this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); + + if (advance) { + advance(); + } + } + + public void close() { + if (mCurrentIterator != null && this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public SAMRecord next() { + final SAMRecord result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + + if (mNextRecord != null) { + ++this.samRecordIndex; + // Because some decoding is done lazily, the record needs to remember the validation stringency. + mNextRecord.setValidationStringency(mValidationStringency); + + if (mValidationStringency != ValidationStringency.SILENT) { + final List validationErrors = mNextRecord.isValid(); + SAMUtils.processValidationErrors(validationErrors, + this.samRecordIndex, BAMFileReader.this.getValidationStringency()); + } + } + if (eagerDecode && mNextRecord != null) { + mNextRecord.eagerDecode(); + } + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + /** + * Read the next record from the input stream. + */ + SAMRecord getNextRecord() throws IOException { + final long startCoordinate = mInputStream.getFilePointer(); + final SAMRecord next = bamRecordCodec.decode(); + final long stopCoordinate = mInputStream.getFilePointer(); + + if(mFileReader != null && next != null) + next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); + + return next; + } + + /** + * @return The record that will be return by the next call to next() + */ + protected SAMRecord peek() { + return mNextRecord; + } + } + + /** + * Prepare to iterate through SAMRecords matching the target interval. + * @param sequence Desired reference sequence. + * @param start 1-based start of target interval, inclusive. + * @param end 1-based end of target interval, inclusive. + * @param queryType contained, overlapping, or starting-at query. + */ + private CloseableIterator createIndexIterator(final String sequence, + final int start, + final int end, + final QueryType queryType) { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + final SAMFileHeader fileHeader = getFileHeader(); + final int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMIndex fileIndex = getIndex(); + final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end); + filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; + } + + // Create an iterator over the above chunk boundaries. + final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); + + // Add some preprocessing filters for edge-case reads that don't fit into this + // query type. + return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); + } + + enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} + + /** + * Look for BAM index file according to standard naming convention. + * + * @param dataFile BAM file name. + * @return Index file name, or null if not found. + */ + private static File findIndexFile(final File dataFile) { + // If input is foo.bam, look for foo.bai + final String bamExtension = ".bam"; + File indexFile; + final String fileName = dataFile.getName(); + if (fileName.endsWith(bamExtension)) { + final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix; + indexFile = new File(dataFile.getParent(), bai); + if (indexFile.exists()) { + return indexFile; + } + } + + // If foo.bai doesn't exist look for foo.bam.bai + indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + private class BAMFileIndexIterator extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + + /** + * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. + * @param filePointers the block / offset combination, stored in chunk format. + */ + BAMFileIndexIterator(final long[] filePointers) { + super(false); // delay advance() until after construction + mFilePointers = filePointers; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + // Advance to next file block if necessary + while (mInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + return super.getNextRecord(); + } + } + + /** + * A decorating iterator that filters out records that are outside the bounds of the + * given query parameters. + */ + private class BAMQueryFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private final CloseableIterator wrappedIterator; + + /** + * The next record to be returned. Will be null if no such record exists. + */ + private SAMRecord mNextRecord; + + private final int mReferenceIndex; + private final int mRegionStart; + private final int mRegionEnd; + private final QueryType mQueryType; + + public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { + this.wrappedIterator = iterator; + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + mRegionStart = start; + if (queryType == QueryType.STARTING_AT) { + mRegionEnd = mRegionStart; + } else { + mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; + } + mQueryType = queryType; + mNextRecord = advance(); + } + + /** + * Returns true if a next element exists; false otherwise. + */ + public boolean hasNext() { + return mNextRecord != null; + } + + /** + * Gets the next record from the given iterator. + * @return The next SAM record in the iterator. + */ + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); + final SAMRecord currentRead = mNextRecord; + mNextRecord = advance(); + return currentRead; + } + + /** + * Closes down the existing iterator. + */ + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + SAMRecord advance() { + while (true) { + // Pull next record from stream + if(!wrappedIterator.hasNext()) + return null; + + final SAMRecord record = wrappedIterator.next(); + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + // If read is unmapped but has a coordinate, return it if the coordinate is within + // the query region, regardless of whether the mapped mate will be returned. + final int alignmentEnd; + if (mQueryType == QueryType.STARTING_AT) { + alignmentEnd = -1; + } else { + alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? + record.getAlignmentEnd(): alignmentStart); + } + + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + return null; + } + // Filter for overlap with region + if (mQueryType == QueryType.CONTAINED) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else if (mQueryType == QueryType.OVERLAPPING) { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } else { + if (alignmentStart == mRegionStart) { + return record; + } + } + } + } + } + + private class BAMFileIndexUnmappedIterator extends BAMFileIterator { + private BAMFileIndexUnmappedIterator() { + while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + advance(); + } + } + } + +} diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index 623f46291..4692c6671 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -25,6 +25,7 @@ package net.sf.samtools; import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan { super(); } + /** + * Create a new GATKBAMFileSpan from an existing BAMFileSpan. + * @param sourceFileSpan + */ + public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) { + if(!(sourceFileSpan instanceof BAMFileSpan)) + throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead"); + BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan; + for(Chunk chunk: sourceBAMFileSpan.getChunks()) + add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk)); + } + /** * Convenience constructor to construct a BAM file span from * a single chunk. diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index f590809e2..5d349e72e 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -69,6 +69,22 @@ public class GATKChunk extends Chunk { super.setChunkEnd(value); } + public long getBlockStart() { + return getChunkStart() >>> 16; + } + + public int getBlockOffsetStart() { + return (int)(getChunkStart() & 0xFFFF); + } + + public long getBlockEnd() { + return getChunkEnd() >>> 16; + } + + public int getBlockOffsetEnd() { + return ((int)getChunkEnd() & 0xFFFF); + } + /** * Computes an approximation of the uncompressed size of the * chunk, in bytes. Can be used to determine relative weights diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java new file mode 100644 index 000000000..d825c23d5 --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package net.sf.samtools.util; + +import java.io.IOException; + +/** + * An input stream formulated for use reading BAM files. Supports + */ +public interface BAMInputStream { + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) throws IOException; + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer(); + + /** + * Determines whether or not the inflater will re-calculated the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. + */ + public void setCheckCrcs(final boolean check); + + public int read() throws java.io.IOException; + + public int read(byte[] bytes) throws java.io.IOException; + + public int read(byte[] bytes, int i, int i1) throws java.io.IOException; + + public long skip(long l) throws java.io.IOException; + + public int available() throws java.io.IOException; + + public void close() throws java.io.IOException; + + public void mark(int i); + + public void reset() throws java.io.IOException; + + public boolean markSupported(); +} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java new file mode 100755 index 000000000..fae2fc89b --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java @@ -0,0 +1,483 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools.util; + + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import net.sf.samtools.FileTruncatedException; + +/* + * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. + * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. + * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the + * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. + * + * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format + */ +public class BlockCompressedInputStream extends InputStream implements BAMInputStream { + private InputStream mStream = null; + private SeekableStream mFile = null; + private byte[] mFileBuffer = null; + private byte[] mCurrentBlock = null; + private int mCurrentOffset = 0; + private long mBlockAddress = 0; + private int mLastBlockLength = 0; + private final BlockGunzipper blockGunzipper = new BlockGunzipper(); + + + /** + * Note that seek() is not supported if this ctor is used. + */ + public BlockCompressedInputStream(final InputStream stream) { + mStream = IOUtil.toBufferedStream(stream); + mFile = null; + } + + /** + * Use this ctor if you wish to call seek() + */ + public BlockCompressedInputStream(final File file) + throws IOException { + mFile = new SeekableFileStream(file); + mStream = null; + + } + + public BlockCompressedInputStream(final URL url) { + mFile = new SeekableBufferedStream(new SeekableHTTPStream(url)); + mStream = null; + } + + /** + * For providing some arbitrary data source. No additional buffering is + * provided, so if the underlying source is not buffered, wrap it in a + * SeekableBufferedStream before passing to this ctor. + */ + public BlockCompressedInputStream(final SeekableStream strm) { + mFile = strm; + mStream = null; + } + + /** + * Determines whether or not the inflater will re-calculated the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. + */ + public void setCheckCrcs(final boolean check) { + this.blockGunzipper.setCheckCrcs(check); + } + + /** + * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the + * next caller of a method for this input stream. The next caller might be the same thread or another thread. + * Note that although the next caller can read this many bytes without blocking, the available() method call itself + * may block in order to fill an internal buffer if it has been exhausted. + */ + public int available() + throws IOException { + if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { + readBlock(); + } + if (mCurrentBlock == null) { + return 0; + } + return mCurrentBlock.length - mCurrentOffset; + } + + /** + * Closes the underlying InputStream or RandomAccessFile + */ + public void close() + throws IOException { + if (mFile != null) { + mFile.close(); + mFile = null; + } else if (mStream != null) { + mStream.close(); + mStream = null; + } + // Encourage garbage collection + mFileBuffer = null; + mCurrentBlock = null; + } + + /** + * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255. + * If no byte is available because the end of the stream has been reached, the value -1 is returned. + * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown. + + * @return the next byte of data, or -1 if the end of the stream is reached. + */ + public int read() + throws IOException { + return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1; + } + + /** + * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes + * actually read is returned as an integer. This method blocks until input data is available, end of file is detected, + * or an exception is thrown. + * + * read(buf) has the same effect as read(buf, 0, buf.length). + * + * @param buffer the buffer into which the data is read. + * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of + * the stream has been reached. + */ + public int read(final byte[] buffer) + throws IOException { + return read(buffer, 0, buffer.length); + } + + private volatile ByteArrayOutputStream buf = null; + private static final byte eol = '\n'; + private static final byte eolCr = '\r'; + + /** + * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'), + * carriage return ('\r') or carriage return followed by a line feed ("\r\n"). + * + * @return A String containing the contents of the line, excluding the line terminating + * character, or null if the end of the stream has been reached + * + * @exception IOException If an I/O error occurs + * @ + */ + public String readLine() throws IOException { + int available = available(); + if (available == 0) { + return null; + } + if(null == buf){ // lazy initialisation + buf = new ByteArrayOutputStream(8192); + } + buf.reset(); + boolean done = false; + boolean foundCr = false; // \r found flag + while (!done) { + int linetmpPos = mCurrentOffset; + int bCnt = 0; + while((available-- > 0)){ + final byte c = mCurrentBlock[linetmpPos++]; + if(c == eol){ // found \n + done = true; + break; + } else if(foundCr){ // previous char was \r + --linetmpPos; // current char is not \n so put it back + done = true; + break; + } else if(c == eolCr){ // found \r + foundCr = true; + continue; // no ++bCnt + } + ++bCnt; + } + if(mCurrentOffset < linetmpPos){ + buf.write(mCurrentBlock, mCurrentOffset, bCnt); + mCurrentOffset = linetmpPos; + } + available = available(); + if(available == 0){ + // EOF + done = true; + } + } + return buf.toString(); + } + + /** + * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read + * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer. + * + * This method blocks until input data is available, end of file is detected, or an exception is thrown. + * + * @param buffer buffer into which data is read. + * @param offset the start offset in array b at which the data is written. + * @param length the maximum number of bytes to read. + * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of + * the stream has been reached. + */ + public int read(final byte[] buffer, int offset, int length) + throws IOException { + final int originalLength = length; + while (length > 0) { + final int available = available(); + if (available == 0) { + // Signal EOF to caller + if (originalLength == length) { + return -1; + } + break; + } + final int copyLength = Math.min(length, available); + System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); + mCurrentOffset += copyLength; + offset += copyLength; + length -= copyLength; + } + return originalLength - length; + } + + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) + throws IOException { + if (mFile == null) { + throw new IOException("Cannot seek on stream based file"); + } + // Decode virtual file pointer + // Upper 48 bits is the byte offset into the compressed stream of a block. + // Lower 16 bits is the byte offset into the uncompressed stream inside the block. + final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); + final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); + final int available; + if (mBlockAddress == compressedOffset && mCurrentBlock != null) { + available = mCurrentBlock.length; + } else { + mFile.seek(compressedOffset); + mBlockAddress = compressedOffset; + mLastBlockLength = 0; + readBlock(); + available = available(); + } + if (uncompressedOffset > available || + (uncompressedOffset == available && !eof())) { + throw new IOException("Invalid file pointer: " + pos); + } + mCurrentOffset = uncompressedOffset; + } + + private boolean eof() throws IOException { + if (mFile.eof()) { + return true; + } + // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. + return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + } + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer() { + if (mCurrentOffset == mCurrentBlock.length) { + // If current offset is at the end of the current block, file pointer should point + // to the beginning of the next block. + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); + } + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); + } + + public static long getFileBlock(final long bgzfOffset) { + return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); + } + + /** + * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). + * @return true if the given file looks like a valid BGZF file. + */ + public static boolean isValidFile(final InputStream stream) + throws IOException { + if (!stream.markSupported()) { + throw new RuntimeException("Cannot test non-buffered stream"); + } + stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + stream.reset(); + return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); + } + + private static boolean isValidBlockHeader(final byte[] buffer) { + return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && + (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && + (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && + buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && + buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && + buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); + } + + private void readBlock() + throws IOException { + + if (mFileBuffer == null) { + mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; + } + int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + if (count == 0) { + // Handle case where there is no empty gzip block at end. + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mCurrentBlock = new byte[0]; + return; + } + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + throw new IOException("Premature end of file"); + } + final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; + if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { + throw new IOException("Unexpected compressed block length: " + blockLength); + } + final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; + count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); + if (count != remaining) { + throw new FileTruncatedException("Premature end of file"); + } + inflateBlock(mFileBuffer, blockLength); + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mLastBlockLength = blockLength; + } + + private void inflateBlock(final byte[] compressedBlock, final int compressedLength) + throws IOException { + final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); + byte[] buffer = mCurrentBlock; + mCurrentBlock = null; + if (buffer == null || buffer.length != uncompressedLength) { + try { + buffer = new byte[uncompressedLength]; + } catch (NegativeArraySizeException e) { + throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); + } + } + blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); + mCurrentBlock = buffer; + } + + private int readBytes(final byte[] buffer, final int offset, final int length) + throws IOException { + if (mFile != null) { + return readBytes(mFile, buffer, offset, length); + } else if (mStream != null) { + return readBytes(mStream, buffer, offset, length); + } else { + return 0; + } + } + + private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = file.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + private int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } + + public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} + + public static FileTermination checkTermination(final File file) + throws IOException { + final long fileSize = file.length(); + if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { + return FileTermination.DEFECTIVE; + } + final RandomAccessFile raFile = new RandomAccessFile(file, "r"); + try { + raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; + raFile.readFully(buf); + if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { + return FileTermination.HAS_TERMINATOR_BLOCK; + } + final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); + buf = new byte[bufsize]; + raFile.seek(fileSize - bufsize); + raFile.read(buf); + for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; + i >= 0; --i) { + if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE, + buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) { + continue; + } + final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF; + if (buf.length - i == totalBlockSizeMinusOne + 1) { + return FileTermination.HAS_HEALTHY_LAST_BLOCK; + } else { + return FileTermination.DEFECTIVE; + } + } + return FileTermination.DEFECTIVE; + } finally { + raFile.close(); + } + } + + private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) { + for (int i = 0; i < length; ++i) { + if (preamble[i] != buf[i + startOffset]) { + return false; + } + } + return true; + } +} + + diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index bed1e710e..b0b57f7fc 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -331,12 +331,12 @@ public abstract class CommandLineProgram { * used to indicate an error occured * * @param msg the message - * @param e the error + * @param t the error */ - public static void exitSystemWithError(String msg, final Exception e) { + public static void exitSystemWithError(String msg, final Throwable t) { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("stack trace %n"); - e.printStackTrace(); + t.printStackTrace(); errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); @@ -394,8 +394,8 @@ public abstract class CommandLineProgram { * * @param e the exception occured */ - public static void exitSystemWithError(Exception e) { - exitSystemWithError(e.getMessage(), e); + public static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index b8488dc9a..d3db35c07 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -99,8 +99,8 @@ public class CommandLineGATK extends CommandLineExecutable { } catch (net.sf.samtools.SAMException e) { // Let's try this out and see how it is received by our users exitSystemWithSamError(e); - } catch (Exception e) { - exitSystemWithError(e); + } catch (Throwable t) { + exitSystemWithError(t); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f8e87aa58..f2e0b5d0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; @@ -126,6 +127,11 @@ public class GenomeAnalysisEngine { */ private Collection filters; + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + /** * A currently hacky unique name for this GATK instance */ @@ -199,6 +205,9 @@ public class GenomeAnalysisEngine { if (this.getArguments().nonDeterministicRandomSeed) resetRandomGenerator(System.currentTimeMillis()); + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + // Prepare the data for traversal. initializeDataSources(); @@ -218,7 +227,7 @@ public class GenomeAnalysisEngine { // create the output streams " initializeOutputStreams(microScheduler.getOutputTracker()); - ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); // execute the microscheduler, storing the results return microScheduler.execute(this.walker, shardStrategy); @@ -266,6 +275,16 @@ public class GenomeAnalysisEngine { return Collections.unmodifiableList(filters); } + /** + * Parse out the thread allocation from the given command-line argument. + */ + private void determineThreadAllocation() { + Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null; + Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null; + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads); + } + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. @@ -286,7 +305,7 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); } - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); } protected DownsamplingMethod getDownsamplingMethod() { @@ -397,103 +416,49 @@ public class GenomeAnalysisEngine { * @param intervals intervals * @return the sharding strategy */ - protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original - // sharding system; it's required with the new sharding system only for locus walkers. - if(readsDataSource != null && !readsDataSource.hasIndex() ) { - if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - Shard.ShardType shardType; if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - shardType = Shard.ShardType.LOCUS; + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); } - else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) - shardType = Shard.ShardType.READ; else - throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files"); - - List region; - if(intervals != null) - region = intervals.toList(); - else { - region = new ArrayList(); - for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences()) - region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength())); - } - - return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region); + throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + final int SHARD_SIZE = walker instanceof RodWalker ? 100000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); } - - ShardStrategy shardStrategy; - ShardStrategyFactory.SHATTER_STRATEGY shardType; - - long SHARD_SIZE = 100000L; - - if (walker instanceof LocusWalker) { - if (walker instanceof RodWalker) SHARD_SIZE *= 1000; - - if (intervals != null && !intervals.isEmpty()) { - if (readsDataSource == null) - throw new IllegalArgumentException("readsDataSource is null"); - if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE,getGenomeLocParser()); - } else if (walker instanceof ReadWalker || - walker instanceof DuplicateWalker) { - shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; - - if (intervals != null && !intervals.isEmpty()) { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } - } else if (walker instanceof ReadPairWalker) { - if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } else - throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName()); - - return shardStrategy; } protected boolean flashbackData() { @@ -751,6 +716,8 @@ public class GenomeAnalysisEngine { return new SAMDataSource( samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, genomeLocParser, argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, @@ -763,8 +730,7 @@ public class GenomeAnalysisEngine { getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, - argCollection.defaultBaseQualities, - !argCollection.disableLowMemorySharding); + argCollection.defaultBaseQualities); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8078a1ea4..64b63dcd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -194,10 +194,14 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false) - public int numberOfThreads = 1; + /** How many threads should be allocated to this analysis. */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) + public Integer numberOfThreads = 1; - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false) + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) + public Integer numberOfBAMFileHandles = null; + + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; // -------------------------------------------------------------------------------------------------------------- @@ -292,9 +296,6 @@ public class GATKArgumentCollection { @Hidden public boolean allowIntervalsWithUnindexedBAM = false; - @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false) - public boolean disableLowMemorySharding = false; - // -------------------------------------------------------------------------------------------------------------- // // methods @@ -365,7 +366,11 @@ public class GATKArgumentCollection { (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { return false; } - if (other.numberOfThreads != this.numberOfThreads) { + if (!other.numberOfThreads.equals(this.numberOfThreads)) { + return false; + } + if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) || + (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) { return false; } if (other.intervalMerging != this.intervalMerging) { @@ -389,9 +394,6 @@ public class GATKArgumentCollection { if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; - if (disableLowMemorySharding != other.disableLowMemorySharding) - return false; - return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java deleted file mode 100644 index de938e845..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 7, 2011 - * Time: 2:46:34 PM - * To change this template use File | Settings | File Templates. - */ -public class BAMBlockStartIterator implements Iterator { - /** - * How large is a BGZF header? - */ - private static int BGZF_HEADER_SIZE = 18; - - /** - * Where within the header does the BLOCKSIZE actually live? - */ - private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2; - - private FileChannel bamInputChannel; - private ByteBuffer headerByteBuffer; - - private long nextLocation = 0; - - public BAMBlockStartIterator(File bamFile) { - try { - FileInputStream bamInputStream = new FileInputStream(bamFile); - bamInputChannel = bamInputStream.getChannel(); - - headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE); - headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - } - catch(IOException ex) { - throw new StingException("Could not open file",ex); - } - } - - public boolean hasNext() { - return nextLocation != -1; - } - - public Long next() { - long currentLocation = nextLocation; - advance(); - return currentLocation; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator"); - } - - private void advance() { - int readStatus; - - headerByteBuffer.clear(); - try { - readStatus = bamInputChannel.read(headerByteBuffer); - } - catch(IOException ex) { - throw new StingException("Could not read header data",ex); - } - - if(readStatus == -1) { - nextLocation = -1; - try { - bamInputChannel.close(); - } - catch(IOException ex) { - throw new StingException("Could not close input file",ex); - } - return; - } - - headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION); - int blockSize = headerByteBuffer.getShort(); - - try { - bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1); - nextLocation = bamInputChannel.position(); - } - catch(IOException ex) { - throw new StingException("Could not reposition input stream",ex); - } - } - - public static void main(String argv[]) throws IOException { - BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam")); - int i = 0; - while(blockStartIterator.hasNext()) - System.out.printf("%d -> %d%n",i++,blockStartIterator.next()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java deleted file mode 100644 index 4d91fb45f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKChunk; -import net.sf.samtools.LinearIndex; - -import java.util.*; - -/** - * Represents the contents of a bam index file for one reference. - * A BAM index (.bai) file contains information for all references in the bam file. - * This class describes the data present in the index file for one of these references; - * including the bins, chunks, and linear index. - */ -class BAMIndexContent { - /** - * The reference sequence for the data currently loaded. - */ - private final int mReferenceSequence; - - /** - * A list of all bins in the above reference sequence. - */ - private final BinList mBinList; - - /** - * The linear index for the reference sequence above. - */ - private final LinearIndex mLinearIndex; - - - /** - * @param referenceSequence Content corresponds to this reference. - * @param bins Array of bins represented by this content, possibly sparse - * @param numberOfBins Number of non-null bins - * @param linearIndex Additional index used to optimize queries - */ - BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) { - this.mReferenceSequence = referenceSequence; - this.mBinList = new BinList(bins, numberOfBins); - this.mLinearIndex = linearIndex; - } - - /** - * Reference for this Content - */ - public int getReferenceSequence() { - return mReferenceSequence; - } - - /** - * Does this content have anything in this bin? - */ - public boolean containsBin(final GATKBin bin) { - return mBinList.getBin(bin.getBinNumber()) != null; - } - - /** - * @return iterable list of bins represented by this content - */ - public BinList getBins() { - return mBinList; - } - - /** - * @return the number of non-null bins represented by this content - */ - int getNumberOfNonNullBins() { - return mBinList.getNumberOfNonNullBins(); - } - - /** - * @return all chunks associated with all bins in this content - */ - public List getAllChunks() { - List allChunks = new ArrayList(); - for (GATKBin b : mBinList) - if (b.getChunkList() != null) { - allChunks.addAll(Arrays.asList(b.getChunkList())); - } - return Collections.unmodifiableList(allChunks); - } - - /** - * @return the linear index represented by this content - */ - public LinearIndex getLinearIndex() { - return mLinearIndex; - } - - /** - * This class is used to encapsulate the list of Bins store in the BAMIndexContent - * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure - */ - class BinList implements Iterable { - - private final GATKBin[] mBinArray; - public final int numberOfNonNullBins; - public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based - - /** - * @param binArray a sparse array representation of the bins. The index into the array is the bin number. - * @param numberOfNonNullBins - */ - BinList(GATKBin[] binArray, int numberOfNonNullBins) { - this.mBinArray = binArray; - this.numberOfNonNullBins = numberOfNonNullBins; - this.maxBinNumber = mBinArray.length - 1; - } - - GATKBin getBin(int binNumber) { - if (binNumber > maxBinNumber) return null; - return mBinArray[binNumber]; - } - - int getNumberOfNonNullBins() { - return numberOfNonNullBins; - } - - /** - * Gets an iterator over all non-null bins. - * - * @return An iterator over all bins. - */ - public Iterator iterator() { - return new BinIterator(); - } - - private class BinIterator implements Iterator { - /** - * Stores the bin # of the Bin currently in use. - */ - private int nextBin; - - public BinIterator() { - nextBin = 0; - } - - /** - * Are there more bins in this set, waiting to be returned? - * - * @return True if more bins are remaining. - */ - public boolean hasNext() { - while (nextBin <= maxBinNumber) { - if (getBin(nextBin) != null) return true; - nextBin++; - } - return false; - } - - /** - * Gets the next bin in the provided BinList. - * - * @return the next available bin in the BinList. - */ - public GATKBin next() { - if (!hasNext()) - throw new NoSuchElementException("This BinIterator is currently empty"); - GATKBin result = getBin(nextBin); - nextBin++; - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a bin iterator"); - } - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java deleted file mode 100644 index 15a372ca6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; - -import java.util.HashMap; -import java.util.Map; - -/** - * Models a bin at which all BAM files in the merged input stream overlap. - */ -class BAMOverlap { - public final int start; - public final int stop; - - private final Map bins = new HashMap(); - - public BAMOverlap(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - public void addBin(final SAMReaderID id, final Bin bin) { - bins.put(id,bin); - } - - public Bin getBin(final SAMReaderID id) { - return bins.get(id); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index 521bcd5a3..762722fcd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator { /** * Create a new BAM schedule based on the given index. - * @param indexFiles Index files. + * @param dataSource The SAM data source to use. * @param intervals List of */ - public BAMSchedule(final Map indexFiles, final List intervals) { + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); - referenceSequence = intervals.get(0).getContigIndex(); + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); createScheduleFile(); - readerIDs.addAll(indexFiles.keySet()); + readerIDs.addAll(dataSource.getReaderIDs()); for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = indexFiles.get(reader); + final GATKBAMIndex index = dataSource.getIndex(reader); final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); @@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator { if(selectedIterators.isEmpty()) return; + // Create the target schedule entry BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { PeekableIterator scheduleIterator = scheduleIterators.get(reader); BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); @@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator { scheduleIterator.next(); } + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + nextScheduleEntry = mergedScheduleEntry; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 47eb55b28..dca4cc771 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.*; @@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator { private FilePointer nextFilePointer = null; - private final GenomeLocSortedSet loci; + private GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; - private final PeekableIterator locusIterator; + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); + for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { + // Match only on sequence name; trust startup validation to make sure all the sequences match. + if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) + intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); + } + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } - private GenomeLoc currentLocus; + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } - public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } + + + private BAMScheduler(final SAMDataSource dataSource) { this.dataSource = dataSource; - for(SAMReaderID reader: dataSource.getReaderIDs()) - indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. + * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. + nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the fle. + * @return A file pointer over the specified region. + */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(); + Map currentPosition = dataSource.getCurrentPosition(); + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; } public boolean hasNext() { @@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator { if(!hasNext()) throw new NoSuchElementException("No next element available in interval sharder"); FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; advance(); + return currentFilePointer; } @@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator { if(loci.isEmpty()) return; - nextFilePointer = null; while(nextFilePointer == null && currentLocus != null) { // special case handling of the unmapped shard. if(currentLocus == GenomeLoc.UNMAPPED) { nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); currentLocus = null; continue; } @@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator { int coveredRegionStop = Integer.MAX_VALUE; GenomeLoc coveredRegion = null; - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); // No overlapping data at all. if(scheduleEntry != null) { @@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator { } else { // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); for(SAMReaderID reader: indexFiles.keySet()) nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); } @@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator { // Early exit if no bins were found. if(coveredRegion == null) { // for debugging only: maximum split is 16384. - if(currentLocus.size() > 16384) { - GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - } - else { - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - } + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; continue; } // Early exit if only part of the first interval was found. if(currentLocus.startsBefore(coveredRegion)) { - // for debugging only: maximum split is 16384. int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); GenomeLoc[] splitContigs = currentLocus.split(splitPoint); nextFilePointer.addLocation(splitContigs[0]); @@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator { /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM indices. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. + // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. + final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex(); + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { if(bamScheduleIterator != null) bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentLocus.getContigIndex(); + lastReferenceSequenceLoaded = currentContigIndex; // Naive algorithm: find all elements in current contig for proper schedule creation. List lociInContig = new LinkedList(); for(GenomeLoc locus: loci) { - if(locus.getContigIndex() == lastReferenceSequenceLoaded) + if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); } if(!bamScheduleIterator.hasNext()) @@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator { return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; } + /** + * Create a span from the given start point to the end of the file. + * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). + * @return A file span from the given point to the end of the file. + */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { + return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java new file mode 100644 index 000000000..f468d2020 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.LinkedList; +import java.util.Queue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Preloads BGZF blocks in preparation for unzipping and data processing. + * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should + * TODO: be replaced with a central thread management strategy. + */ +public class BGZFBlockLoadingDispatcher { + /** + * The file handle cache, used when allocating blocks from the dispatcher. + */ + private final FileHandleCache fileHandleCache; + + private final ExecutorService threadPool; + + private final Queue inputQueue; + + public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { + threadPool = Executors.newFixedThreadPool(numThreads); + fileHandleCache = new FileHandleCache(numFileHandles); + inputQueue = new LinkedList(); + + threadPool.execute(new BlockLoader(this,fileHandleCache,true)); + } + + /** + * Initiates a request for a new block load. + * @param readerPosition Position at which to load. + */ + void queueBlockLoad(final SAMReaderPosition readerPosition) { + synchronized(inputQueue) { + inputQueue.add(readerPosition); + inputQueue.notify(); + } + } + + /** + * Claims the next work request from the queue. + * @return The next work request, or null if none is available. + */ + SAMReaderPosition claimNextWorkRequest() { + synchronized(inputQueue) { + while(inputQueue.isEmpty()) { + try { + inputQueue.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item"); + } + } + return inputQueue.poll(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java new file mode 100644 index 000000000..e377f865d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BAMInputStream; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.RuntimeEOFException; +import net.sf.samtools.util.SeekableStream; +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends SeekableStream implements BAMInputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current position. + */ + private SAMReaderPosition position; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. + */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. + */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Has the buffer been filled since last request? + */ + private boolean bufferFilled = false; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.samFile.length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.samFile); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + if(buffer.remaining() > 0) { + // If there's data in the buffer, figure out from whence it came. + final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0; + final int blockOffset = buffer.position(); + filePointer = blockAddress << 16 | blockOffset; + } + else { + // Otherwise, find the next position to load. + filePointer = position.getBlockAddress() << 16; + } + } + + if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) + throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", + BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer), + BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()))); + + return filePointer; + } + + public void seek(long target) { + // TODO: Validate the seek point. + //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); + synchronized(lock) { + clearBuffers(); + position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target)); + waitForBufferFill(); + buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target)); + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(target); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + } + + private void clearBuffers() { + this.position.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. + buffer.clear(); + buffer.limit(0); + + blockOffsets.clear(); + blockPositions.clear(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. + return position != null && position.getBlockAddress() >= length; + } + } + + public void setCheckCrcs(final boolean check) { + // TODO: Implement + } + + /** + * Submits a new access plan for the given dataset. + * @param position The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final SAMReaderPosition position) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + synchronized(lock) { + // Assume that the access plan is going to tell us to start where we are and move forward. + // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. + if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) + position.advancePosition(this.position.getBlockAddress()); + } + this.position = position; + } + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first block in the series. + while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) { + bytesToRemove += blockOffsets.remove(); + blockPositions.remove(); + } + + // If we end up with an empty block at the end of the series, compact this as well. + if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + buffer.position(bytesToRemove); + buffer.compact(); + + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. + * @param position target position for the data. + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + synchronized(lock) { + try { + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Advance the position to take the most recent read into account. + long lastReadPosition = position.getBlockAddress(); + + byte[] validBytes = null; + if(validatingInputStream != null) { + validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(lastReadPosition << 16); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + this.position = position; + position.advancePosition(filePosition); + + if(buffer.remaining() < incomingBuffer.remaining()) { + //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + lock.wait(); + //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); + } + + // Queue list of block offsets / block positions. + blockOffsets.add(buffer.position()); + blockPositions.add(lastReadPosition); + + buffer.put(incomingBuffer); + + // Set up the buffer for reading. + buffer.flip(); + bufferFilled = true; + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + read(singleByte); + return singleByte[0]; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available. + if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. + if(length-remaining > 0) + lock.notify(); + } + + if(validatingInputStream != null) { + byte[] validBytes = new byte[length]; + try { + validatingInputStream.read(validBytes,offset,length); + for(int i = offset; i < offset+length; i++) { + if(bytes[i] != validBytes[i]) { + System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId()); + throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); + } + } + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + return length - remaining; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + bufferFilled = false; + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(position); + try { + lock.wait(); + } + catch(InterruptedException ex) { + // TODO: handle me. + throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); + } + + if(bufferFilled && buffer.remaining() == 0) + throw new RuntimeEOFException("No more data left in InputStream"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java new file mode 100644 index 000000000..ab4299802 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +/** + * An engine for loading blocks. + */ +class BlockLoader implements Runnable { + /** + * Coordinates the input queue. + */ + private BGZFBlockLoadingDispatcher dispatcher; + + /** + * A cache from which to retrieve open file handles. + */ + private final FileHandleCache fileHandleCache; + + /** + * Whether asynchronous decompression should happen. + */ + private final boolean decompress; + + /** + * An direct input buffer for incoming data from disk. + */ + private final ByteBuffer inputBuffer; + + public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) { + this.dispatcher = dispatcher; + this.fileHandleCache = fileHandleCache; + this.decompress = decompress; + + this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + inputBuffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public void run() { + for(;;) { + SAMReaderPosition readerPosition = null; + try { + readerPosition = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + + long blockAddress = readerPosition.getBlockAddress(); + //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); + + ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + long nextBlockAddress = position(inputStream); + fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + + ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock; + int bytesCopied = block.remaining(); + + BlockInputStream bamInputStream = readerPosition.getInputStream(); + bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + + //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); + } + catch(Throwable error) { + if(readerPosition != null && readerPosition.getInputStream() != null) + readerPosition.getInputStream().reportException(error); + } + } + + } + + private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException { + FileChannel channel = inputStream.getChannel(); + + // Read the block header + channel.position(blockAddress); + + int uncompressedDataSize = 0; + int bufferSize = 0; + + do { + inputBuffer.clear(); + inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. + inputBuffer.flip(); + if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) + throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass."); + + // Verify that the file was read at a valid point. + if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 || + unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 || + unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG || + unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN || + unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 || + unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) { + throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position"); + } + + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET); + bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1; + + // Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block. + inputBuffer.limit(bufferSize); + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block. + uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4); + //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize); + } + while(uncompressedDataSize == 0 && channel.position() < channel.size()); + + // Prepare the buffer for reading. + inputBuffer.flip(); + + return inputBuffer; + } + + private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException { + final int compressedBufferSize = bgzfBlock.remaining(); + + // Determine the uncompressed buffer size ( + bgzfBlock.position(bgzfBlock.limit()-4); + int uncompressedBufferSize = bgzfBlock.getInt(); + byte[] uncompressedContent = new byte[uncompressedBufferSize]; + + // Bound the CDATA section of the buffer. + bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH); + bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + byte[] compressedContent = new byte[bgzfBlock.remaining()]; + ByteBuffer.wrap(compressedContent).put(bgzfBlock); + + // Decompress the buffer. + final Inflater inflater = new Inflater(true); + inflater.setInput(compressedContent); + int bytesUncompressed = inflater.inflate(uncompressedContent); + if(bytesUncompressed != uncompressedBufferSize) + throw new ReviewedStingException("Error decompressing block"); + + return ByteBuffer.wrap(uncompressedContent); + } + + private long position(final FileInputStream inputStream) throws IOException { + return inputStream.getChannel().position(); + } + + private int unpackUByte8(final ByteBuffer buffer,final int position) { + return buffer.get(position) & 0xFF; + } + + private int unpackUInt16(final ByteBuffer buffer,final int position) { + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. + return buffer.getShort(position) & 0xFFFF; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java new file mode 100644 index 000000000..29de6eb37 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once. + */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The ke + * @return A file input stream from the cache, if available, or otherwise newly opened. + */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. + if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return a cached stream, if available. Otherwise, + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. + * @return + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. + */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index e4141f61c..df7827250 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.SAMFileSpan; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -40,28 +41,25 @@ import java.util.*; */ public class FilePointer { protected final SortedMap fileSpans = new TreeMap(); - protected final BAMOverlap overlap; - protected final List locations; + protected final List locations = new ArrayList(); /** * Does this file pointer point into an unmapped region? */ protected final boolean isRegionUnmapped; - public FilePointer() { - this((BAMOverlap)null); - } - - public FilePointer(final GenomeLoc location) { - this.overlap = null; - this.locations = Collections.singletonList(location); - this.isRegionUnmapped = GenomeLoc.isUnmapped(location); - } - - public FilePointer(final BAMOverlap overlap) { - this.overlap = overlap; - this.locations = new ArrayList(); - this.isRegionUnmapped = false; + public FilePointer(final GenomeLoc... locations) { + this.locations.addAll(Arrays.asList(locations)); + boolean foundMapped = false, foundUnmapped = false; + for(GenomeLoc location: locations) { + if(GenomeLoc.isUnmapped(location)) + foundUnmapped = true; + else + foundMapped = true; + } + if(foundMapped && foundUnmapped) + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + this.isRegionUnmapped = foundUnmapped; } /** @@ -217,4 +215,20 @@ public class FilePointer { fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); combined.addFileSpans(initialElement.getKey(),fileSpan); } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FilePointer:%n"); + builder.append("\tlocations = {"); + builder.append(Utils.join(";",locations)); + builder.append("}%n\tregions = %n"); + for(Map.Entry entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 4ddf28dce..f78693c27 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -25,419 +25,58 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.AbstractBAMFileIndex; -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.*; +import java.util.Iterator; /** - * Shard intervals based on position within the BAM file. - * - * @author mhanna - * @version 0.1 + * Handles the process of aggregating BAM intervals into individual shards. + * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder. */ -public class IntervalSharder { - private static Logger logger = Logger.getLogger(IntervalSharder.class); +public class IntervalSharder implements Iterator { + /** + * The iterator actually laying out the data for BAM scheduling. + */ + private final PeekableIterator wrappedIterator; - public static Iterator shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - return new IntervalSharder.FilePointerIterator(dataSource,loci); + /** + * The parser, for interval manipulation. + */ + private final GenomeLocParser parser; + + public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); + } + + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + } + + public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser()); + } + + private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) { + wrappedIterator = new PeekableIterator(scheduler); + this.parser = parser; + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); } /** - * A lazy-loading iterator over file pointers. + * Accumulate shards where there's no additional cost to processing the next shard in the sequence. + * @return The next file pointer to process. */ - private static class FilePointerIterator implements Iterator { - final SAMDataSource dataSource; - final GenomeLocSortedSet loci; - final PeekableIterator locusIterator; - final Queue cachedFilePointers = new LinkedList(); - - public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - this.dataSource = dataSource; - this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - advance(); - } - - public boolean hasNext() { - return !cachedFilePointers.isEmpty(); - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("FilePointerIterator iteration is complete"); - FilePointer filePointer = cachedFilePointers.remove(); - if(cachedFilePointers.isEmpty()) - advance(); - return filePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator"); - } - - private void advance() { - GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); - String contig = null; - - // If the next section of the BAM to be processed is unmapped, handle this region separately. - while(locusIterator.hasNext() && nextBatch.isEmpty()) { - contig = null; - while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) { - GenomeLoc nextLocus = locusIterator.next(); - contig = nextLocus.getContig(); - nextBatch.add(nextLocus); - } - } - - if(nextBatch.size() > 0) { - cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); - } - } + public FilePointer next() { + FilePointer current = wrappedIterator.next(); + while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); + return current; } - /** - * Merge / split intervals based on an awareness of the structure of the BAM file. - * @param dataSource - * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads. - * @param loci - * @return - */ - private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { - // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs. - if(contig == null) { - FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - filePointer.addFileSpans(id,null); - return Collections.singletonList(filePointer); - } - - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. - List filePointers = new ArrayList(); - FilePointer lastFilePointer = null; - BAMOverlap lastBAMOverlap = null; - - Map readerToIndexMap = new HashMap(); - IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator(); - for(SAMReaderID id: dataSource.getReaderIDs()) { - final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); - // If this contig can't be found in the reference, skip over it. - if(referenceSequence == null && contig != null) - continue; - final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id); - binMerger.addReader(id, - index, - referenceSequence.getSequenceIndex(), - index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator()); - // Cache the reader for later data lookup. - readerToIndexMap.put(id,index); - } - - PeekableIterator binIterator = new PeekableIterator(binMerger); - - for(GenomeLoc location: loci) { - if(!location.getContig().equals(contig)) - throw new ReviewedStingException("Location outside bounds of contig"); - - if(!binIterator.hasNext()) - break; - - int locationStart = location.getStart(); - final int locationStop = location.getStop(); - - // Advance to first bin. - while(binIterator.peek().stop < locationStart) - binIterator.next(); - - // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make - // sure the extending bin is not pruned from the list. - List bamOverlaps = new ArrayList(); - while(binIterator.hasNext() && binIterator.peek().stop <= locationStop) - bamOverlaps.add(binIterator.next()); - if(binIterator.hasNext() && binIterator.peek().start <= locationStop) - bamOverlaps.add(binIterator.peek()); - - // Bins found; try to match bins with locations. - Iterator bamOverlapIterator = bamOverlaps.iterator(); - - while(locationStop >= locationStart) { - int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0; - int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0; - - while(binStop < locationStart && bamOverlapIterator.hasNext()) { - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - lastBAMOverlap = bamOverlapIterator.next(); - lastFilePointer = new FilePointer(lastBAMOverlap); - binStart = lastFilePointer.overlap.start; - binStop = lastFilePointer.overlap.stop; - } - - if(locationStart < binStart) { - // The region starts before the first bin in the sequence. Add the region occurring before the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - final int regionStop = Math.min(locationStop,binStart-1); - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop); - lastFilePointer = new FilePointer(subset); - - locationStart = regionStop + 1; - } - else if(locationStart > binStop) { - // The region starts after the last bin in the sequence. Add the region occurring after the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.add(new FilePointer(subset)); - - locationStart = locationStop + 1; - } - else { - if(lastFilePointer == null) - throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer."); - - // The start of the region overlaps the bin. Add the overlapping subset. - final int regionStop = Math.min(locationStop,binStop); - lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop)); - locationStart = regionStop + 1; - } - } - } - - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - // Lookup the locations for every file pointer in the index. - for(SAMReaderID id: readerToIndexMap.keySet()) { - BrowseableBAMIndex index = readerToIndexMap.get(id); - for(FilePointer filePointer: filePointers) - filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id))); - } - - return filePointers; - } - - private static class BinMergingIterator implements Iterator { - private PriorityQueue binQueue = new PriorityQueue(); - private Queue pendingOverlaps = new LinkedList(); - - public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator bins) { - binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins))); - } - - public boolean hasNext() { - return pendingOverlaps.size() > 0 || !binQueue.isEmpty(); - } - - public BAMOverlap next() { - if(!hasNext()) - throw new NoSuchElementException("No elements left in merging iterator"); - if(pendingOverlaps.isEmpty()) - advance(); - return pendingOverlaps.remove(); - } - - public void advance() { - List bins = new ArrayList(); - int boundsStart, boundsStop; - - // Prime the pump - if(binQueue.isEmpty()) - return; - bins.add(getNextBin()); - boundsStart = bins.get(0).getStart(); - boundsStop = bins.get(0).getStop(); - - // Accumulate all the bins that overlap the current bin, in sorted order. - while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) { - ReaderBin bin = getNextBin(); - bins.add(bin); - boundsStart = Math.min(boundsStart,bin.getStart()); - boundsStop = Math.max(boundsStop,bin.getStop()); - } - - List> range = new ArrayList>(); - int start = bins.get(0).getStart(); - int stop = bins.get(0).getStop(); - while(start <= boundsStop) { - // Find the next stopping point. - for(ReaderBin bin: bins) { - stop = Math.min(stop,bin.getStop()); - if(start < bin.getStart()) - stop = Math.min(stop,bin.getStart()-1); - } - - range.add(new Pair(start,stop)); - // If the last entry added included the last element, stop. - if(stop >= boundsStop) - break; - - // Find the next start. - start = stop + 1; - for(ReaderBin bin: bins) { - if(start >= bin.getStart() && start <= bin.getStop()) - break; - else if(start < bin.getStart()) { - start = bin.getStart(); - break; - } - } - } - - // Add the next series of BAM overlaps to the window. - for(Pair window: range) { - BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second); - for(ReaderBin bin: bins) - bamOverlap.addBin(bin.id,bin.bin); - pendingOverlaps.add(bamOverlap); - } - } - - public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } - - private ReaderBin peekNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.peek(); - return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin()); - } - - private ReaderBin getNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.remove(); - ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin()); - if(current.hasNextBin()) - binQueue.add(current); - return readerBin; - } - - } - - /** - * Filters out bins not at the lowest level in the tree. - */ - private static class LowestLevelBinFilteringIterator implements Iterator { - private BrowseableBAMIndex index; - private Iterator wrappedIterator; - - private Bin nextBin; - - public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator iterator) { - this.index = index; - this.wrappedIterator = iterator; - advance(); - } - - public boolean hasNext() { - return nextBin != null; - } - - public Bin next() { - Bin bin = nextBin; - advance(); - return bin; - } - - public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); } - - private void advance() { - nextBin = null; - while(wrappedIterator.hasNext() && nextBin == null) { - Bin bin = wrappedIterator.next(); - if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1) - nextBin = bin; - } - } - } + public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } } - -class BinQueueState implements Comparable { - private final SAMReaderID id; - private final BrowseableBAMIndex index; - private final int referenceSequence; - private final PeekableIterator bins; - - private int firstLocusInCurrentBin; - private int lastLocusInCurrentBin; - - public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator bins) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bins = new PeekableIterator(bins); - refreshLocusInBinCache(); - } - - public SAMReaderID getReaderID() { - return id; - } - - public BrowseableBAMIndex getIndex() { - return index; - } - - public int getReferenceSequence() { - return referenceSequence; - } - - public boolean hasNextBin() { - return bins.hasNext(); - } - - public Bin peekNextBin() { - return bins.peek(); - } - - public Bin nextBin() { - Bin nextBin = bins.next(); - refreshLocusInBinCache(); - return nextBin; - } - - public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) { - if(!this.bins.hasNext() && !other.bins.hasNext()) return 0; - if(!this.bins.hasNext()) return -1; - if(!this.bins.hasNext()) return 1; - - // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid. - if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 || - other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) { - throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid."); - } - - // Straight integer subtraction works here because lhsStart, rhsStart always positive. - if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin) - return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin; - - // Straight integer subtraction works here because lhsStop, rhsStop always positive. - return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin; - } - - private void refreshLocusInBinCache() { - firstLocusInCurrentBin = -1; - lastLocusInCurrentBin = -1; - if(bins.hasNext()) { - Bin bin = bins.peek(); - firstLocusInCurrentBin = index.getFirstLocusInBin(bin); - lastLocusInCurrentBin = index.getLastLocusInBin(bin); - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java new file mode 100644 index 000000000..585b63457 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import java.util.Iterator; + +/** + * Batch granular file pointers into potentially larger shards. + */ +public class LocusShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + public boolean hasNext() { + return filePointers.hasNext(); + } + + public Shard next() { + FilePointer current = filePointers.next(); + while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0) + current = current.combine(parser,filePointers.next()); + return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + }; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java deleted file mode 100755 index a5ca07853..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileSpan; -import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * A sharding strategy for loci based on reading of the index. - */ -public class LocusShardStrategy implements ShardStrategy { - /** - * The data source to use when performing this sharding. - */ - private final SAMDataSource reads; - - /** - * the parser for creating shards - */ - private GenomeLocParser genomeLocParser; - - /** - * An iterator through the available file pointers. - */ - private final Iterator filePointerIterator; - - /** - * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs - * @param reads Data source from which to load index data. - * @param locations List of locations for which to load data. - */ - public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) { - this.reads = reads; - this.genomeLocParser = genomeLocParser; - - if(!reads.isEmpty()) { - GenomeLocSortedSet intervals; - if(locations == null) { - // If no locations were passed in, shard the entire BAM file. - SAMFileHeader header = reads.getHeader(); - intervals = new GenomeLocSortedSet(genomeLocParser); - - for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) { - // Check this sequence against the reference sequence dictionary. - // TODO: Do a better job of merging reads + reference. - SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName()); - if(refSequenceRecord != null) { - final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength()); - intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length)); - } - } - } - else - intervals = locations; - - if(reads.isLowMemoryShardingEnabled()) { - /* - Iterator filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - List filePointers = new ArrayList(); - while(filePointerIterator.hasNext()) - filePointers.add(filePointerIterator.next()); - this.filePointerIterator = filePointers.iterator(); - */ - this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - } - else - this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); - } - else { - final int maxShardSize = 100000; - List filePointers = new ArrayList(); - if(locations == null) { - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop))); - } - } - } - else { - for(GenomeLoc interval: locations) { - while(interval.size() > maxShardSize) { - filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1))); - interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - filePointers.add(new FilePointer(interval)); - } - } - filePointerIterator = filePointers.iterator(); - } - - } - - /** - * returns true if there are additional shards - * - * @return false if we're done processing shards - */ - public boolean hasNext() { - return filePointerIterator.hasNext(); - } - - public long shardNumber = 0; - - /** - * gets the next Shard - * - * @return the next shard - */ - public LocusShard next() { - FilePointer nextFilePointer = filePointerIterator.next(); - Map fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null; - - /* - System.out.printf("Shard %d: interval = {",++shardNumber); - for(GenomeLoc locus: nextFilePointer.locations) - System.out.printf("%s;",locus); - System.out.printf("}; "); - - if(fileSpansBounding == null) - System.out.printf("no shard data%n"); - else { - SortedMap sortedSpans = new TreeMap(fileSpansBounding); - for(Map.Entry entry: sortedSpans.entrySet()) { - System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue()); - } - } - */ - - return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding); - } - - /** we don't support the remove command */ - public void remove() { - throw new UnsupportedOperationException("ShardStrategies don't support remove()"); - } - - /** - * makes the IntervalShard iterable, i.e. usable in a for loop. - * - * @return - */ - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java deleted file mode 100644 index bf5f33dc3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.Iterator; - -/** - * Handles the process of aggregating BAM intervals into individual shards. - */ -public class LowMemoryIntervalSharder implements Iterator { - /** - * The iterator actually laying out the data for BAM scheduling. - */ - private final PeekableIterator wrappedIterator; - - /** - * The parser, for interval manipulation. - */ - private final GenomeLocParser parser; - - public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); - parser = loci.getGenomeLocParser(); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Accumulate shards where there's no additional cost to processing the next shard in the sequence. - * @return The next file pointer to process. - */ - public FilePointer next() { - FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); - return current; - } - - public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java deleted file mode 100644 index 278eeb898..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.List; - -/** - * A single, monolithic shard bridging all available data. - * @author mhanna - * @version 0.1 - */ -public class MonolithicShard extends Shard { - /** - * Creates a new monolithic shard of the given type. - * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic. - * @param locs Intervals that this monolithic shard should process. - */ - public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List locs) { - super(parser, shardType, locs, readsDataSource, null, false); - if(shardType != ShardType.LOCUS && shardType != ShardType.READ) - throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType); - } - - /** - * String representation of this shard. - * @return "entire genome". - */ - @Override - public String toString() { - return "entire genome"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java deleted file mode 100644 index 28b737f28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Create a giant shard representing all the data in the input BAM(s). - * - * @author mhanna - * @version 0.1 - */ -public class MonolithicShardStrategy implements ShardStrategy { - /** - * The single shard associated with this sharding strategy. - */ - private MonolithicShard shard; - - /** - * Create a new shard strategy for shards of the given type. - * @param shardType The shard type. - */ - public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List region) { - shard = new MonolithicShard(parser,readsDataSource,shardType,region); - } - - /** - * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator; - * will only return another copy of the active iterator. - * @return A copy of this. - */ - public Iterator iterator() { - return this; - } - - /** - * Returns true if the monolithic shard has not yet been consumed, or false otherwise. - * @return True if shard has been consumed, false otherwise. - */ - public boolean hasNext() { - return shard != null; - } - - /** - * Returns the monolithic shard if it has not already been retrieved. - * @return The monolithic shard. - * @throws NoSuchElementException if no such data exists. - */ - public Shard next() { - if(shard == null) - throw new NoSuchElementException("Monolithic shard has already been retrived."); - - Shard working = shard; - shard = null; - return working; - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a shard strategy"); - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - * @param size adjust the next size to this - */ - public void adjustNextShardSize( long size ) { - throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard."); - } - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 4d9c9092d..5f40c0ea5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -35,10 +35,15 @@ import java.util.Map; * @version 0.1 */ public class ReadShard extends Shard { + /** + * What is the maximum number of reads which should go into a read shard. + */ + public static final int MAX_READS = 10000; + /** * The reads making up this shard. */ - private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS); + private final Collection reads = new ArrayList(MAX_READS); public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); @@ -66,7 +71,7 @@ public class ReadShard extends Shard { * @return True if this shard's buffer is full (and the shard can buffer reads). */ public boolean isBufferFull() { - return reads.size() > ReadShardStrategy.MAX_READS; + return reads.size() > ReadShard.MAX_READS; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java new file mode 100644 index 000000000..fa8a7d454 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + */ +public class ReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + if(!fileSpan.isEmpty()) + selectedReaders.put(id,fileSpan); + } + + if(selectedReaders.size() > 0) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java deleted file mode 100755 index 5ea75dbb0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.SAMFileSpan; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.*; - -/** - * The sharding strategy for reads using a simple counting mechanism. Each read shard - * has a specific number of reads (default to 10K) which is configured in the constructor. - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - */ -public class ReadShardStrategy implements ShardStrategy { - /** - * What is the maximum number of reads which should go into a read shard. - */ - protected static final int MAX_READS = 10000; - - /** - * The data source used to shard. - */ - private final SAMDataSource dataSource; - - /** - * The intervals to be processed. - */ - private final GenomeLocSortedSet locations; - - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** our storage of the genomic locations they'd like to shard over */ - private final List filePointers = new ArrayList(); - - /** - * Iterator over the list of file pointers. - */ - private final Iterator filePointerIterator; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentFilePointer; - - /** - * Ending position of the last shard in the file. - */ - private Map position; - - /** - * An indicator whether the strategy has sharded into the unmapped region. - */ - private boolean isIntoUnmappedRegion = false; - - private final GenomeLocParser parser; - - /** - * Create a new read shard strategy, loading read shards from the given BAM file. - * @param dataSource Data source from which to load shards. - * @param locations intervals to use for sharding. - */ - public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) { - this.dataSource = dataSource; - this.parser = parser; - this.position = this.dataSource.getCurrentPosition(); - this.locations = locations; - - if(locations != null) - filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations); - else - filePointerIterator = filePointers.iterator(); - - if(filePointerIterator.hasNext()) - currentFilePointer = filePointerIterator.next(); - - advance(); - } - - /** - * do we have another read shard? - * @return True if any more data is available. False otherwise. - */ - public boolean hasNext() { - return nextShard != null; - } - - /** - * Retrieves the next shard, if available. - * @return The next shard, if available. - * @throws java.util.NoSuchElementException if no such shard is available. - */ - public Shard next() { - if(!hasNext()) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - public void advance() { - Map shardPosition = new HashMap(); - nextShard = null; - - if(locations != null) { - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; - - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); - } - - if(selectedReaders.size() > 0) { - Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - dataSource.fillShard(shard); - - if(!shard.isBufferEmpty()) { - nextShard = shard; - break; - } - } - - selectedReaders.clear(); - currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null; - } - } - else { - // todo -- this nulling of intervals is a bit annoying since readwalkers without - // todo -- any -L values need to be special cased throughout the code. - Shard shard = new ReadShard(parser,dataSource,position,null,false); - dataSource.fillShard(shard); - nextShard = !shard.isBufferEmpty() ? shard : null; - } - - this.position = dataSource.getCurrentPosition(); - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - - /** - * Convenience method for using ShardStrategy in an foreach loop. - * @return A iterator over shards. - */ - public Iterator iterator() { - return this; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java deleted file mode 100644 index c76c1d8ae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 2, 2011 - * Time: 4:36:40 PM - * To change this template use File | Settings | File Templates. - */ -class ReaderBin { - public final SAMReaderID id; - public final BrowseableBAMIndex index; - public final int referenceSequence; - public final Bin bin; - - public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bin = bin; - } - - public int getStart() { - return index.getFirstLocusInBin(bin); - } - - public int getStop() { - return index.getLastLocusInBin(bin); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 8452aadfd..0a1eb0563 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -71,7 +73,7 @@ public class SAMDataSource { /** * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. */ - private final GenomeLocParser genomeLocParser; + protected final GenomeLocParser genomeLocParser; /** * Identifiers for the readers driving this data source. @@ -91,13 +93,18 @@ public class SAMDataSource { /** * How far along is each reader? */ - private final Map readerPositions = new HashMap(); + private final Map readerPositions = new HashMap(); /** * The merged header. */ private final SAMFileHeader mergedHeader; + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + /** * The sort order of the BAM files. Files without a sort order tag are assumed to be * in coordinate order. @@ -131,17 +138,24 @@ public class SAMDataSource { private final SAMResourcePool resourcePool; /** - * Whether to enable the new low-memory sharding mechanism. + * Asynchronously loads BGZF blocks. */ - private boolean enableLowMemorySharding = false; + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. */ - public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocParser) { + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, false, SAMFileReader.ValidationStringency.STRICT, @@ -150,8 +164,7 @@ public class SAMDataSource { new ValidationExclusion(), new ArrayList(), false, - false, - true); + false); } /** @@ -159,6 +172,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -167,9 +182,10 @@ public class SAMDataSource { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, - boolean enableLowMemorySharding) { + boolean generateExtendedEvents) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, useOriginalBaseQualities, strictness, @@ -182,8 +198,7 @@ public class SAMDataSource { BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ - (byte) -1, - enableLowMemorySharding); + (byte) -1); } /** @@ -205,6 +220,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -217,13 +234,19 @@ public class SAMDataSource { BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, - byte defaultBaseQualities, - boolean enableLowMemorySharding) { - this.enableLowMemorySharding(enableLowMemorySharding); + byte defaultBaseQualities) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + else + dispatcher = null; + validationStringency = strictness; for (SAMReaderID readerID : samFiles) { if (!readerID.samFile.canRead()) @@ -235,10 +258,13 @@ public class SAMDataSource { SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. - for(SAMFileReader reader: readers.values()) { + for(SAMReaderID readerID: readerIDs) { // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); + headers.put(readerID,header); + if ( header.getReadGroups().isEmpty() ) { throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); @@ -275,7 +301,7 @@ public class SAMDataSource { qmode, refReader, defaultBaseQualities); - + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -296,12 +322,10 @@ public class SAMDataSource { originalToMergedReadGroupMappings.put(id,mappingToMerged); } - if(enableLowMemorySharding) { - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); } resourcePool.releaseReaders(readers); @@ -314,22 +338,6 @@ public class SAMDataSource { */ public ReadProperties getReadsInfo() { return readProperties; } - /** - * Enable experimental low-memory sharding. - * @param enable True to enable sharding. False otherwise. - */ - public void enableLowMemorySharding(final boolean enable) { - enableLowMemorySharding = enable; - } - - /** - * Returns whether low-memory sharding is enabled. - * @return True if enabled, false otherwise. - */ - public boolean isLowMemoryShardingEnabled() { - return enableLowMemorySharding; - } - /** * Checks to see whether any reads files are supplying data. * @return True if no reads files are supplying data to the traversal; false otherwise. @@ -368,7 +376,7 @@ public class SAMDataSource { * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. */ - public Map getCurrentPosition() { + public Map getCurrentPosition() { return readerPositions; } @@ -381,7 +389,7 @@ public class SAMDataSource { } public SAMFileHeader getHeader(SAMReaderID id) { - return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); + return headers.get(id); } /** @@ -404,45 +412,21 @@ public class SAMDataSource { return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); } - /** - * No read group collisions at this time because only one SAM file is currently supported. - * @return False always. - */ - public boolean hasReadGroupCollisions() { - return hasReadGroupCollisions; - } - /** * True if all readers have an index. * @return True if all readers have an index. */ public boolean hasIndex() { - if(enableLowMemorySharding) - return readerIDs.size() == bamIndices.size(); - else { - for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { - if(!reader.hasIndex()) - return false; - } - return true; - } + return readerIDs.size() == bamIndices.size(); } /** * Gets the index for a particular reader. Always preloaded. - * TODO: Should return object of type GATKBAMIndex, but cannot because there - * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new - * TODO: sharding system goes live. * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ - public Object getIndex(final SAMReaderID id) { - if(enableLowMemorySharding) - return bamIndices.get(id); - else { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - return readers.getReader(id).getBrowseableIndex(); - } + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); } /** @@ -454,7 +438,7 @@ public class SAMDataSource { } /** - * Gets the cumulative read metrics for shards already processed. + * Gets the cumulative read metrics for shards already processed. * @return Cumulative read metrics. */ public ReadMetrics getCumulativeReadMetrics() { @@ -507,10 +491,6 @@ public class SAMDataSource { } public StingSAMIterator seek(Shard shard) { - // todo: refresh monolithic sharding implementation - if(shard instanceof MonolithicShard) - return seekMonolithic(shard); - if(shard.buffersReads()) { return shard.iterator(); } @@ -540,7 +520,7 @@ public class SAMDataSource { */ private void initializeReaderPositions(SAMReaders readers) { for(SAMReaderID id: getReaderIDs()) - readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads()); + readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); } /** @@ -548,7 +528,6 @@ public class SAMDataSource { * @param readers Readers from which to load data. * @param shard The shard specifying the data limits. * @param enableVerification True to verify. For compatibility with old sharding strategy. - * TODO: Collapse this flag when the two sharding systems are merged. * @return An iterator over the selected data. */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { @@ -559,14 +538,20 @@ public class SAMDataSource { for(SAMReaderID id: getReaderIDs()) { CloseableIterator iterator = null; - if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null) - continue; - iterator = shard.getFileSpans().get(id) != null ? - readers.getReader(id).iterator(shard.getFileSpans().get(id)) : - readers.getReader(id).queryUnmapped(); + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. + if(shard.getFileSpans().get(id) == null) + throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + } + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(readProperties.getReadBufferSize() != null) iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); - if(shard.getGenomeLocs() != null) + if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); mergingIterator.addIterator(readers.getReader(id),iterator); } @@ -584,33 +569,6 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } - /** - * A stopgap measure to handle monolithic sharding - * @param shard the (monolithic) shard. - * @return An iterator over the monolithic shard. - */ - private StingSAMIterator seekMonolithic(Shard shard) { - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); - for(SAMReaderID id: getReaderIDs()) - mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator()); - - return applyDecoratingIterators(shard.getReadMetrics(), - shard instanceof ReadShard, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getDownsamplingMethod().toFraction, - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.defaultBaseQualities()); - } - /** * Adds this read to the given shard. * @param shard The shard to which to add the read. @@ -618,7 +576,7 @@ public class SAMDataSource { * @param read The read to add to the shard. */ private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing(); + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); shard.addRead(read); readerPositions.put(id,endChunk); } @@ -689,19 +647,6 @@ public class SAMDataSource { this.maxEntries = maxEntries; } - /** - * Dangerous internal method; retrieves any set of readers, whether in iteration or not. - * Used to handle non-exclusive, stateless operations, such as index queries. - * @return Any collection of SAMReaders, whether in iteration or not. - */ - protected SAMReaders getReadersWithoutLocking() { - synchronized(this) { - if(allResources.size() == 0) - createNewResource(); - } - return allResources.get(0); - } - /** * Choose a set of readers from the pool to use for this query. When complete, * @return @@ -753,6 +698,11 @@ public class SAMDataSource { */ private final Map readers = new LinkedHashMap(); + /** + * The inptu streams backing + */ + private final Map inputStreams = new LinkedHashMap(); + /** * Derive a new set of readers from the Reads metadata. * @param readerIDs reads to load. @@ -760,12 +710,20 @@ public class SAMDataSource { */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = new SAMFileReader(readerID.samFile); + File indexFile = findIndexFile(readerID.samFile); + + SAMFileReader reader = null; + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + inputStreams.put(readerID,blockInputStream); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); - reader.enableIndexMemoryMapping(false); - if(!enableLowMemorySharding) - reader.enableIndexCaching(true); reader.setValidationStringency(validationStringency); final SAMFileHeader header = reader.getFileHeader(); @@ -786,6 +744,15 @@ public class SAMDataSource { return readers.get(id); } + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + /** * Searches for the reader id of this reader. * @param reader Reader for which to search. @@ -883,7 +850,7 @@ public class SAMDataSource { * Filters out reads that do not overlap the current GenomeLoc. * Note the custom implementation: BAM index querying returns all reads that could * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. + * down performance, this implementation is highly customized to its task. */ private class IntervalOverlapFilteringIterator implements CloseableIterator { /** @@ -903,7 +870,7 @@ public class SAMDataSource { /** * Custom representation of interval bounds. - * Makes it simpler to track current position. + * Makes it simpler to track current position. */ private int[] intervalContigIndices; private int[] intervalStarts; @@ -941,7 +908,7 @@ public class SAMDataSource { i++; } } - + advance(); } @@ -1070,6 +1037,40 @@ public class SAMDataSource { return indexFile; } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. + */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any + * read that has been assigned + * @return + */ + public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. + * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. + */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); + return shardBalancer; + } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java new file mode 100644 index 000000000..f9f6539a7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; + +/** +* Created by IntelliJ IDEA. +* User: mhanna +* Date: 10/14/11 +* Time: 10:47 PM +* To change this template use File | Settings | File Templates. +*/ +class SAMReaderPosition { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state. + */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = filePosition; + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + try { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + positionIterator.next(); + // Check to see if the iterator has more data available. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) { + nextBlockAddress = positionIterator.peek().getBlockStart(); + break; + } + } + } + catch(Exception ex) { + throw new ReviewedStingException(""); + } + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java new file mode 100644 index 000000000..962208086 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Iterator; + +/** + * Balances maximally granular file pointers into shards of reasonable size. + */ +public abstract class ShardBalancer implements Iterable { + protected SAMDataSource readsDataSource; + protected PeekableIterator filePointers; + protected GenomeLocParser parser; + + public void initialize(final SAMDataSource readsDataSource, final Iterator filePointers, final GenomeLocParser parser) { + this.readsDataSource = readsDataSource; + this.filePointers = new PeekableIterator(filePointers); + this.parser = parser; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java deleted file mode 100644 index 989cf9fce..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import java.util.Iterator; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 4:55:37 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

- * Interface ShardStrategy - *

- * The base interface for the sharding strategy; before we had a base abstract - * class, but not this will be an interface to accomidate read based sharding - */ -public interface ShardStrategy extends Iterator, Iterable { -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java deleted file mode 100644 index 780b41ef7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * - * User: aaron - * Date: Apr 6, 2009 - * Time: 7:09:22 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 6, 2009 - *

- * Class ShardStrategyFactory - *

- * The Shard Strategy Factory, use this class to create and transfer shard strategies - * between different approaches. - */ -public class ShardStrategyFactory { - public enum SHATTER_STRATEGY { - MONOLITHIC, // Put all of the available data into one shard. - LOCUS_EXPERIMENTAL, - READS_EXPERIMENTAL - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L); - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser,readsDataSource,null); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); - } - - } - - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l); - - } - - /** - * get a new shatter strategy - * - * @param readsDataSource The reads used to shatter this file. - * @param referenceDataSource The reference used to shatter this file. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return A strategy for shattering this data. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser, readsDataSource,lst); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented"); - } - - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 673df6dfa..577db0965 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -30,10 +30,12 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; -import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; +import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram { // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals GenomeLocSortedSet intervalSortedSet = null; @@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram { logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); @@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram { logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index c8c79bb14..2c33a19b8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.CreateSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; +import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * Loads reference data from fasta file * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile index; + private IndexedFastaSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -173,7 +185,7 @@ public class ReferenceDataSource { logger.info("Treating existing index file as complete."); } - index = new CachingIndexedFastaSequenceFile(fastaFile); + reference = new CachingIndexedFastaSequenceFile(fastaFile); } catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); @@ -192,6 +204,52 @@ public class ReferenceDataSource { * @return IndexedFastaSequenceFile that was created from file */ public IndexedFastaSequenceFile getReference() { - return this.index; + return this.reference; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + return shards; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 162baed00..b0043e68c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; @@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } - public Object execute( Walker walker, ShardStrategy shardStrategy ) { + public Object execute( Walker walker, Iterable shardStrategy ) { // Fast fail for walkers not supporting TreeReducible interface. if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index deafcd0cc..ff5e1064b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; @@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler { * @param walker Computation to perform over dataset. * @param shardStrategy A strategy for sharding the data. */ - public Object execute(Walker walker, ShardStrategy shardStrategy) { + public Object execute(Walker walker, Iterable shardStrategy) { walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index e731b9864..d013db7e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -30,11 +30,11 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads the informations associated with the reads * @param reference the reference file * @param rods the rods to include in the traversal - * @param nThreadsToUse Number of threads to utilize. + * @param threadAllocation Number of threads to utilize. * * @return The best-fit microscheduler. */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (walker instanceof TreeReducible && nThreadsToUse > 1) { + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse)); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads()); } else { - if(nThreadsToUse > 1) + if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods); } @@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * * @return the return type of the walker */ - public abstract Object execute(Walker walker, ShardStrategy shardStrategy); + public abstract Object execute(Walker walker, Iterable shardStrategy); /** * Retrieves the object responsible for tracking and managing output. diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java new file mode 100644 index 000000000..0c81af07b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.resourcemanagement; + +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Models how threads are distributed between various components of the GATK. + */ +public class ThreadAllocation { + /** + * The number of CPU threads to be used by the GATK. + */ + private final int numCPUThreads; + + /** + * Number of threads to devote exclusively to IO. Default is 0. + */ + private final int numIOThreads; + + public int getNumCPUThreads() { + return numCPUThreads; + } + + public int getNumIOThreads() { + return numIOThreads; + } + + /** + * Construct the default thread allocation. + */ + public ThreadAllocation() { + this(1,null,null); + } + + /** + * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. + * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). + * @param totalThreads Complete number of threads to allocate. + * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numIOThreads Total number of threads allocated exclusively to IO. + */ + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { + // If no allocation information is present, allocate all threads to CPU + if(numCPUThreads == null && numIOThreads == null) { + this.numCPUThreads = totalThreads; + this.numIOThreads = 0; + } + // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). + else if(numIOThreads == null) { + if(numCPUThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = totalThreads - numCPUThreads; + } + // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). + else if(numCPUThreads == null) { + if(numIOThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); + this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); + this.numIOThreads = numIOThreads; + } + else { + if(numCPUThreads + numIOThreads != totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = numIOThreads; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index b39fdd79d..a14d999ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial return infoDB.getSample(paternalID); } + public ArrayList getParents(){ + ArrayList parents = new ArrayList(2); + Sample parent = getMother(); + if(parent != null) + parents.add(parent); + parent = getFather(); + if(parent != null) + parents.add(parent); + return parents; + } + /** * Get gender of the sample * @return property of key "gender" - must be of type Gender diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8098de5b1..ab38b69cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 85977bf8e..1956dac6c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public enum InfoFieldKey { EFFECT_KEY ("SNPEFF_EFFECT", -1), IMPACT_KEY ("SNPEFF_IMPACT", 0), - CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), - AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), - GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), - GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), - TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), - EXON_ID_KEY ("SNPEFF_EXON_ID", 7), - FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1); + FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7), + EXON_ID_KEY ("SNPEFF_EXON_ID", 8); // Actual text of the key private final String keyName; @@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // are validated against this list. public enum EffectType { // High-impact effects: - FRAME_SHIFT (EffectFunctionalClass.NONE, false), - STOP_GAINED (EffectFunctionalClass.NONSENSE, false), - START_LOST (EffectFunctionalClass.NONE, false), - SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false), - SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false), - EXON_DELETED (EffectFunctionalClass.NONE, false), - STOP_LOST (EffectFunctionalClass.NONE, false), + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + EXON_DELETED, + FRAME_SHIFT, + STOP_GAINED, + STOP_LOST, // Moderate-impact effects: - NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false), - CODON_CHANGE (EffectFunctionalClass.NONE, false), - CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_DELETION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false), - UTR_5_DELETED (EffectFunctionalClass.NONE, false), - UTR_3_DELETED (EffectFunctionalClass.NONE, false), + NON_SYNONYMOUS_CODING, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + UTR_5_DELETED, + UTR_3_DELETED, // Low-impact effects: - SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - START_GAINED (EffectFunctionalClass.NONE, false), + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + START_GAINED, + SYNONYMOUS_CODING, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, // Modifiers: - NONE (EffectFunctionalClass.NONE, true), - CHROMOSOME (EffectFunctionalClass.NONE, true), - INTERGENIC (EffectFunctionalClass.NONE, true), - UPSTREAM (EffectFunctionalClass.NONE, true), - UTR_5_PRIME (EffectFunctionalClass.NONE, true), - CDS (EffectFunctionalClass.NONE, true), - GENE (EffectFunctionalClass.NONE, true), - TRANSCRIPT (EffectFunctionalClass.NONE, true), - EXON (EffectFunctionalClass.NONE, true), - INTRON (EffectFunctionalClass.NONE, true), - UTR_3_PRIME (EffectFunctionalClass.NONE, true), - DOWNSTREAM (EffectFunctionalClass.NONE, true), - INTRON_CONSERVED (EffectFunctionalClass.NONE, true), - INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true), - REGULATION (EffectFunctionalClass.NONE, true), - CUSTOM (EffectFunctionalClass.NONE, true), - WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true); - - private final EffectFunctionalClass functionalClass; - private final boolean isModifier; - - EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) { - this.functionalClass = functionalClass; - this.isModifier = isModifier; - } - - public EffectFunctionalClass getFunctionalClass() { - return functionalClass; - } - - public boolean isModifier() { - return isModifier; - } + NONE, + CHROMOSOME, + CUSTOM, + CDS, + GENE, + TRANSCRIPT, + EXON, + INTRON_CONSERVED, + UTR_5_PRIME, + UTR_3_PRIME, + DOWNSTREAM, + INTRAGENIC, + INTERGENIC, + INTERGENIC_CONSERVED, + UPSTREAM, + REGULATION, + INTRON } - // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of - // classifying some of the LOW impact effects as MODIFIERs. + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER. public enum EffectImpact { MODIFIER (0), LOW (1), @@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio UNKNOWN } - // We assign a functional class to each SnpEff effect. + // SnpEff assigns a functional class to each effect. public enum EffectFunctionalClass { NONE (0), SILENT (1), @@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public List getKeyNames() { return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), InfoFieldKey.GENE_NAME_KEY.getKeyName(), InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), - InfoFieldKey.EXON_ID_KEY.getKeyName(), - InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName() + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } @@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return Arrays.asList( new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())), new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"), new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())) + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio protected static class SnpEffEffect { private EffectType effect; private EffectImpact impact; + private EffectFunctionalClass functionalClass; private String codonChange; private String aminoAcidChange; private String geneName; @@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private String parseError = null; private boolean isWellFormed = true; - private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11; - // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, - // errors come after warnings, not vice versa: - private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; - private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + // If there is either a warning OR an error, it will be in the last field. If there is both + // a warning AND an error, the warning will be in the second-to-last field, and the error will + // be in the last field. + private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1; + private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2; + private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1; - private static final int SNPEFF_CODING_FIELD_INDEX = 5; + // Position of the field indicating whether the effect is coding or non-coding. This field is used + // in selecting the most significant effect, but is not included in the annotations we return + // since it can be deduced from the SNPEFF_GENE_BIOTYPE field. + private static final int SNPEFF_CODING_FIELD_INDEX = 6; public SnpEffEffect ( String effectName, String[] effectMetadata ) { parseEffectName(effectName); @@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private void parseEffectMetadata ( String[] effectMetadata ) { if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { - if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { - parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) { + parseError(String.format("SnpEff issued the following warning or error: \"%s\"", + effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR])); } - else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { - parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) { + parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"", + effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR], + effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR])); } else { parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", @@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return; } - if ( effect != null && effect.isModifier() ) { - impact = EffectImpact.MODIFIER; + // The impact field will never be empty, and should always contain one of the enumerated values: + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } - else { + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + // The functional class field will be empty when the effect has no functional class associated with it: + if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) { try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()])); } } + else { + functionalClass = EffectFunctionalClass.NONE; + } codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + // The coding field will be empty when SnpEff has no coding info for the effect: if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); @@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return true; } else if ( impact.isSameImpactAs(other.impact) ) { - return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass()); + return functionalClass.isHigherPriorityThan(other.functionalClass); } return false; @@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString()); addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); - addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString()); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index b5987963f..106bb1982 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils; * Time: 6:46:09 PM * To change this template use File | Settings | File Templates. */ -enum DiploidGenotype { +public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), AG ('A', 'G'), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 666fe88a3..295cf8688 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); if ( elt.isReducedRead() ) { // reduced read representation - byte qual = elt.getQual(); - if ( BaseUtils.isRegularBase( elt.getBase() )) { + if ( BaseUtils.isRegularBase( obsBase )) { add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods return elt.getRepresentativeCount(); // we added nObs bases here - } else // odd bases or deletions => don't use them - return 0; - } else { - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; + } + + // odd bases or deletions => don't use them + return 0; } + + return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { @@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @return */ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) return 0; - } else { - byte qual = p.getQual(); - if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); - if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); - if ( (int)qual < minBaseQual ) - qual = (byte)0; + byte qual = p.getQual(); - return qual; - } + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; } // ----------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 489e963e8..74c55dbfe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param priors priors to use for GLs * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not set - * - * @param useBAQedPileup + * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @return genotype likelihoods per sample for AA, AB, BB */ public abstract Allele getLikelihoods(RefMetaDataTracker tracker, @@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse, boolean useBAQedPileup); + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index bdd4e2c65..369c2d0c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -258,7 +258,7 @@ public class UnifiedGenotyper extends LocusWalker result = new HashSet(); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); return result; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 2d71ea8a8..8585104d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -7,35 +7,80 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; /** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * */ -public class PhaseByTransmission extends RodWalker { +public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") - public ArrayList familySpecs = null; + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; + + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; @Output protected VCFWriter vcfWriter = null; @@ -43,241 +88,633 @@ public class PhaseByTransmission extends RodWalker { private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + public final double NO_TRANSMISSION_PROB = -1.0; - private class Trio { - private String mother; - private String father; - private String child; + private ArrayList trios = new ArrayList(); - public Trio(String mother, String father, String child) { - this.mother = mother; - this.father = father; - this.child = child; - } + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; - public Trio(String familySpec) { - String[] pieces = familySpec.split("[\\+\\=]"); + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; - this.mother = pieces[0]; - this.father = pieces[1]; - this.child = pieces[2]; - } + //Metrics counters hash keys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte NUM_GENOTYPES_MODIFIED = 11; - public String getMother() { return mother; } - public String getFather() { return father; } - public String getChild() { return child; } + //Random number generator + private Random rand = new Random(); + + private enum FamilyMember { + MOTHER, + FATHER, + CHILD } - private ArrayList trios = new ArrayList(); + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. + private class TrioPhase { - public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { - if (familySpecs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // spec list set, and treat the entries as if they had been specified on the command line. - ArrayList specs = new ArrayList(); - for (String familySpec : familySpecs) { - File specFile = new File(familySpec); + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; - try { - XReadLines reader = new XReadLines(specFile); + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); - List lines = reader.readLines(); - for (String line : lines) { - specs.add(new Trio(line)); - } - } catch (FileNotFoundException e) { - specs.add(new Trio(familySpec)); // not a file, so must be a family spec + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else{ + return null; + } + return alleles; + } + + private boolean isPhasable(Genotype.Type genotype){ + return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; + } + + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true)); + } + else + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false)); + } + + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + + //Special case for Het/Het as it is ambiguous + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + return; + } + + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the mother and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + } + } + + //Phases a family by transmission + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); } } - return specs; + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false)); } - return new ArrayList(); + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Take care of cases where one or more family members are no call + if(!isPhasable(child)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else if(!isPhasable(mother)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(!isPhasable(father)){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(!isPhasable(father)){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } + } + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) + phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); + + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); + + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); + + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double log10Error; + if(genotype.getType() == phasedGenotype.getType()) + log10Error = genotype.getLog10PError(); + else + log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); + + return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased()); + } + + } /** - * Parse the familial relationship specification, and initialize VCF writer + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer */ public void initialize() { - trios = getFamilySpecsFromCommandLineInput(familySpecs); - ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + //Get the trios from the families passed as ped + setTrios(); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + + Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL"); + } - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); + /** + * Select trios and parent/child pairs only + */ + private void setTrios(){ - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; - - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; - } - - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getLog10PError(), null, g.getAttributes(), false); - - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getLog10PError(), null, g.getAttributes(), false); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getLog10PError(), null, g.getAttributes(), false); - - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - - return genotypes; - } - - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; - - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; + Map> families = this.getSampleDB().getFamilies(); + Set family; + ArrayList parents; + for(String familyID : families.keySet()){ + family = families.get(familyID); + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size())); } - } - - return matchingAlleles; - } - - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0; - - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; - - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); - - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); - } - - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); - - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); - - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getLog10PError(), child.getFilters(), child.getAttributes(), true); - - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); - } - } - - ArrayList finalGenotypes = new ArrayList(); - - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); - - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); - - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getLog10PError(), mom.getFilters(), mom.getAttributes(), true); - - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1); - - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); - - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getLog10PError(), dad.getFilters(), dad.getAttributes(), true); - - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); - - return finalGenotypes; - } - } - - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - return finalGenotypes; - } - - private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - - if (mother.isCalled() && father.isCalled() && child.isCalled()) { - ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); - ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); - - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMotherGenotype = mother; - Genotype bestFatherGenotype = father; - Genotype bestChildGenotype = child; - - double norm = 0.0; - - for (Genotype motherGenotype : possibleMotherGenotypes) { - for (Genotype fatherGenotype : possibleFatherGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); - norm += prior*configurationLikelihood; - - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMotherGenotype = motherGenotype; - bestFatherGenotype = fatherGenotype; - bestChildGenotype = childGenotype; - } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID)); + break; } } } - if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { - Map attributes = new HashMap(); - attributes.putAll(bestChildGenotype.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + } - finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); + + + } + + //Create the transmission matrices + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(Genotype.Type.class); + transmissionMatrix = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mother : Genotype.Type.values()){ + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + for(Genotype.Type child : Genotype.Type.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } + + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. + private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; } } - return finalGenotypes; + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } + + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; + + //MV + return 1; + } + + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. + private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } + + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isCalled()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); + likelihoods.put(Genotype.Type.HET,1.0/3.0); + likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } + + //Returns the Genotype.Type; returns UNVAILABLE if given null + private Genotype.Type getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return Genotype.Type.UNAVAILABLE; + return genotype.getType(); + } + + + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. + * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); + Genotype.Type pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); + pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + int configuration_index =0; + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + //Get the most likely combination + //Only check for most likely combination if at least a parent and the child have genotypes + if(child.isCalled() && parentsCalled > 0){ + int mvCount; + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); + bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(cumulativeMVCount/3); + } + configurationLikelihood = 0; + } + } + } + + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + + //In case of multiple equally likely combinations, take a random one + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); + } + + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; + } + + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); + + } + + + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } } /** @@ -289,53 +726,153 @@ public class PhaseByTransmission extends RodWalker { * @return null */ @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + String mvfLine; + if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); - GenotypesContext genotypesContext = GenotypesContext.create(vc.getGenotypes().size()); + GenotypesContext genotypeMap = vc.getGenotypes(); - for (Trio trio : trios) { - Genotype mother = vc.getGenotype(trio.getMother()); - Genotype father = vc.getGenotype(trio.getFather()); - Genotype child = vc.getGenotype(trio.getChild()); + int mvCount; - ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); - genotypesContext.add(phasedMother, phasedFather, phasedChild); + //Fill the genotype map with the new genotypes and increment metrics counters + genotypeMap.add(phasedChild); + if(mother != null){ + genotypeMap.add(phasedMother); + if(father != null){ + genotypeMap.add(phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + } + else{ + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } + } + else{ + genotypeMap.add(phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } + + //Report violation if set so + //TODO: ADAPT FOR PAIRS TOO!! + if(mvCount>0 && mvFile != null) + mvFile.println(mvfLine); + } - VariantContext newvc = new VariantContextBuilder(vc).genotypes(genotypesContext).make(); - - vcfWriter.add(newvc); + vcfWriter.add(new VariantContextBuilder(vc).genotypes(genotypeMap).make()); } - - return null; + return metricsCounters; } /** - * Provide an initial value for reduce computations. + * Initializes the reporting counters. * - * @return Initial value of reduce. + * @return All counters initialized to 0 */ @Override - public Integer reduceInit() { - return null; + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + return metricsCounters; } /** - * Reduces a single map with the accumulator provided as the ReduceType. + * Adds the value of the site phased to the reporting counters. * - * @param value result of the map. - * @param sum accumulator for the reduce. + * @param value Site values + * @param sum accumulator for the reporting counters * @return accumulator with result of the map taken into account. */ @Override - public Integer reduce(Integer value, Integer sum) { - return null; + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + + return sum; + } + + + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. + */ + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java deleted file mode 100644 index e770418c1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.ReadPairWalker; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.List; - -/** - * Counts the number of read pairs encountered in a file sorted in - * query name order. Breaks counts down by total pairs and number - * of paired reads. - * - * - *

Input

- *

- * One or more bam files. - *

- * - *

Output

- *

- * Number of pairs seen. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CountPairs \
- *   -o output.txt \
- *   -I input.bam
- * 
- * - * @author mhanna - */ -public class CountPairsWalker extends ReadPairWalker { - @Output - private PrintStream out; - - /** - * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec. - */ - private long firstOfPair = 0; - - /** - * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec. - */ - private long secondOfPair = 0; - - /** - * A breakdown of the total number of reads seen with exactly the same read name. - */ - private List pairCountsByType = new ExpandingArrayList(); - - /** - * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser. - * @param reads Collection of reads having the same name. - * @return Semantics defined by implementer. - */ - @Override - public Integer map(Collection reads) { - if(pairCountsByType.get(reads.size()) != null) - pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1); - else - pairCountsByType.set(reads.size(),1L); - - for(SAMRecord read: reads) { - if(read.getFirstOfPairFlag()) firstOfPair++; - if(read.getSecondOfPairFlag()) secondOfPair++; - } - - return 1; - } - - /** - * No pairs at the beginning of a traversal. - * @return 0 always. - */ - @Override - public Long reduceInit() { - return 0L; - } - - /** - * Combine number of pairs seen in this iteration (always 1) with total number of pairs - * seen in previous iterations. - * @param value Pairs in this iteration (1), from the map function. - * @param sum Count of all pairs in prior iterations. - * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1). - */ - @Override - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Print summary statistics over the entire traversal. - * @param sum A count of all read pairs viewed. - */ - @Override - public void onTraversalDone(Long sum) { - out.printf("Total number of pairs : %d%n",sum); - out.printf("Total number of first reads in pair : %d%n",firstOfPair); - out.printf("Total number of second reads in pair: %d%n",secondOfPair); - for(int i = 1; i < pairCountsByType.size(); i++) { - if(pairCountsByType.get(i) == null) - continue; - out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i)); - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 9c24360c5..babc88966 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -270,8 +270,8 @@ public class SelectVariants extends RodWalker { private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable - * number of variants. Use --select_random_fraction for larger numbers of variants. + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory + * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; @@ -527,7 +527,7 @@ public class SelectVariants extends RodWalker { } } if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub, ref.getBase()); + randomlyAddVariant(++variantNumber, sub); } else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); @@ -691,7 +691,7 @@ public class SelectVariants extends RodWalker { return new VariantContextBuilder(builder.make()).attributes(attributes).make(); } - private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { + private void randomlyAddVariant(int rank, VariantContext vc) { if (nVariantsAdded < numRandom) variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e10bcbaa0..8cba183da 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -554,4 +554,54 @@ public class GenomeLocParser { return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } + /** + * Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + + int start = loc.getStart() - maxBasePairs; + int stop = loc.getStart() - 1; + + if (start < 1) + start = 1; + if (stop < 1) + return null; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } + + /** + * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + int contigLength = contig.getSequenceLength(); + + int start = loc.getStop() + 1; + int stop = loc.getStop() + maxBasePairs; + + if (start > contigLength) + return null; + if (stop > contigLength) + stop = contigLength; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 6e4ddddc4..8c1061494 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -171,6 +171,9 @@ public class ReadClipper { clippedRead = op.apply(algorithm, clippedRead); } wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return new GATKSAMRecord( clippedRead.getHeader() ); return clippedRead; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // this should never happen diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 1aafafc27..92c8840fb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -353,7 +353,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { // some exceptions if ( key.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( Math.abs(g.getLog10PError() + Genotype.NO_LOG10_PERROR) < 1e-6) + if ( ! g.hasLog10PError() ) val = VCFConstants.MISSING_VALUE_v4; else { val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f0e164c87..159b145a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -233,8 +233,12 @@ public class IntervalUtils { * * Returns a null string if there are no differences, otherwise returns a string describing the difference * (useful for UnitTests). Assumes both lists are sorted + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no difference, otherwise a string describing the difference */ - public static final String equateIntervals(List masterArg, List testArg) { + public static String equateIntervals(List masterArg, List testArg) { LinkedList master = new LinkedList(masterArg); LinkedList test = new LinkedList(testArg); @@ -317,23 +321,6 @@ public class IntervalUtils { return lengths; } - /** - * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param locs The genome locs. - * @return The maximum number of parts the intervals can be split into. - */ - public static int countContigIntervals(List locs) { - int maxFiles = 0; - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } - } - return maxFiles; - } - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -373,7 +360,6 @@ public class IntervalUtils { * @return A list of lists of genome locs, split according to splits */ public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; int start = 0; List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { @@ -465,7 +451,7 @@ public class IntervalUtils { @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) @Ensures({"result != null"}) - final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { final List split = new ArrayList(); long size = 0; @@ -579,10 +565,101 @@ public class IntervalUtils { } } - public static final long intervalSize(final List locs) { + public static long intervalSize(final List locs) { long size = 0; for ( final GenomeLoc loc : locs ) size += loc.size(); return size; } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (String contig: locsByContig.keySet()) { + List contigLocs = locsByContig.get(contig); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + expanded.add(stopLoc); + } + return expanded; + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 6d7c8dad9..d3a52167a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -261,7 +261,7 @@ public class GATKSAMRecord extends BAMRecord { * @return true if the read has no bases */ public boolean isEmpty() { - return this.getReadLength() == 0; + return super.getReadBases() == null || super.getReadLength() == 0; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java index d3cc7d6a5..98032f94d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java @@ -25,8 +25,7 @@ final class CommonInfo { public CommonInfo(String name, double log10PError, Set filters, Map attributes) { this.name = name; setLog10PError(log10PError); - if ( filters != null && ! filters.isEmpty() ) - this.filters = filters; + this.filters = filters; if ( attributes != null && ! attributes.isEmpty() ) { this.attributes = attributes; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index dba16cf86..bbe5308a9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,7 +25,13 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.io.DirectOutputTracker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.jgrapht.util.MathUtil; + +import java.util.EnumMap; +import java.util.Map; public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; @@ -94,6 +100,47 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + + //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype + public double getLog10GQ(Genotype.Type genotype){ + EnumMap likelihoods = getAsMap(false); + if(likelihoods == null) + return Double.NEGATIVE_INFINITY; + + double qual = Double.NEGATIVE_INFINITY; + for(Map.Entry likelihood : likelihoods.entrySet()){ + if(likelihood.getKey() == genotype) + continue; + if(likelihood.getValue() > qual) + qual = likelihood.getValue(); + } + + //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) + qual = likelihoods.get(genotype) - qual; + + //Quality of other genotypes 1-P(G) + if (qual < 0) { + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = Math.log10(1.0 - chosenGenotype); + } + return -1 * qual; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 5ad734b79..34131b9c4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -318,7 +318,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati public VariantContext subContextFromSamples(Set sampleNames, Collection alleles) { loadGenotypes(); VariantContextBuilder builder = new VariantContextBuilder(this); - return builder.genotypes(genotypes.subsetToSamples(sampleNames)).make(); + return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make(); } public VariantContext subContextFromSamples(Set sampleNames) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 8d7dd82ac..17a7d1974 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java index dc3a6cafe..62c93bddd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -42,7 +43,7 @@ import java.util.Collections; public class MockLocusShard extends LocusShard { public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { super( genomeLocParser, - new SAMDataSource(Collections.emptyList(),genomeLocParser), + new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), intervals, null); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java deleted file mode 100755 index e41a6b3b7..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java +++ /dev/null @@ -1,223 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import static org.testng.Assert.fail; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * - * User: aaron - * Date: Apr 8, 2009 - * Time: 8:14:23 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 - *

- * Class SAMBAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMBAMDataSourceUnitTest extends BaseTest { - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - StingSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testMergingTwoBAMFiles() { - logger.warn("Executing testMergingTwoBAMFiles"); - - // setup the test files - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - - ArrayList readcountPerShard = new ArrayList(); - ArrayList readcountPerShard2 = new ArrayList(); - - // count up the first hundred shards - int shardsToCount = 100; - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - - } - readcountPerShard.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - - // setup the data and the counter before our second run - readers.clear(); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - count = 0; - // the sharding strat. - data = new SAMDataSource(readers,genomeLocParser); - strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser); - - logger.debug("Pile two:"); - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - // can we leave? - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - } - - readcountPerShard2.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - /*int pos = 0; - for (; pos < 100; pos++) { - if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { - fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos)); - } - } */ - - } - - - - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java new file mode 100755 index 000000000..ba2d68ec9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import static org.testng.Assert.fail; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * @author aaron + * @version 1.0 + * @date Apr 8, 2009 + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + false); + + Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + StingSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 9de4d5c04..91c18078e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.*; import java.util.*; @@ -66,9 +67,9 @@ public class RefMetaDataTrackerUnitTest { C = Allele.create("C"); G = Allele.create("G"); T = Allele.create("T"); - AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C).make()); - AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G).make()); - AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T).make()); + AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make(); + AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make(); + AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make(); span10_10 = makeSpan(10, 10); span1_20 = makeSpan(1, 20); span10_20 = makeSpan(10, 20); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 7f4d96add..9226f97e2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest { private List bamList; private Walker countReadWalker; private File output; - private long readSize = 100000; private TraverseReads traversalEngine = null; private IndexedFastaSequenceFile ref = null; @@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testUnmappedReadCount() { - SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - ref.getSequenceDictionary(), - readSize, - genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); - while (shardStrategy.hasNext()) { + for(Shard shard: shardStrategy) { traversalEngine.startTimersIfNecessary(); - Shard shard = shardStrategy.next(); if (shard == null) { fail("Shard == null"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java index 462abeba1..5c8fa32a8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -33,7 +33,7 @@ public class SnpEffUnitTest { @Test public void testParseWellFormedEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); @@ -42,7 +42,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectNameEffect() { String effectName = "MADE_UP_EFFECT"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -51,7 +51,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectImpactEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -60,27 +60,27 @@ public class SnpEffUnitTest { @Test public void testParseWrongNumberOfMetadataFieldsEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); } @Test - public void testParseSnpEffWarningEffect() { + public void testParseSnpEffOneWarningOrErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); } @Test - public void testParseSnpEffErrorEffect() { + public void testParseSnpEffBothWarningAndErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index b2786117f..3bfb81dd0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a6687f0d3830fa6e518b7874857f6f70")); + Arrays.asList("9beb795536e95954f810835c6058f2ad")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64b6804cb1e27826e3a47089349be581")); + Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -64,7 +64,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b59508cf66da6b2de280a79b3b7d85b1")); + Arrays.asList("49d989f467b8d6d8f98f7c1b67cd4a05")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -72,7 +72,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); + Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -80,7 +80,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b8e18b23568e4d2381f51d4430213040")); + Arrays.asList("33062eccd6eb73bc49440365430454c4")); executeTest("test exclude annotations", spec); } @@ -88,7 +88,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); + Arrays.asList("062155edec46a8c52243475fbf3a2943")); executeTest("test overwriting header", spec); } @@ -96,7 +96,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("16e3a1403fc376320d7c69492cad9345")); + Arrays.asList("06635f2dd91b539bfbce9bf7914d8e43")); executeTest("not passing it any reads", spec); } @@ -104,7 +104,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); + Arrays.asList("820eeba1f6e3a0758a69d937c524a38e")); executeTest("getting DB tag with dbSNP", spec); } @@ -112,7 +112,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); + Arrays.asList("31cc2ce157dd20771418c08d6b3be1fa")); executeTest("getting DB tag with HM3", spec); } @@ -120,7 +120,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31")); + Arrays.asList("074865f8f8c0ca7bfd58681f396c49e9")); executeTest("using expression", spec); } @@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5")); + Arrays.asList("97b26db8135d083566fb585a677fbe8a")); executeTest("using expression with ID", spec); } @@ -148,9 +148,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("122321a85e448f21679f6ca15c5e22ad") + Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1c01fbdd4..6d4a971a5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; import java.util.HashMap; import java.util.List; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index c663c1dd7..2cd76e7a5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -6,23 +6,131 @@ import org.testng.annotations.Test; import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { - private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission"; - private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; + private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; + private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; + private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; + private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; + private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; + //Tests using PbT on all genotypes with default parameters + //And all reporting options @Test - public void testBasicFunctionality() { + public void testTrueNegativeMV() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", "-NO_HEADER", "-R " + b37KGReference, - "--variant " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", + "--variant " + TNTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") + ); + executeTest("testTrueNegativeMV", spec); + } + + @Test + public void testTruePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + TPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") + ); + executeTest("testTruePositiveMV", spec); + } + + @Test + public void testFalsePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") + ); + executeTest("testFalsePositiveMV", spec); + } + + @Test + public void testSpecialCases() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + SpecialTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") + ); + executeTest("testSpecialCases", spec); + } + + //Test using a different prior + //Here the FP file is used but as the prior is lowered, 3 turn to TP + @Test + public void testPriorOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-prior 1e-4", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") + ); + executeTest("testPriorOption", spec); + } + + //Test when running without MV reporting option + //This is the exact same test file as FP but should not generate a .mvf file + @Test + public void testMVFileOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", "-o %s" ), 1, - Arrays.asList("") + Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") ); - executeTest("testBasicFunctionality", spec); + executeTest("testMVFileOption", spec); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3dceb9bd2..102d4715e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, - Arrays.asList("d9dcb352c53106f54fcc981f15d35a90") + Arrays.asList("a36414421621b377d6146d58d2fcecd0") ); executeTest("testFunctionClassWithSnpeff", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index f1f849bf5..e9f138a0e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigIndexValid() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } @@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigInfoKnownContig() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference } @@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } + + private static class FlankingGenomeLocTestData extends TestDataProvider { + final GenomeLocParser parser; + final int basePairs; + final GenomeLoc original, flankStart, flankStop; + + private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) { + super(FlankingGenomeLocTestData.class, name); + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.flankStart = flankStart == null ? null : parse(parser, flankStart); + this.flankStop = flankStop == null ? null : parse(parser, flankStop); + } + + private static GenomeLoc parse(GenomeLocParser parser, String str) { + return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); + } + } + + @DataProvider(name = "flankingGenomeLocs") + public Object[][] getFlankingGenomeLocs() { + int contigLength = 10000; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength); + GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary()); + + new FlankingGenomeLocTestData("atStartBase1", parser, 1, + "chr1:1", null, "chr1:2"); + + new FlankingGenomeLocTestData("atStartBase50", parser, 50, + "chr1:1", null, "chr1:2-51"); + + new FlankingGenomeLocTestData("atStartRange50", parser, 50, + "chr1:1-10", null, "chr1:11-60"); + + new FlankingGenomeLocTestData("atEndBase1", parser, 1, + "chr1:" + contigLength, "chr1:" + (contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndBase50", parser, 50, + "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 10, contigLength), + String.format("chr1:%d-%d", contigLength - 60, contigLength - 11), + null); + + new FlankingGenomeLocTestData("nearStartBase1", parser, 1, + "chr1:2", "chr1:1", "chr1:3"); + + new FlankingGenomeLocTestData("nearStartRange50", parser, 50, + "chr1:21-30", "chr1:1-20", "chr1:31-80"); + + new FlankingGenomeLocTestData("nearEndBase1", parser, 1, + "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength); + + new FlankingGenomeLocTestData("nearEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 30, contigLength - 21), + String.format("chr1:%d-%d", contigLength - 80, contigLength - 31), + String.format("chr1:%d-%d", contigLength - 20, contigLength)); + + new FlankingGenomeLocTestData("beyondStartBase1", parser, 1, + "chr1:3", "chr1:2", "chr1:4"); + + new FlankingGenomeLocTestData("beyondStartRange50", parser, 50, + "chr1:101-200", "chr1:51-100", "chr1:201-250"); + + new FlankingGenomeLocTestData("beyondEndBase1", parser, 1, + "chr1:" + (contigLength - 3), + "chr1:" + (contigLength - 4), + "chr1:" + (contigLength - 2)); + + new FlankingGenomeLocTestData("beyondEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 200, contigLength - 101), + String.format("chr1:%d-%d", contigLength - 250, contigLength - 201), + String.format("chr1:%d-%d", contigLength - 100, contigLength - 51)); + + new FlankingGenomeLocTestData("unmapped", parser, 50, + "unmapped", null, null); + + new FlankingGenomeLocTestData("fullContig", parser, 50, + "chr1", null, null); + + return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStart); + assertEquals(actual, data.flankStart, description); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStop); + assertEquals(actual, data.flankStop, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 3f5d05e66..7a2696b7b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest { double t6 = t.getElapsedTime(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - - t.stop().start(); - Assert.assertTrue(t.isRunning(), "second started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset"); - Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time } private final static void idleLoop() { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index f625af23c..ecb5a6d33 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -30,8 +30,10 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import org.testng.annotations.*; + +import java.util.LinkedList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -44,180 +46,214 @@ public class ReadClipperUnitTest extends BaseTest { // TODO: Add error messages on failed tests + //int debug = 0; + GATKSAMRecord read, expected; ReadClipper readClipper; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 - @BeforeClass + + public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) { + Assert.assertEquals(read.getReadBases(), readBases); + Assert.assertEquals(read.getBaseQualityString(), baseQuals); + Assert.assertEquals(read.getCigarString(), cigar); + } + + public class testParameter { + int inputStart; + int inputStop; + int substringStart; + int substringStop; + String cigar; + + public testParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) { + inputStart = InputStart; + inputStop = InputStop; + substringStart = SubstringStart; + substringStop = SubstringStop; + cigar = Cigar; + } + } + + // What the test read looks like + // Ref: 1 2 3 4 5 6 7 8 + // Read: 0 1 2 3 - - - - + // ----------------------------- + // Bases: A C T G - - - - + // Quals: ! + 5 ? - - - - + + @BeforeMethod public void init() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); read.setBaseQualityString(new String(QUALS)); readClipper = new ReadClipper(read); + //logger.warn(read.getCigarString()); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipBothEndsByReferenceCoordinates() { - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + //int debug = 1; //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader())); + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader())); + //clip 1 base - expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4); Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); Assert.assertEquals(expected.getCigarString(), "1H2M1H"); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReadCoordinates(0,0); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(0,0,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(3,3,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(0,1,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(2,3,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReadCoordinates(3,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReadCoordinates(0,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReadCoordinates(2,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinates() { logger.warn("Executing testHardClipByReferenceCoordinates"); - + //logger.warn(debug); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinates(-1,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(-1,1,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(4,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(-1,2,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(3,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinates(3,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinates(-1,2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinates(2,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinates(p.inputStart,p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesLeftTail() { + init(); logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start + testList.add(new testParameter(2, -1, 2, 4, "2H2M"));//clip 2 bases at start - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesRightTail() { + init(); logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader())); - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); + List testList = new LinkedList(); + testList.add(new testParameter(-1, 4, 0, 3, "3M1H"));//clip 1 base at end + testList.add(new testParameter(-1, 3, 0, 2, "2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) // TODO This function is returning null reads public void testHardClipLowQualEnds() { - logger.warn("Executing testHardClipByReferenceCoordinates"); + logger.warn("Executing testHardClipByReferenceCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 2 bases at start - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + List testList = new LinkedList(); + testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(11,-1,2,4,"2H2M"));//clip 2 bases at start + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + /* todo find a better way to test lowqual tail clipping on both sides // Reverse Quals sequence readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - //clip 1 base at end - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); + testList = new LinkedList(); + testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - // revert Qual sequence - readClipper.getRead().setBaseQualityString(QUALS); + for ( testParameter p : testList ) { + init(); + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + */ } -} + + public class CigarReadMaker { + + } + + @Test ( enabled = false ) + public void testHardClipSoftClippedBases() { + + // Generate a list of cigars to test + // We will use testParameter in the following way + // Right tail, left tail, + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 9c3b905c2..03d33d2c5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,8 +1,8 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest { List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. + */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. */ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. */ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) + public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 9243588ab..70a18856f 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; + /** * Basic unit test for Genotype likelihoods objects @@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest { gl.getAsVector(); } + @Test + public void testGetAsMap(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(v); + //Log scale + EnumMap glMap = gl.getAsMap(false); + Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Linear scale + glMap = gl.getAsMap(true); + double [] vl = MathUtils.normalizeFromLog10(v); + Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + glMap = gl.getAsMap(false); + Assert.assertNull(glMap); + + } + + @Test + public void testGetLog10GQ(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); + + //GQ for the best guess genotype + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),-3.9); + + double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); + + //GQ for the other genotypes + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 4c0d22f70..5f2dacdfb 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -8,6 +8,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.testng.annotations.BeforeSuite; +import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; @@ -55,7 +56,10 @@ public class VariantContextUnitTest extends BaseTest { ATC = Allele.create("ATC"); ATCref = Allele.create("ATC", true); + } + @BeforeTest + public void beforeTest() { basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC)).referenceBaseForIndel((byte)'A'); @@ -75,16 +79,16 @@ public class VariantContextUnitTest extends BaseTest { // test REF List alleles = Arrays.asList(Tref); - VariantContext vc = snpBuilder.alleles(alleles).make(); + VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION); // test SNPs alleles = Arrays.asList(Tref, A); - vc = snpBuilder.alleles(alleles).make(); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); alleles = Arrays.asList(Tref, A, C); - vc = snpBuilder.alleles(alleles).make(); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); // test MNPs @@ -98,7 +102,7 @@ public class VariantContextUnitTest extends BaseTest { // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = basicBuilder.alleles(alleles).make(); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); @@ -106,7 +110,7 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = basicBuilder.alleles(alleles).make(); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); @@ -131,12 +135,12 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = basicBuilder.alleles(alleles).make(); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -280,50 +284,50 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertEquals(vc.getGenotype("foo").getType(), Genotype.Type.MIXED); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgs1() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgs2() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgs3() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Throwable.class) public void testBadConstructorArgs4() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.emptyList()).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgsDuplicateAlleles1() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgsDuplicateAlleles2() { new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make(); } - @Test (expectedExceptions = IllegalStateException.class) + @Test (expectedExceptions = Throwable.class) public void testBadLoc1() { List alleles = Arrays.asList(Aref, T, del); new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Throwable.class) public void testBadID1() { new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(null).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadID2() { - new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(""); + new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make(); } @Test @@ -557,7 +561,7 @@ public class VariantContextUnitTest extends BaseTest { @Test(dataProvider = "getAlleles") public void testMergeAlleles(GetAllelesTest cfg) { final List altAlleles = cfg.alleles.subList(1, cfg.alleles.size()); - final VariantContext vc = snpBuilder.alleles(cfg.alleles).make(); + final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).referenceBaseForIndel((byte)'A').make(); Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles"); Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size"); diff --git a/public/java/src/net/sf/samtools/GATKBinList.java b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala similarity index 57% rename from public/java/src/net/sf/samtools/GATKBinList.java rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala index b53062aaf..d90db0de4 100644 --- a/public/java/src/net/sf/samtools/GATKBinList.java +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala @@ -22,30 +22,27 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package net.sf.samtools; +package org.broadinstitute.sting.queue.extensions.gatk -import java.util.BitSet; +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Output, Argument, Input} +import java.io.File +import org.broadinstitute.sting.utils.interval.IntervalUtils -/** - * A temporary solution to work around Java access rights issues: - * override chunk and make it public. - * TODO: Eliminate once we determine the final fate of the BAM index reading code. - */ -public class GATKBinList extends BinList { - /** - * Create a new BinList over sequenceCount sequences, consisting of the given bins. - * @param referenceSequence Reference sequence to which these bins are relevant. - * @param bins The given bins to include. - */ - public GATKBinList(final int referenceSequence, final BitSet bins) { - super(referenceSequence,bins); - } +class WriteFlankingIntervalsFunction extends InProcessFunction { + @Input(doc="The reference sequence") + var reference : File = _ - /** - * Retrieves the bins stored in this list. - * @return A bitset where a bin is present in the list if the bit is true. - */ - public BitSet getBins() { - return super.getBins(); - } + @Input(doc="The interval list to flank") + var inputIntervals : File = _ + + @Output(doc="The output intervals file to write to") + var outputIntervals: File = _ + + @Argument(doc="Number of base pair to flank the input intervals") + var flankSize : Int = _ + + def run() { + IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize) + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala deleted file mode 100755 index 77eb3ccbc..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import collection.immutable.TreeSet - -// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { - @Input(doc="The interval list to expand") val inList : File = in - @Input(doc="The reference sequence") val refDict : File = ref - @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start - @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size - @Output(doc="The output intervals file to write to") val outList : File = out - @Argument(doc="The output format for the intervals") val outTypeStr = opType - @Argument(doc="The input format for the intervals") val inTypeStr = ipType - - var output : PrintStream = _ - var parser : GenomeLocParser = _ - var xrl : XReadLines = _ - val outType = IntervalFormatType.convert(outTypeStr) - val inType = IntervalFormatType.convert(inTypeStr) - - var offsetIn : Int = 0 - var offsetOut : Int = 0 - - var first : Boolean = true - var lastTwo : (GenomeLoc,GenomeLoc) = _ - - var intervalCache : TreeSet[GenomeLoc] = _ - val LINES_TO_CACHE : Int = 1000 - - def run = { - output = new PrintStream(outList) - intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{ - def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) } - }) - parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - xrl = new XReadLines(inList) - offsetIn = if (isBed(inType)) 1 else 0 - offsetOut = if( isBed(outType)) 1 else 0 - var line : String = xrl.next - while ( line.startsWith("@") ) { - line = xrl.next - } - var prevLoc: GenomeLoc = null - var curLoc: GenomeLoc = null - var nextLoc : GenomeLoc = parseGenomeInterval(line) - var linesProcessed : Int = 1 - while ( prevLoc != null || curLoc != null || nextLoc != null ) { - prevLoc = curLoc - curLoc = nextLoc - nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null - if ( curLoc != null ) { - val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc) - val right: GenomeLoc = refine(expandRight(curLoc),nextLoc) - if ( left != null ) { - intervalCache += left - } - if ( right != null ) { - intervalCache += right - } - } - linesProcessed += 1 - if ( linesProcessed % LINES_TO_CACHE == 0 ) { - val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt)) - intervalCache = intervalCache -- toPrint - toPrint.foreach(u => output.print("%s%n".format(repr(u)))) - } - //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc))) - } - - intervalCache.foreach(u => output.print("%s%n".format(repr(u)))) - - output.close() - } - - def expandLeft(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt) - } - - def expandRight(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt) - } - - def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = { - if ( borderG == null || ! newG.overlapsP(borderG) ) { - return newG - } else { - if ( newG.getStart < borderG.getStart ) { - if ( borderG.getStart - startInt > newG.getStart ) { - return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt) - } - } else { - if ( borderG.getStop + startInt < newG.getStop ){ - return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop) - } - } - } - - null - } - - def repr(loc : GenomeLoc) : String = { - if ( loc == null ) return "null" - if ( outType == IntervalFormatType.INTERVALS ) { - return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut) - } - } - - def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { - t == IntervalFormatType.BED - } - - def parseGenomeInterval( s : String ) : GenomeLoc = { - val sp = s.split("\\s+") - // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) - } - - object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { - type IntervalFormatType = Value - val INTERVALS,BED,TDF = Value - - def convert(s : String) : IntervalFormatType = { - if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} - } - } -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala deleted file mode 100755 index e929477a1..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ /dev/null @@ -1,70 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import collection.JavaConversions._ -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} -import org.broadinstitute.sting.utils.text.XReadLines -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} - -class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { - @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals - @Output(doc="Output interval file to which to write") val output : File = outFile - @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false - @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - - - var outStream : PrintStream = _ - var contigs : List[String] = Nil - var dict : SAMSequenceDictionary = _ - var parser : GenomeLocParser = _ - - def run = { - outStream = new PrintStream(output) - dict = new SAMSequenceDictionary - // note: memory hog - val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex - sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) - parser = new GenomeLocParser(dict) - sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) - outStream.close() - } - - def writeOut(g : GenomeLoc) : Unit = { - outStream.print("%s%n".format(g.toString)) - } - - def parse(s : String) : (String,Int,Int) = { - if ( s.contains(":") ) { - val split1 = s.split(":") - val split2 = split1(1).split("-") - return (split1(0),split2(0).toInt,split2(1).toInt) - } else { - val split = s.split("\\s+") - return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) ) - } - } - - def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { - parser.createGenomeLoc(coords._1,coords._2,coords._3) - } - - def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { - if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { - a.init :+ (a.last :+ b) - } else { - a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b) - } - } - - def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { - if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for - null - } - u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) - } - - def range(a: Int, b: Int) : Range = new Range(a,b+1,1) - -} \ No newline at end of file diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar old mode 100755 new mode 100644 similarity index 88% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar index bfd06f97f..ee5d02367 Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml similarity index 77% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml index f0568def4..5417641d3 100644 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml +++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml @@ -1,3 +1,3 @@ - +