diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java deleted file mode 100644 index 5005b6265..000000000 --- a/public/java/src/net/sf/samtools/BAMFileReader.java +++ /dev/null @@ -1,762 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package net.sf.samtools; - - -import net.sf.samtools.util.*; -import net.sf.samtools.SAMFileReader.ValidationStringency; - -import java.io.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Internal class for reading and querying BAM files. 
- */ -class BAMFileReader extends SAMFileReader.ReaderImplementation { - // True if reading from a File rather than an InputStream - private boolean mIsSeekable = false; - - // For converting bytes into other primitive types - private BinaryCodec mStream = null; - - // Underlying compressed data stream. - private final BAMInputStream mInputStream; - private SAMFileHeader mFileHeader = null; - - // Populated if the file is seekable and an index exists - private File mIndexFile; - private BAMIndex mIndex = null; - private long mFirstRecordPointer = 0; - private CloseableIterator mCurrentIterator = null; - - // If true, all SAMRecords are fully decoded as they are read. - private final boolean eagerDecode; - - // For error-checking. - private ValidationStringency mValidationStringency; - - // For creating BAMRecords - private SAMRecordFactory samRecordFactory; - - /** - * Use the caching index reader implementation rather than the disk-hit-per-file model. - */ - private boolean mEnableIndexCaching = false; - - /** - * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. - */ - private boolean mEnableIndexMemoryMapping = true; - - /** - * Add information about the origin (reader and position) to SAM records. - */ - private SAMFileReader mFileReader = null; - - /** - * Prepare to read BAM from a stream (not seekable) - * @param stream source of bytes. - * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. - * @param validationStringency Controls how to handle invalidate reads or header lines. - */ - BAMFileReader(final InputStream stream, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - mIndexFile = indexFile; - mIsSeekable = false; - mInputStream = stream instanceof BAMInputStream ? 
(BAMInputStream)stream : new BlockCompressedInputStream(stream); - mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); - this.eagerDecode = eagerDecode; - this.mValidationStringency = validationStringency; - this.samRecordFactory = factory; - readHeader(null); - } - - /** - * Prepare to read BAM from a file (seekable) - * @param file source of bytes. - * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. - * @param validationStringency Controls how to handle invalidate reads or header lines. - */ - BAMFileReader(final File file, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); - if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { - System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + - " is older than BAM " + file.getAbsolutePath()); - } - } - - BAMFileReader(final SeekableStream strm, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - this(strm instanceof BAMInputStream ? 
(BAMInputStream)strm : new BlockCompressedInputStream(strm), - indexFile, - eagerDecode, - strm.getSource(), - validationStringency, - factory); - } - - private BAMFileReader(final BAMInputStream inputStream, - final File indexFile, - final boolean eagerDecode, - final String source, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - mIndexFile = indexFile; - mIsSeekable = true; - mInputStream = inputStream; - mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); - this.eagerDecode = eagerDecode; - this.mValidationStringency = validationStringency; - this.samRecordFactory = factory; - readHeader(source); - mFirstRecordPointer = inputStream.getFilePointer(); - } - - /** - * If true, writes the source of every read into the source SAMRecords. - * @param enabled true to write source information into each SAMRecord. - */ - void enableFileSource(final SAMFileReader reader, final boolean enabled) { - this.mFileReader = enabled ? reader : null; - } - - /** - * If true, uses the caching version of the index reader. - * @param enabled true to write source information into each SAMRecord. - */ - public void enableIndexCaching(final boolean enabled) { - if(mIndex != null) - throw new SAMException("Unable to turn on index caching; index file has already been loaded."); - this.mEnableIndexCaching = enabled; - } - - /** - * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). - * This is slower but more scalable when accessing large numbers of BAM files sequentially. - * @param enabled True to use memory mapping, false to use regular I/O. 
- */ - public void enableIndexMemoryMapping(final boolean enabled) { - if (mIndex != null) { - throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); - } - this.mEnableIndexMemoryMapping = enabled; - } - - @Override void enableCrcChecking(final boolean enabled) { - this.mInputStream.setCheckCrcs(enabled); - } - - @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } - - /** - * @return true if ths is a BAM file, and has an index - */ - public boolean hasIndex() { - return (mIndexFile != null); - } - - /** - * Retrieves the index for the given file type. Ensure that the index is of the specified type. - * @return An index of the given type. - */ - public BAMIndex getIndex() { - if(mIndexFile == null) - throw new SAMException("No index is available for this BAM file."); - if(mIndex == null) - mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) - : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); - return mIndex; - } - - void close() { - if (mStream != null) { - mStream.close(); - } - if (mIndex != null) { - mIndex.close(); - } - mStream = null; - mFileHeader = null; - mIndex = null; - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - /** - * Set error-checking level for subsequent SAMRecord reads. - */ - void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { - this.mValidationStringency = validationStringency; - } - - SAMFileReader.ValidationStringency getValidationStringency() { - return this.mValidationStringency; - } - - /** - * Prepare to iterate through the SAMRecords in file order. - * Only a single iterator on a BAM file can be extant at a time. 
If getIterator() or a query method has been called once, - * that iterator must be closed before getIterator() can be called again. - * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to - * getIterator() begins its iteration where the last one left off. That is the best that can be - * done in that situation. - */ - CloseableIterator getIterator() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIsSeekable) { - try { - mInputStream.seek(mFirstRecordPointer); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - mCurrentIterator = new BAMFileIterator(); - return mCurrentIterator; - } - - @Override - CloseableIterator getIterator(final SAMFileSpan chunks) { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!(chunks instanceof BAMFileSpan)) { - throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); - } - - // Create an iterator over the given chunk boundaries. - mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); - return mCurrentIterator; - } - - /** - * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know - * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, - * the rightmost bound is guaranteed to be after the last read in the file. - * @return An unbounded pointer to the first record in the BAM file. 
- */ - @Override - SAMFileSpan getFilePointerSpanningReads() { - return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); - } - - /** - * Prepare to iterate through the SAMRecords that match the given interval. - * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed - * before calling any of the methods that return an iterator. - * - * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting - * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate - * matches the specified interval. - * - * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect - * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. - * - * @param sequence Reference sequence sought. - * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. - * A value of zero implies the start of the reference sequence. - * @param end A value of zero implies the end of the reference sequence. - * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval - * specified by start and end. If false, the SAMRecords need only overlap the interval. - * @return Iterator for the matching SAMRecords - */ - CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!mIsSeekable) { - throw new UnsupportedOperationException("Cannot query stream-based BAM file"); - } - mCurrentIterator = createIndexIterator(sequence, start, end, contained? 
QueryType.CONTAINED: QueryType.OVERLAPPING); - return mCurrentIterator; - } - - /** - * Prepare to iterate through the SAMRecords with the given alignment start. - * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed - * before calling any of the methods that return an iterator. - * - * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting - * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate - * matches the specified interval. - * - * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect - * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. - * - * @param sequence Reference sequence sought. - * @param start Alignment start sought. - * @return Iterator for the matching SAMRecords. - */ - CloseableIterator queryAlignmentStart(final String sequence, final int start) { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!mIsSeekable) { - throw new UnsupportedOperationException("Cannot query stream-based BAM file"); - } - mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); - return mCurrentIterator; - } - - public CloseableIterator queryUnmapped() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!mIsSeekable) { - throw new UnsupportedOperationException("Cannot query stream-based BAM file"); - } - try { - final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); - if (startOfLastLinearBin != -1) { - mInputStream.seek(startOfLastLinearBin); - } else { - // No mapped reads in file, just start 
at the first read in file. - mInputStream.seek(mFirstRecordPointer); - } - mCurrentIterator = new BAMFileIndexUnmappedIterator(); - return mCurrentIterator; - } catch (IOException e) { - throw new RuntimeException("IOException seeking to unmapped reads", e); - } - } - - /** - * Reads the header from the file or stream - * @param source Note that this is used only for reporting errors. - */ - private void readHeader(final String source) - throws IOException { - - final byte[] buffer = new byte[4]; - mStream.readBytes(buffer); - if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { - throw new IOException("Invalid BAM file header"); - } - - final int headerTextLength = mStream.readInt(); - final String textHeader = mStream.readString(headerTextLength); - final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); - headerCodec.setValidationStringency(mValidationStringency); - mFileHeader = headerCodec.decode(new StringLineReader(textHeader), - source); - - final int sequenceCount = mStream.readInt(); - if (mFileHeader.getSequenceDictionary().size() > 0) { - // It is allowed to have binary sequences but no text sequences, so only validate if both are present - if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { - throw new SAMFormatException("Number of sequences in text header (" + - mFileHeader.getSequenceDictionary().size() + - ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); - } - for (int i = 0; i < sequenceCount; i++) { - final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); - final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); - if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { - throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + - source); - } - if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { - throw new SAMFormatException("For 
sequence " + i + ", text and binary have different lengths in file " + - source); - } - } - } else { - // If only binary sequences are present, copy them into mFileHeader - final List sequences = new ArrayList(sequenceCount); - for (int i = 0; i < sequenceCount; i++) { - sequences.add(readSequenceRecord(source)); - } - mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); - } - } - - /** - * Reads a single binary sequence record from the file or stream - * @param source Note that this is used only for reporting errors. - */ - private SAMSequenceRecord readSequenceRecord(final String source) { - final int nameLength = mStream.readInt(); - if (nameLength <= 1) { - throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); - } - final String sequenceName = mStream.readString(nameLength - 1); - // Skip the null terminator - mStream.readByte(); - final int sequenceLength = mStream.readInt(); - return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); - } - - /** - * Iterator for non-indexed sequential iteration through all SAMRecords in file. - * Starting point of iteration is wherever current file position is when the iterator is constructed. 
- */ - private class BAMFileIterator implements CloseableIterator { - private SAMRecord mNextRecord = null; - private final BAMRecordCodec bamRecordCodec; - private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file - - BAMFileIterator() { - this(true); - } - - /** - * @param advance Trick to enable subclass to do more setup before advancing - */ - BAMFileIterator(final boolean advance) { - this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); - this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); - - if (advance) { - advance(); - } - } - - public void close() { - if (mCurrentIterator != null && this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - public boolean hasNext() { - return (mNextRecord != null); - } - - public SAMRecord next() { - final SAMRecord result = mNextRecord; - advance(); - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - void advance() { - try { - mNextRecord = getNextRecord(); - - if (mNextRecord != null) { - ++this.samRecordIndex; - // Because some decoding is done lazily, the record needs to remember the validation stringency. - mNextRecord.setValidationStringency(mValidationStringency); - - if (mValidationStringency != ValidationStringency.SILENT) { - final List validationErrors = mNextRecord.isValid(); - SAMUtils.processValidationErrors(validationErrors, - this.samRecordIndex, BAMFileReader.this.getValidationStringency()); - } - } - if (eagerDecode && mNextRecord != null) { - mNextRecord.eagerDecode(); - } - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - /** - * Read the next record from the input stream. 
- */ - SAMRecord getNextRecord() throws IOException { - final long startCoordinate = mInputStream.getFilePointer(); - final SAMRecord next = bamRecordCodec.decode(); - final long stopCoordinate = mInputStream.getFilePointer(); - - if(mFileReader != null && next != null) - next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); - - return next; - } - - /** - * @return The record that will be return by the next call to next() - */ - protected SAMRecord peek() { - return mNextRecord; - } - } - - /** - * Prepare to iterate through SAMRecords matching the target interval. - * @param sequence Desired reference sequence. - * @param start 1-based start of target interval, inclusive. - * @param end 1-based end of target interval, inclusive. - * @param queryType contained, overlapping, or starting-at query. - */ - private CloseableIterator createIndexIterator(final String sequence, - final int start, - final int end, - final QueryType queryType) { - long[] filePointers = null; - - // Hit the index to determine the chunk boundaries for the required data. - final SAMFileHeader fileHeader = getFileHeader(); - final int referenceIndex = fileHeader.getSequenceIndex(sequence); - if (referenceIndex != -1) { - final BAMIndex fileIndex = getIndex(); - final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end); - filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; - } - - // Create an iterator over the above chunk boundaries. - final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); - - // Add some preprocessing filters for edge-case reads that don't fit into this - // query type. - return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); - } - - enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} - - /** - * Look for BAM index file according to standard naming convention. - * - * @param dataFile BAM file name. 
- * @return Index file name, or null if not found. - */ - private static File findIndexFile(final File dataFile) { - // If input is foo.bam, look for foo.bai - final String bamExtension = ".bam"; - File indexFile; - final String fileName = dataFile.getName(); - if (fileName.endsWith(bamExtension)) { - final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix; - indexFile = new File(dataFile.getParent(), bai); - if (indexFile.exists()) { - return indexFile; - } - } - - // If foo.bai doesn't exist look for foo.bam.bai - indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); - if (indexFile.exists()) { - return indexFile; - } else { - return null; - } - } - - private class BAMFileIndexIterator extends BAMFileIterator { - - private long[] mFilePointers = null; - private int mFilePointerIndex = 0; - private long mFilePointerLimit = -1; - - /** - * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. - * @param filePointers the block / offset combination, stored in chunk format. - */ - BAMFileIndexIterator(final long[] filePointers) { - super(false); // delay advance() until after construction - mFilePointers = filePointers; - advance(); - } - - SAMRecord getNextRecord() - throws IOException { - // Advance to next file block if necessary - while (mInputStream.getFilePointer() >= mFilePointerLimit) { - if (mFilePointers == null || - mFilePointerIndex >= mFilePointers.length) { - return null; - } - final long startOffset = mFilePointers[mFilePointerIndex++]; - final long endOffset = mFilePointers[mFilePointerIndex++]; - mInputStream.seek(startOffset); - mFilePointerLimit = endOffset; - } - // Pull next record from stream - return super.getNextRecord(); - } - } - - /** - * A decorating iterator that filters out records that are outside the bounds of the - * given query parameters. 
- */ - private class BAMQueryFilteringIterator implements CloseableIterator { - /** - * The wrapped iterator. - */ - private final CloseableIterator wrappedIterator; - - /** - * The next record to be returned. Will be null if no such record exists. - */ - private SAMRecord mNextRecord; - - private final int mReferenceIndex; - private final int mRegionStart; - private final int mRegionEnd; - private final QueryType mQueryType; - - public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { - this.wrappedIterator = iterator; - final SAMFileHeader fileHeader = getFileHeader(); - mReferenceIndex = fileHeader.getSequenceIndex(sequence); - mRegionStart = start; - if (queryType == QueryType.STARTING_AT) { - mRegionEnd = mRegionStart; - } else { - mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; - } - mQueryType = queryType; - mNextRecord = advance(); - } - - /** - * Returns true if a next element exists; false otherwise. - */ - public boolean hasNext() { - return mNextRecord != null; - } - - /** - * Gets the next record from the given iterator. - * @return The next SAM record in the iterator. - */ - public SAMRecord next() { - if(!hasNext()) - throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); - final SAMRecord currentRead = mNextRecord; - mNextRecord = advance(); - return currentRead; - } - - /** - * Closes down the existing iterator. - */ - public void close() { - if (this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - /** - * @throws UnsupportedOperationException always. 
- */ - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - SAMRecord advance() { - while (true) { - // Pull next record from stream - if(!wrappedIterator.hasNext()) - return null; - - final SAMRecord record = wrappedIterator.next(); - // If beyond the end of this reference sequence, end iteration - final int referenceIndex = record.getReferenceIndex(); - if (referenceIndex != mReferenceIndex) { - if (referenceIndex < 0 || - referenceIndex > mReferenceIndex) { - return null; - } - // If before this reference sequence, continue - continue; - } - if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { - // Quick exit to avoid expensive alignment end calculation - return record; - } - final int alignmentStart = record.getAlignmentStart(); - // If read is unmapped but has a coordinate, return it if the coordinate is within - // the query region, regardless of whether the mapped mate will be returned. - final int alignmentEnd; - if (mQueryType == QueryType.STARTING_AT) { - alignmentEnd = -1; - } else { - alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? 
- record.getAlignmentEnd(): alignmentStart); - } - - if (alignmentStart > mRegionEnd) { - // If scanned beyond target region, end iteration - return null; - } - // Filter for overlap with region - if (mQueryType == QueryType.CONTAINED) { - if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { - return record; - } - } else if (mQueryType == QueryType.OVERLAPPING) { - if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { - return record; - } - } else { - if (alignmentStart == mRegionStart) { - return record; - } - } - } - } - } - - private class BAMFileIndexUnmappedIterator extends BAMFileIterator { - private BAMFileIndexUnmappedIterator() { - while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - advance(); - } - } - } - -} diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index 5d349e72e..e9335a86d 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -40,6 +40,10 @@ public class GATKChunk extends Chunk { super(start,stop); } + public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) { + super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd); + } + public GATKChunk(final Chunk chunk) { super(chunk.getChunkStart(),chunk.getChunkEnd()); } diff --git a/public/java/src/net/sf/samtools/PicardNamespaceUtils.java b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java new file mode 100644 index 000000000..b645f8fdc --- /dev/null +++ b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights 
to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package net.sf.samtools; + +/** + * Utils that insist on being in the same package as Picard. + */ +public class PicardNamespaceUtils { + /** + * Private constructor only. Do not instantiate. 
+ */ + private PicardNamespaceUtils() {} + + public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) { + read.setFileSource(fileSource); + } +} diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java deleted file mode 100644 index d825c23d5..000000000 --- a/public/java/src/net/sf/samtools/util/BAMInputStream.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package net.sf.samtools.util; - -import java.io.IOException; - -/** - * An input stream formulated for use reading BAM files. Supports - */ -public interface BAMInputStream { - /** - * Seek to the given position in the file. Note that pos is a special virtual file pointer, - * not an actual byte offset. 
- * - * @param pos virtual file pointer - */ - public void seek(final long pos) throws IOException; - - /** - * @return virtual file pointer that can be passed to seek() to return to the current position. This is - * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between - * the two. - */ - public long getFilePointer(); - - /** - * Determines whether or not the inflater will re-calculated the CRC on the decompressed data - * and check it against the value stored in the GZIP header. CRC checking is an expensive - * operation and should be used accordingly. - */ - public void setCheckCrcs(final boolean check); - - public int read() throws java.io.IOException; - - public int read(byte[] bytes) throws java.io.IOException; - - public int read(byte[] bytes, int i, int i1) throws java.io.IOException; - - public long skip(long l) throws java.io.IOException; - - public int available() throws java.io.IOException; - - public void close() throws java.io.IOException; - - public void mark(int i); - - public void reset() throws java.io.IOException; - - public boolean markSupported(); -} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java deleted file mode 100755 index fae2fc89b..000000000 --- a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java +++ /dev/null @@ -1,483 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The 
above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package net.sf.samtools.util; - - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.RandomAccessFile; -import java.net.URL; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; - -import net.sf.samtools.FileTruncatedException; - -/* - * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. - * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. - * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the - * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. - * - * c.f. 
http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format - */ -public class BlockCompressedInputStream extends InputStream implements BAMInputStream { - private InputStream mStream = null; - private SeekableStream mFile = null; - private byte[] mFileBuffer = null; - private byte[] mCurrentBlock = null; - private int mCurrentOffset = 0; - private long mBlockAddress = 0; - private int mLastBlockLength = 0; - private final BlockGunzipper blockGunzipper = new BlockGunzipper(); - - - /** - * Note that seek() is not supported if this ctor is used. - */ - public BlockCompressedInputStream(final InputStream stream) { - mStream = IOUtil.toBufferedStream(stream); - mFile = null; - } - - /** - * Use this ctor if you wish to call seek() - */ - public BlockCompressedInputStream(final File file) - throws IOException { - mFile = new SeekableFileStream(file); - mStream = null; - - } - - public BlockCompressedInputStream(final URL url) { - mFile = new SeekableBufferedStream(new SeekableHTTPStream(url)); - mStream = null; - } - - /** - * For providing some arbitrary data source. No additional buffering is - * provided, so if the underlying source is not buffered, wrap it in a - * SeekableBufferedStream before passing to this ctor. - */ - public BlockCompressedInputStream(final SeekableStream strm) { - mFile = strm; - mStream = null; - } - - /** - * Determines whether or not the inflater will re-calculated the CRC on the decompressed data - * and check it against the value stored in the GZIP header. CRC checking is an expensive - * operation and should be used accordingly. - */ - public void setCheckCrcs(final boolean check) { - this.blockGunzipper.setCheckCrcs(check); - } - - /** - * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the - * next caller of a method for this input stream. The next caller might be the same thread or another thread. 
- * Note that although the next caller can read this many bytes without blocking, the available() method call itself - * may block in order to fill an internal buffer if it has been exhausted. - */ - public int available() - throws IOException { - if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { - readBlock(); - } - if (mCurrentBlock == null) { - return 0; - } - return mCurrentBlock.length - mCurrentOffset; - } - - /** - * Closes the underlying InputStream or RandomAccessFile - */ - public void close() - throws IOException { - if (mFile != null) { - mFile.close(); - mFile = null; - } else if (mStream != null) { - mStream.close(); - mStream = null; - } - // Encourage garbage collection - mFileBuffer = null; - mCurrentBlock = null; - } - - /** - * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255. - * If no byte is available because the end of the stream has been reached, the value -1 is returned. - * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown. - - * @return the next byte of data, or -1 if the end of the stream is reached. - */ - public int read() - throws IOException { - return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1; - } - - /** - * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes - * actually read is returned as an integer. This method blocks until input data is available, end of file is detected, - * or an exception is thrown. - * - * read(buf) has the same effect as read(buf, 0, buf.length). - * - * @param buffer the buffer into which the data is read. - * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of - * the stream has been reached. 
- */ - public int read(final byte[] buffer) - throws IOException { - return read(buffer, 0, buffer.length); - } - - private volatile ByteArrayOutputStream buf = null; - private static final byte eol = '\n'; - private static final byte eolCr = '\r'; - - /** - * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'), - * carriage return ('\r') or carriage return followed by a line feed ("\r\n"). - * - * @return A String containing the contents of the line, excluding the line terminating - * character, or null if the end of the stream has been reached - * - * @exception IOException If an I/O error occurs - * @ - */ - public String readLine() throws IOException { - int available = available(); - if (available == 0) { - return null; - } - if(null == buf){ // lazy initialisation - buf = new ByteArrayOutputStream(8192); - } - buf.reset(); - boolean done = false; - boolean foundCr = false; // \r found flag - while (!done) { - int linetmpPos = mCurrentOffset; - int bCnt = 0; - while((available-- > 0)){ - final byte c = mCurrentBlock[linetmpPos++]; - if(c == eol){ // found \n - done = true; - break; - } else if(foundCr){ // previous char was \r - --linetmpPos; // current char is not \n so put it back - done = true; - break; - } else if(c == eolCr){ // found \r - foundCr = true; - continue; // no ++bCnt - } - ++bCnt; - } - if(mCurrentOffset < linetmpPos){ - buf.write(mCurrentBlock, mCurrentOffset, bCnt); - mCurrentOffset = linetmpPos; - } - available = available(); - if(available == 0){ - // EOF - done = true; - } - } - return buf.toString(); - } - - /** - * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read - * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer. - * - * This method blocks until input data is available, end of file is detected, or an exception is thrown. - * - * @param buffer buffer into which data is read. 
- * @param offset the start offset in array b at which the data is written. - * @param length the maximum number of bytes to read. - * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of - * the stream has been reached. - */ - public int read(final byte[] buffer, int offset, int length) - throws IOException { - final int originalLength = length; - while (length > 0) { - final int available = available(); - if (available == 0) { - // Signal EOF to caller - if (originalLength == length) { - return -1; - } - break; - } - final int copyLength = Math.min(length, available); - System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); - mCurrentOffset += copyLength; - offset += copyLength; - length -= copyLength; - } - return originalLength - length; - } - - /** - * Seek to the given position in the file. Note that pos is a special virtual file pointer, - * not an actual byte offset. - * - * @param pos virtual file pointer - */ - public void seek(final long pos) - throws IOException { - if (mFile == null) { - throw new IOException("Cannot seek on stream based file"); - } - // Decode virtual file pointer - // Upper 48 bits is the byte offset into the compressed stream of a block. - // Lower 16 bits is the byte offset into the uncompressed stream inside the block. 
- final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); - final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); - final int available; - if (mBlockAddress == compressedOffset && mCurrentBlock != null) { - available = mCurrentBlock.length; - } else { - mFile.seek(compressedOffset); - mBlockAddress = compressedOffset; - mLastBlockLength = 0; - readBlock(); - available = available(); - } - if (uncompressedOffset > available || - (uncompressedOffset == available && !eof())) { - throw new IOException("Invalid file pointer: " + pos); - } - mCurrentOffset = uncompressedOffset; - } - - private boolean eof() throws IOException { - if (mFile.eof()) { - return true; - } - // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. - return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); - } - - /** - * @return virtual file pointer that can be passed to seek() to return to the current position. This is - * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between - * the two. - */ - public long getFilePointer() { - if (mCurrentOffset == mCurrentBlock.length) { - // If current offset is at the end of the current block, file pointer should point - // to the beginning of the next block. - return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); - } - return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); - } - - public static long getFileBlock(final long bgzfOffset) { - return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); - } - - /** - * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). - * @return true if the given file looks like a valid BGZF file. 
- */ - public static boolean isValidFile(final InputStream stream) - throws IOException { - if (!stream.markSupported()) { - throw new RuntimeException("Cannot test non-buffered stream"); - } - stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; - final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - stream.reset(); - return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); - } - - private static boolean isValidBlockHeader(final byte[] buffer) { - return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && - (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && - (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && - buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && - buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && - buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); - } - - private void readBlock() - throws IOException { - - if (mFileBuffer == null) { - mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; - } - int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - if (count == 0) { - // Handle case where there is no empty gzip block at end. 
- mCurrentOffset = 0; - mBlockAddress += mLastBlockLength; - mCurrentBlock = new byte[0]; - return; - } - if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { - throw new IOException("Premature end of file"); - } - final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; - if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { - throw new IOException("Unexpected compressed block length: " + blockLength); - } - final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; - count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); - if (count != remaining) { - throw new FileTruncatedException("Premature end of file"); - } - inflateBlock(mFileBuffer, blockLength); - mCurrentOffset = 0; - mBlockAddress += mLastBlockLength; - mLastBlockLength = blockLength; - } - - private void inflateBlock(final byte[] compressedBlock, final int compressedLength) - throws IOException { - final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); - byte[] buffer = mCurrentBlock; - mCurrentBlock = null; - if (buffer == null || buffer.length != uncompressedLength) { - try { - buffer = new byte[uncompressedLength]; - } catch (NegativeArraySizeException e) { - throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); - } - } - blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); - mCurrentBlock = buffer; - } - - private int readBytes(final byte[] buffer, final int offset, final int length) - throws IOException { - if (mFile != null) { - return readBytes(mFile, buffer, offset, length); - } else if (mStream != null) { - return readBytes(mStream, buffer, offset, length); - } else { - return 0; - } - } - - private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) - throws IOException { 
- int bytesRead = 0; - while (bytesRead < length) { - final int count = file.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) - throws IOException { - int bytesRead = 0; - while (bytesRead < length) { - final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private int unpackInt16(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8)); - } - - private int unpackInt32(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8) | - ((buffer[offset+2] & 0xFF) << 16) | - ((buffer[offset+3] & 0xFF) << 24)); - } - - public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} - - public static FileTermination checkTermination(final File file) - throws IOException { - final long fileSize = file.length(); - if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { - return FileTermination.DEFECTIVE; - } - final RandomAccessFile raFile = new RandomAccessFile(file, "r"); - try { - raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); - byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; - raFile.readFully(buf); - if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { - return FileTermination.HAS_TERMINATOR_BLOCK; - } - final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); - buf = new byte[bufsize]; - raFile.seek(fileSize - bufsize); - raFile.read(buf); - for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; - i >= 0; --i) { - if 
(!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE, - buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) { - continue; - } - final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4); - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF; - if (buf.length - i == totalBlockSizeMinusOne + 1) { - return FileTermination.HAS_HEALTHY_LAST_BLOCK; - } else { - return FileTermination.DEFECTIVE; - } - } - return FileTermination.DEFECTIVE; - } finally { - raFile.close(); - } - } - - private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) { - for (int i = 0; i < length; ++i) { - if (preamble[i] != buf[i + startOffset]) { - return false; - } - } - return true; - } -} - - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index f9ed0cb74..a3ce6dd27 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -25,9 +25,14 @@ import java.util.NoSuchElementException; */ /** - * A queue of locus context entries. + * The two goals of the LocusView are as follows: + * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch + * between iterating over all bases in a region, only covered bases in a region covered by + * reads, only bases in a region covered by RODs, or any other sort of trigger track + * implementation one can think of. + * 2) To manage the copious number of iterators that have to be jointly pulled through the + * genome to make a locus traversal function. */ - public abstract class LocusView extends LocusIterator implements View { /** * The locus bounding this view. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java similarity index 58% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java index 0a6173c1e..164971365 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java @@ -27,8 +27,10 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.LinkedList; import java.util.List; /** @@ -38,7 +40,7 @@ import java.util.List; * Time: 10:47 PM * To change this template use File | Settings | File Templates. */ -class SAMReaderPosition { +class BAMAccessPlan { private final SAMReaderID reader; private final BlockInputStream inputStream; @@ -51,7 +53,7 @@ class SAMReaderPosition { private long nextBlockAddress; - SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { this.reader = reader; this.inputStream = inputStream; @@ -84,11 +86,45 @@ class SAMReaderPosition { } /** - * Retrieves the last offset of interest in the block returned by getBlockAddress(). - * @return First block of interest in this segment. + * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. + * @param blockAddress Block address for which to search. 
+ * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. + * @return list of chunks containing that block. */ - public int getLastOffsetInBlock() { - return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536; + public List getSpansOverlappingBlock(long blockAddress, long filePosition) { + List spansOverlapping = new LinkedList(); + // While the position iterator overlaps the given block, pull out spans to report. + while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { + // Create a span over as much of the block as is covered by this chunk. + int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + + // Calculate the end of this span. If the span extends past this block, cap it using the current file position. + long blockEnd; + int blockOffsetEnd; + if(blockAddress < positionIterator.peek().getBlockEnd()) { + blockEnd = filePosition; + blockOffsetEnd = 0; + } + else { + blockEnd = positionIterator.peek().getBlockEnd(); + blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); + } + + GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); + + if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) + spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); + + // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. + if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) + break; + + // If the position iterator ends before the block ends, pull the position iterator forward. 
+ if(positionIterator.peek().getBlockEnd() <= blockAddress) + positionIterator.next(); + } + + return spansOverlapping; } public void reset() { @@ -111,20 +147,16 @@ class SAMReaderPosition { * @param filePosition The current position within the file. */ void advancePosition(final long filePosition) { - nextBlockAddress = filePosition >> 16; + nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); // Check the current file position against the iterator; if the iterator is before the current file position, // draw the iterator forward. Remember when performing the check that coordinates are half-open! - while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) positionIterator.next(); - // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. - if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) { - nextBlockAddress = positionIterator.peek().getBlockStart(); - //System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress); - break; - } - } + // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) + nextBlockAddress = positionIterator.peek().getBlockStart(); // If we've shot off the end of the block pointer, notify consumers that iteration is complete. 
if(!positionIterator.hasNext()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java index f468d2020..d75e91bf3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java @@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher { private final ExecutorService threadPool; - private final Queue inputQueue; + private final Queue inputQueue; public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { threadPool = Executors.newFixedThreadPool(numThreads); fileHandleCache = new FileHandleCache(numFileHandles); - inputQueue = new LinkedList(); + inputQueue = new LinkedList(); threadPool.execute(new BlockLoader(this,fileHandleCache,true)); } @@ -58,7 +58,7 @@ public class BGZFBlockLoadingDispatcher { * Initiates a request for a new block load. * @param readerPosition Position at which to load. */ - void queueBlockLoad(final SAMReaderPosition readerPosition) { + void queueBlockLoad(final BAMAccessPlan readerPosition) { synchronized(inputQueue) { inputQueue.add(readerPosition); inputQueue.notify(); @@ -69,7 +69,7 @@ public class BGZFBlockLoadingDispatcher { * Claims the next work request from the queue. * @return The next work request, or null if none is available. 
*/ - SAMReaderPosition claimNextWorkRequest() { + BAMAccessPlan claimNextWorkRequest() { synchronized(inputQueue) { while(inputQueue.isEmpty()) { try { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java index cb37bad31..fda5d818c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -26,24 +26,21 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; -import net.sf.samtools.util.BAMInputStream; -import net.sf.samtools.util.BlockCompressedFilePointerUtil; import net.sf.samtools.util.BlockCompressedInputStream; -import net.sf.samtools.util.RuntimeEOFException; -import net.sf.samtools.util.SeekableStream; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; -import java.util.Iterator; import java.util.LinkedList; +import java.util.List; /** * Presents decompressed blocks to the SAMFileReader. */ -public class BlockInputStream extends SeekableStream implements BAMInputStream { +public class BlockInputStream extends InputStream { /** * Mechanism for triggering block loads. */ @@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { private Throwable error; /** - * Current position. + * Current accessPlan. */ - private SAMReaderPosition position; + private BAMAccessPlan accessPlan; /** * A stream of compressed data blocks. 
@@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { */ private final BlockCompressedInputStream validatingInputStream; - /** - * Has the buffer been filled since last request? - */ - private boolean bufferFilled = false; - /** * Create a new block presenting input stream with a dedicated buffer. * @param dispatcher the block loading messenger. @@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { this.dispatcher = dispatcher; // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. - this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to // the point in the file just following the last read. These two arrays should never be empty; initializing @@ -151,7 +143,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { synchronized(lock) { // Find the current block within the input stream. 
int blockIndex; - for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++) + for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) ; filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); } @@ -164,51 +156,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { return filePointer; } - public void seek(long target) { - //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); - synchronized(lock) { - clearBuffers(); - - // Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified. - position.advancePosition(target); - - // If the position advances past the end of the target, that must mean that we seeked to a point at the end - // of one of the chunk list's subregions. Make a note of our current position and punt on loading any data. - if(target < position.getBlockAddress() << 16) { - blockOffsets.clear(); - blockOffsets.add(0); - blockPositions.clear(); - blockPositions.add(target); - } - else { - waitForBufferFill(); - // A buffer fill will load the relevant data from the shard, but the buffer position still needs to be - // advanced as appropriate. 
- Iterator blockOffsetIterator = blockOffsets.descendingIterator(); - Iterator blockPositionIterator = blockPositions.descendingIterator(); - while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) { - final int blockOffset = blockOffsetIterator.next(); - final long blockPosition = blockPositionIterator.next(); - if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) { - buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF)); - break; - } - } - } - - if(validatingInputStream != null) { - try { - validatingInputStream.seek(target); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to validate against Picard input stream",ex); - } - } - } - } - private void clearBuffers() { - this.position.reset(); + this.accessPlan.reset(); // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. // Indicate no data to be read. @@ -225,29 +174,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { public boolean eof() { synchronized(lock) { // TODO: Handle multiple empty BGZF blocks at end of the file. - return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length); + return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); } } - public void setCheckCrcs(final boolean check) { - // TODO: Implement - } - /** - * Submits a new access plan for the given dataset. - * @param position The next seek point for BAM data in this reader. + * Submits a new access plan for the given dataset and seeks to the given point. + * @param accessPlan The next seek point for BAM data in this reader. 
*/ - public void submitAccessPlan(final SAMReaderPosition position) { + public void submitAccessPlan(final BAMAccessPlan accessPlan) { //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); - synchronized(lock) { - // Assume that the access plan is going to tell us to start where we are and move forward. - // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. - if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) - position.advancePosition(this.position.getBlockAddress() << 16); + this.accessPlan = accessPlan; + accessPlan.reset(); + + clearBuffers(); + + // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). + // TODO: Don't pass these empty chunks in. + accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); + + if(accessPlan.getBlockAddress() >= 0) { + waitForBufferFill(); } - this.position = position; + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + private void compactBuffer() { // Compact buffer to maximize storage space. int bytesToRemove = 0; @@ -286,27 +247,14 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { * Push contents of incomingBuffer into the end of this buffer. * MUST be called from a thread that is NOT the reader thread. * @param incomingBuffer The data being pushed into this input stream. - * @param position target position for the data. + * @param accessPlan target access plan for the data. 
* @param filePosition the current position of the file pointer */ - public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { synchronized(lock) { try { - compactBuffer(); - // Open up the buffer for more reading. - buffer.limit(buffer.capacity()); - - // Advance the position to take the most recent read into account. - final long lastBlockAddress = position.getBlockAddress(); - final int blockOffsetStart = position.getFirstOffsetInBlock(); - final int blockOffsetEnd = position.getLastOffsetInBlock(); - - // Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block. - final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16; - - byte[] validBytes = null; if(validatingInputStream != null) { - validBytes = new byte[incomingBuffer.remaining()]; + byte[] validBytes = new byte[incomingBuffer.remaining()]; byte[] currentBytes = new byte[incomingBuffer.remaining()]; int pos = incomingBuffer.position(); @@ -317,7 +265,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { incomingBuffer.position(pos); long currentFilePointer = validatingInputStream.getFilePointer(); - validatingInputStream.seek(lastBlockAddress << 16); + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); validatingInputStream.read(validBytes); validatingInputStream.seek(currentFilePointer); @@ -325,33 +273,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); } - this.position = position; - position.advancePosition(filePosition << 16); + compactBuffer(); + // Open up the 
buffer for more reading. + buffer.limit(buffer.capacity()); - if(buffer.remaining() < incomingBuffer.remaining()) { - //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + // Get the spans overlapping this particular block... + List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); + + // ...and advance the block + this.accessPlan = accessPlan; + accessPlan.advancePosition(makeFilePointer(filePosition, 0)); + + if(buffer.remaining() < incomingBuffer.remaining()) lock.wait(); - //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); + + final int bytesInIncomingBuffer = incomingBuffer.limit(); + + for(GATKChunk spanOverlapping: spansOverlapping) { + // Clear out the endcap tracking state and add in the starting position for this transfer. + blockOffsets.removeLast(); + blockOffsets.add(buffer.position()); + blockPositions.removeLast(); + blockPositions.add(spanOverlapping.getChunkStart()); + + // Stream the buffer into the data stream. + incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); + incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); + buffer.put(incomingBuffer); + + // Add the endcap for this transfer. + blockOffsets.add(buffer.position()); + blockPositions.add(spanOverlapping.getChunkEnd()); } - // Remove the last position in the list and add in the last read position, in case the two are different. 
- blockOffsets.removeLast(); - blockOffsets.add(buffer.position()); - blockPositions.removeLast(); - blockPositions.add(lastBlockAddress << 16 | blockOffsetStart); - - // Stream the buffer into the data stream. - incomingBuffer.position(blockOffsetStart); - incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd)); - buffer.put(incomingBuffer); - - // Then, add the last position read to the very end of the list, just past the end of the last buffer. - blockOffsets.add(buffer.position()); - blockPositions.add(endOfRead); - // Set up the buffer for reading. buffer.flip(); - bufferFilled = true; lock.notify(); } @@ -447,12 +403,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { if(remaining < length) return length - remaining; - // Otherwise, if at eof(), return -1. - else if(eof()) - return -1; - - // Otherwise, we must've hit a bug in the system. - throw new ReviewedStingException("BUG: read returned no data, but eof() reports false."); + // Otherwise, return -1. + return -1; } public void close() { @@ -472,20 +424,26 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { private void waitForBufferFill() { synchronized(lock) { - bufferFilled = false; if(buffer.remaining() == 0 && !eof()) { //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); - dispatcher.queueBlockLoad(position); + dispatcher.queueBlockLoad(accessPlan); try { lock.wait(); } catch(InterruptedException ex) { throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); } - - if(bufferFilled && buffer.remaining() == 0) - throw new RuntimeEOFException("No more data left in InputStream"); } } } + + /** + * Create an encoded BAM file pointer given the address of a BGZF block and an offset. + * @param blockAddress Physical address on disk of a BGZF block. 
+ * @param blockOffset Offset into the uncompressed data stored in the BGZF block. + * @return 64-bit pointer encoded according to the BAM spec. + */ + public static long makeFilePointer(final long blockAddress, final int blockOffset) { + return blockAddress << 16 | blockOffset; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java index ab4299802..81a37e53c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -70,29 +70,29 @@ class BlockLoader implements Runnable { public void run() { for(;;) { - SAMReaderPosition readerPosition = null; + BAMAccessPlan accessPlan = null; try { - readerPosition = dispatcher.claimNextWorkRequest(); - FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + accessPlan = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader()); - long blockAddress = readerPosition.getBlockAddress(); + //long blockAddress = readerPosition.getBlockAddress(); //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); - ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress()); long nextBlockAddress = position(inputStream); - fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + 
fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream); ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock; int bytesCopied = block.remaining(); - BlockInputStream bamInputStream = readerPosition.getInputStream(); - bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + BlockInputStream bamInputStream = accessPlan.getInputStream(); + bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress); //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); } catch(Throwable error) { - if(readerPosition != null && readerPosition.getInputStream() != null) - readerPosition.getInputStream().reportException(error); + if(accessPlan != null && accessPlan.getInputStream() != null) + accessPlan.getInputStream().reportException(error); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java new file mode 100644 index 000000000..4005f1c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of 
the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; +import java.util.NoSuchElementException; + +/** + * High efficiency filtering iterator designed to filter out reads only included + * in the query results due to the granularity of the BAM index. + * + * Built into the BAM index is a notion of 16kbase granularity -- an index query for + * two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will + * return exactly the same regions within the BAM file. This iterator is optimized + * to subtract out reads which do not at all overlap the interval list passed to the + * constructor. + * + * Example: + * interval list: chr20:6-10 + * Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15. + * Reads that would be discarded by the filter: chr20:1-5, chr20:11-15. + */ +class IntervalOverlapFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private CloseableIterator iterator; + + /** + * The next read, queued up and ready to go. + */ + private SAMRecord nextRead; + + /** + * Rather than using the straight genomic bounds, use filter out only mapped reads. 
+ */ + private boolean keepOnlyUnmappedReads; + + /** + * Custom representation of interval bounds. + * Makes it simpler to track current position. + */ + private int[] intervalContigIndices; + private int[] intervalStarts; + private int[] intervalEnds; + + /** + * Position within the interval list. + */ + private int currentBound = 0; + + public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { + this.iterator = iterator; + + // Look at the interval list to detect whether we should worry about unmapped reads. + // If we find a mix of mapped/unmapped intervals, throw an exception. + boolean foundMappedIntervals = false; + for(GenomeLoc location: intervals) { + if(! GenomeLoc.isUnmapped(location)) + foundMappedIntervals = true; + keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location); + } + + + if(foundMappedIntervals) { + if(keepOnlyUnmappedReads) + throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. 
Please apply this filter to only mapped or only unmapped reads"); + this.intervalContigIndices = new int[intervals.size()]; + this.intervalStarts = new int[intervals.size()]; + this.intervalEnds = new int[intervals.size()]; + int i = 0; + for(GenomeLoc interval: intervals) { + intervalContigIndices[i] = interval.getContigIndex(); + intervalStarts[i] = interval.getStart(); + intervalEnds[i] = interval.getStop(); + i++; + } + } + + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(nextRead == null) + throw new NoSuchElementException("No more reads left in this iterator."); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); + } + + + public void close() { + iterator.close(); + } + + private void advance() { + nextRead = null; + + if(!iterator.hasNext()) + return; + + SAMRecord candidateRead = iterator.next(); + while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { + if(!keepOnlyUnmappedReads) { + // Mapped read filter; check against GenomeLoc-derived bounds. + if(readEndsOnOrAfterStartingBound(candidateRead)) { + // This read ends after the current interval begins. + // Promising, but this read must be checked against the ending bound. + if(readStartsOnOrBeforeEndingBound(candidateRead)) { + // Yes, this read is within both bounds. This must be our next read. + nextRead = candidateRead; + break; + } + else { + // Oops, we're past the end bound. Increment the current bound and try again. + currentBound++; + continue; + } + } + } + else { + // Found an unmapped read. We're done. + if(candidateRead.getReadUnmappedFlag()) { + nextRead = candidateRead; + break; + } + } + + // No more reads available. Stop the search. + if(!iterator.hasNext()) + break; + + // No reasonable read found; advance the iterator. 
+ candidateRead = iterator.next(); + } + } + + /** + * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its + * end will be distorted, so rely only on the alignment start. + * @param read The read to position-check. + * @return True if the read ends on or after the start of the current bound. False otherwise. + */ + private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { + return + // Read ends on a later contig, or... + read.getReferenceIndex() > intervalContigIndices[currentBound] || + // Read ends on this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + } + + /** + * Check whether the read lies before the end of the current bound. + * @param read The read to position-check. + * @return True if the read starts on or before the end of the current bound. False otherwise. + */ + private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { + return + // Read starts on a prior contig, or... + read.getReferenceIndex() < intervalContigIndices[currentBound] || + // Read starts on this contig and the alignment start is registered before this end point. 
+ (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 8d73b1b15..96b55674a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -36,7 +36,7 @@ import java.util.Map; */ public class ReadShard extends Shard { /** - * What is the maximum number of reads which should go into a read shard. + * What is the maximum number of reads per BAM file which should go into a read shard. */ public static int MAX_READS = 10000; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index c040b53c4..27b9e7f77 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SimpleTimer; @@ -567,9 +566,14 @@ public class SAMDataSource { if(threadAllocation.getNumIOThreads() > 0) { BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + inputStream.submitAccessPlan(new 
BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); + } + else { + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); } - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); iteratorMap.put(readers.getReader(id), iterator); @@ -577,8 +581,6 @@ public class SAMDataSource { MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); - - return applyDecoratingIterators(shard.getReadMetrics(), enableVerification, readProperties.useOriginalBaseQualities(), @@ -592,6 +594,49 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } + private class BAMCodecIterator implements CloseableIterator { + private final BlockInputStream inputStream; + private final SAMFileReader reader; + private final BAMRecordCodec codec; + private SAMRecord nextRead; + + private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { + this.inputStream = inputStream; + this.reader = reader; + this.codec = codec; + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void close() { + // NO-OP. 
+ } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); + } + + private void advance() { + final long startCoordinate = inputStream.getFilePointer(); + nextRead = codec.decode(); + final long stopCoordinate = inputStream.getFilePointer(); + + if(reader != null && nextRead != null) + PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate)))); + } + } + /** * Filter reads based on user-specified criteria. * @@ -871,12 +916,9 @@ public class SAMDataSource { public ReaderInitializer call() { final File indexFile = findIndexFile(readerID.samFile); try { - if (threadAllocation.getNumIOThreads() > 0) { + if (threadAllocation.getNumIOThreads() > 0) blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(blockInputStream,indexFile,false); - } - else - reader = new SAMFileReader(readerID.samFile,indexFile,false); + reader = new SAMFileReader(readerID.samFile,indexFile,false); } catch ( RuntimeIOException e ) { if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) throw new UserException.CouldNotReadInputFile(readerID.samFile, e); @@ -933,167 +975,6 @@ public class SAMDataSource { */ private class ReadGroupMapping extends HashMap {} - /** - * Filters out reads that do not overlap the current GenomeLoc. - * Note the custom implementation: BAM index querying returns all reads that could - * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. - */ - private class IntervalOverlapFilteringIterator implements CloseableIterator { - /** - * The wrapped iterator. - */ - private CloseableIterator iterator; - - /** - * The next read, queued up and ready to go. 
- */ - private SAMRecord nextRead; - - /** - * Rather than using the straight genomic bounds, use filter out only mapped reads. - */ - private boolean keepOnlyUnmappedReads; - - /** - * Custom representation of interval bounds. - * Makes it simpler to track current position. - */ - private int[] intervalContigIndices; - private int[] intervalStarts; - private int[] intervalEnds; - - /** - * Position within the interval list. - */ - private int currentBound = 0; - - public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { - this.iterator = iterator; - - // Look at the interval list to detect whether we should worry about unmapped reads. - // If we find a mix of mapped/unmapped intervals, throw an exception. - boolean foundMappedIntervals = false; - for(GenomeLoc location: intervals) { - if(! GenomeLoc.isUnmapped(location)) - foundMappedIntervals = true; - keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location); - } - - - if(foundMappedIntervals) { - if(keepOnlyUnmappedReads) - throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. 
Please apply this filter to only mapped or only unmapped reads"); - this.intervalContigIndices = new int[intervals.size()]; - this.intervalStarts = new int[intervals.size()]; - this.intervalEnds = new int[intervals.size()]; - int i = 0; - for(GenomeLoc interval: intervals) { - intervalContigIndices[i] = interval.getContigIndex(); - intervalStarts[i] = interval.getStart(); - intervalEnds[i] = interval.getStop(); - i++; - } - } - - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(nextRead == null) - throw new NoSuchElementException("No more reads left in this iterator."); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); - } - - - public void close() { - iterator.close(); - } - - private void advance() { - nextRead = null; - - if(!iterator.hasNext()) - return; - - SAMRecord candidateRead = iterator.next(); - while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { - if(!keepOnlyUnmappedReads) { - // Mapped read filter; check against GenomeLoc-derived bounds. - if(readEndsOnOrAfterStartingBound(candidateRead)) { - // This read ends after the current interval begins. - // Promising, but this read must be checked against the ending bound. - if(readStartsOnOrBeforeEndingBound(candidateRead)) { - // Yes, this read is within both bounds. This must be our next read. - nextRead = candidateRead; - break; - } - else { - // Oops, we're past the end bound. Increment the current bound and try again. - currentBound++; - continue; - } - } - } - else { - // Found an unmapped read. We're done. - if(candidateRead.getReadUnmappedFlag()) { - nextRead = candidateRead; - break; - } - } - - // No more reads available. Stop the search. - if(!iterator.hasNext()) - break; - - // No reasonable read found; advance the iterator. 
- candidateRead = iterator.next(); - } - } - - /** - * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its - * end will be distorted, so rely only on the alignment start. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { - return - // Read ends on a later contig, or... - read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... - (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); - } - - /** - * Check whether the read lies before the end of the current bound. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { - return - // Read starts on a prior contig, or... - read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); - } - } - /** * Locates the index file alongside the given BAM, if present. * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index d1f5d80da..da11d36dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -17,9 +17,21 @@ import java.util.List; import java.util.NoSuchElementException; /** - * Buffer shards of data which may or may not contain multiple loci into - * iterators of all data which cover an interval. Its existence is an homage - * to Mark's stillborn WindowMaker, RIP 2009. + * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci + * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp + * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of + * loci to only those covered by the given interval list. 
+ * + * Example: + * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 + * Incoming intervals: chr20:3-7 + * + * Locus iterator by state will produce the following stream of data: + * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, + * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} + * + * WindowMakerIterator will then filter the incoming stream, emitting the following stream: + * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} * * @author mhanna * @version 0.1 diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 316a20a70..6edae3816 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -470,7 +470,7 @@ public class LocusIteratorByState extends LocusIterator { if (op == CigarOperator.D) { if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so - pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()))); + pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()))); size++; nDeletions++; if (read.getMappingQuality() == 0) @@ -479,7 +479,7 @@ public class LocusIteratorByState extends LocusIterator { } else { if (!filterBaseInRead(read, location.getStart())) { - pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != 
read.getAlignmentStart()))); + pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()))); size++; if (read.getMappingQuality() == 0) nMQ0Reads++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index d99e7c353..1d14a7f35 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -102,7 +102,9 @@ public class TraverseLoci extends TraversalEngine,Locu } /** - * Gets the best view of loci for this walker given the available data. + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. * @param walker walker to interrogate. * @param dataProvider Data which which to drive the locus view. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index c078be2f2..154612d25 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -39,7 +39,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { @@ -200,7 +203,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 4e3d4048b..626460be6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -77,20 +77,20 @@ import java.util.Map; *

Output

*

* A recalibration table file in CSV format that is used by the TableRecalibration walker. - * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. * - * The first 20 lines of such a file is shown below. + * The first 20 lines of such a file is shown below. * * The file begins with a series of comment lines describing: * ** The number of counted loci * ** The number of counted bases * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases - * - * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. * * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change - * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. - * + * *

  * # Counted Sites    19451059
  * # Counted Bases    56582018
@@ -129,13 +129,14 @@ import java.util.Map;
  *   -cov DinucCovariate \
  *   -recalFile my_reads.recal_data.csv
  * 
- * */ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality -@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta +@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) +// Filter out all reads with zero or unavailable mapping quality +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { @@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); /** @@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker> covariateClasses = new PluginManager( Covariate.class ).getPlugins(); - final List> requiredClasses = new PluginManager( RequiredCovariate.class ).getPlugins(); - final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins(); + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); // Print and exit if that's what was requested - if ( LIST_ONLY ) { - logger.info( "Available covariates:" ); - for( Class covClass : covariateClasses ) { - logger.info( covClass.getSimpleName() ); + if (LIST_ONLY) { + logger.info("Available covariates:"); + for (Class covClass : covariateClasses) { + 
logger.info(covClass.getSimpleName()); } logger.info(""); - System.exit( 0 ); // Early exit here because user requested it + System.exit(0); // Early exit here because user requested it } // Warn the user if no dbSNP file or other variant mask was specified - if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) { + if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) { throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."); } // Initialize the requested covariates by parsing the -cov argument // First add the required covariates - if( requiredClasses.size() == 2) { // readGroup and reported quality score - requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here - requestedCovariates.add( new QualityScoreCovariate() ); - } else { + if (requiredClasses.size() == 2) { // readGroup and reported quality score + requestedCovariates.add(new ReadGroupCovariate()); // Order is important here + requestedCovariates.add(new QualityScoreCovariate()); + } + else { throw new UserException.CommandLineException("There are more required covariates than expected. 
The instantiation list needs to be updated with the new required covariate and in the correct order."); } // Next add the standard covariates if -standard was specified by the user - if( USE_STANDARD_COVARIATES ) { + if (USE_STANDARD_COVARIATES) { // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order // A list of Classes can't be sorted, but a list of Class names can be final List standardClassNames = new ArrayList(); - for( Class covClass : standardClasses ) { - standardClassNames.add( covClass.getName() ); + for (Class covClass : standardClasses) { + standardClassNames.add(covClass.getName()); } Collections.sort(standardClassNames); // Sort the list of class names - for( String className : standardClassNames ) { - for( Class covClass : standardClasses ) { // Find the class that matches this class name - if( covClass.getName().equals( className ) ) { + for (String className : standardClassNames) { + for (Class covClass : standardClasses) { // Find the class that matches this class name + if (covClass.getName().equals(className)) { try { - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class foundClass = true; - if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { + if 
(!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { try { // Now that we've found a matching class, try to instantiate it - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker 0 ) { + if (gatkRead.getBaseQualities()[offset] > 0) { byte[] bases = gatkRead.getReadBases(); byte refBase = ref.getBase(); // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( bases[offset] ) ) { + if (BaseUtils.isRegularBase(bases[offset])) { // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || - !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || + !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) { // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( counter, gatkRead, offset, refBase ); + updateDataFromRead(counter, gatkRead, offset, refBase); - } else { // calculate SOLID reference insertion rate - if( refBase == bases[offset] ) { + } + else { // calculate SOLID reference insertion rate + if (refBase == bases[offset]) { counter.solidInsertedReferenceBases++; - } else { + } + else { counter.otherColorSpaceInconsistency++; } } @@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker= DBSNP_VALIDATION_CHECK_FREQUENCY ) { + if 
(++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) { counter.lociSinceLastDbsnpCheck = 0; - if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { - final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; - final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; + if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) { + final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases; + final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases; - if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { - Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + - String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); + if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) { + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. 
" + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel)); DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file } } @@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker keyList = new ArrayList(); - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { keyList.add((Comparable) comp); } Collections.sort(keyList); - for( Comparable comp : keyList ) { + for (Comparable comp : keyList) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val); } } } - private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { - for( Object comp : data.keySet() ) { + private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + for (Object comp : data.keySet()) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // 
For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappings( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappings(recalTableStream, curPos + 1, key, (Map) val); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 18b33c0e8..311e33f8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; import java.util.List; @@ -256,32 +257,6 @@ public class RecalDataManager { public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); - // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if (readGroup == null) { - if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { - if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) { - Utils.warnUser("The 
input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName()); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP); - readGroup.setPlatform(RAC.DEFAULT_PLATFORM); - ((GATKSAMRecord) read).setReadGroup(readGroup); - } - else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName()); - } - } - - if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user - final String oldPlatform = readGroup.getPlatform(); - readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP); - readGroup.setPlatform(oldPlatform); - ((GATKSAMRecord) read).setReadGroup(readGroup); - } - if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { readGroup.setPlatform(RAC.FORCE_PLATFORM); } @@ -310,7 +285,7 @@ public class RecalDataManager { public static void parseColorSpace(final GATKSAMRecord read) { // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base - if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { + if (ReadUtils.isSOLiDRead(read)) { if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { @@ -408,7 +383,7 @@ public class RecalDataManager { } public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { - if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { + if (ReadUtils.isSOLiDRead(read)) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; @@ -637,21 +612,17 @@ public class RecalDataManager { final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for (int i = 0; i < numRequestedCovariates; i++) { + for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType); - for (int j = 0; j < readLength; j++) { - //copy values into a 2D array that allows all covar types to be extracted at once for - //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. 
- covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; - } + for (int j = 0; j < readLength; j++) + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. } return covariateValues_offset_x_covar; } /** - * Perform a ceratin transversion (A <-> C or G <-> T) on the base. + * Perform a certain transversion (A <-> C or G <-> T) on the base. * * @param base the base [AaCcGgTt] * @return the transversion of the base, or the input base if it's not one of the understood ones diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 7f3035f1e..9752b1dee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection { // Shared Command Line Arguments ////////////////////////////////// @Hidden - @Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.") - public String DEFAULT_READ_GROUP = null; - @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @Hidden - @Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. 
This is useful to collapse all data into a single read group.") - public String FORCE_READ_GROUP = null; - @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) public int WINDOW_SIZE = 5; - /** - * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. - */ - @Hidden - @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) - public int HOMOPOLYMER_NBACK = 7; - @Hidden - @Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false) - public boolean EXCEPTION_IF_NO_TILE = false; - /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. @@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) public int CONTEXT_SIZE = 8; + /** + * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. 
+ */ + @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) + public int HOMOPOLYMER_NBACK = 7; + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index a8006d506..cd848cd9e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -86,12 +86,12 @@ import java.util.regex.Pattern; * -o my_reads.recal.bam \ * -recalFile my_reads.recal_data.csv * - * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @WalkerName("TableRecalibration") -@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires -I input.bam, it also requires -R reference.fasta public class TableRecalibrationWalker extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; @@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker> classes = new PluginManager(Covariate.class).getPlugins(); @@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + for (Class covClass : classes) { + if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { foundClass = true; try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + Covariate covariate = (Covariate) covClass.newInstance(); + 
requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) + List newRecords = new ArrayList(oldRecords.size() + 1); + for (SAMProgramRecord record : oldRecords) { + if (!record.getId().startsWith(PROGRAM_RECORD_NAME)) newRecords.add(record); } newRecords.add(programRecord); header.setProgramRecords(newRecords); // Write out the new header - OUTPUT_BAM.writeHeader( header ); + OUTPUT_BAM.writeHeader(header); } } /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * * @param line A line of CSV data read from the recalibration table data file */ private void addCSVData(final File file, final String line) { final String[] vals = line.split(","); // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); } @@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). 
@@ -64,28 +66,31 @@ public class BaseUtils { /** * Returns the base substitution type of the 2 state SNP + * * @param base1 * @param base2 * @return */ - public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) { + public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) { BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION; //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t); return t; } - public static boolean isTransition( byte base1, byte base2 ) { + public static boolean isTransition(byte base1, byte base2) { int b1 = simpleBaseToBaseIndex(base1); int b2 = simpleBaseToBaseIndex(base2); return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; } - public static boolean isTransversion( byte base1, byte base2 ) { - return ! isTransition(base1, base2); + public static boolean isTransversion(byte base1, byte base2) { + return !isTransition(base1, base2); } - /** Private constructor. No instantiating this class! */ + /** + * Private constructor. No instantiating this class! 
+ */ private BaseUtils() {} static public boolean basesAreEqual(byte base1, byte base2) { @@ -96,7 +101,6 @@ public class BaseUtils { return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); } - /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -163,33 +167,37 @@ public class BaseUtils { /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ static public int simpleBaseToBaseIndex(byte base) { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } - /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ @Deprecated @@ -197,29 +205,37 @@ public class BaseUtils { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } static public int extendedBaseToBaseIndex(byte base) { switch (base) { case 'd': - case 'D': return DELETION_INDEX; + case 'D': + return DELETION_INDEX; case 'n': - case 'N': return NO_CALL_INDEX; + case 'N': + return NO_CALL_INDEX; - default: return simpleBaseToBaseIndex(base); + default: + return simpleBaseToBaseIndex(base); } } @@ -232,11 +248,6 @@ public class BaseUtils { return simpleBaseToBaseIndex(base) != -1; } - @Deprecated - static public boolean isNBase(char base) { - return isNBase((byte)base); - } - static public boolean isNBase(byte 
base) { return base == 'N' || base == 'n'; } @@ -244,68 +255,83 @@ public class BaseUtils { /** * Converts a base index to a simple base * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return A, C, G, T, or '.' if the index can't be understood */ static public byte baseIndexToSimpleBase(int baseIndex) { switch (baseIndex) { - case 0: return 'A'; - case 1: return 'C'; - case 2: return 'G'; - case 3: return 'T'; - default: return '.'; + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return '.'; } } @Deprecated static public char baseIndexToSimpleBaseAsChar(int baseIndex) { - return (char)baseIndexToSimpleBase(baseIndex); + return (char) baseIndexToSimpleBase(baseIndex); } /** * Converts a base index to a base index representing its cross-talk partner * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return 1, 0, 3, 2, or -1 if the index can't be understood */ static public int crossTalkPartnerIndex(int baseIndex) { switch (baseIndex) { - case 0: return 1; // A -> C - case 1: return 0; // C -> A - case 2: return 3; // G -> T - case 3: return 2; // T -> G - default: return -1; + case 0: + return 1; // A -> C + case 1: + return 0; // C -> A + case 2: + return 3; // G -> T + case 3: + return 2; // T -> G + default: + return -1; } } /** * Converts a base to the base representing its cross-talk partner * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return C, A, T, G, or '.' if the base can't be understood */ @Deprecated static public char crossTalkPartnerBase(char base) { - return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); + return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); } /** * Return the complement of a base index. 
* - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) + * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) * @return the complementary base index */ static public byte complementIndex(int baseIndex) { switch (baseIndex) { - case 0: return 3; // a -> t - case 1: return 2; // c -> g - case 2: return 1; // g -> c - case 3: return 0; // t -> a - default: return -1; // wtf? + case 0: + return 3; // a -> t + case 1: + return 2; // c -> g + case 2: + return 1; // g -> c + case 3: + return 0; // t -> a + default: + return -1; // wtf? } } - /** + /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). * * @param base the base [AaCcGgTt] @@ -314,20 +340,25 @@ public class BaseUtils { static public byte simpleComplement(byte base) { switch (base) { case 'A': - case 'a': return 'T'; + case 'a': + return 'T'; case 'C': - case 'c': return 'G'; + case 'c': + return 'G'; case 'G': - case 'g': return 'C'; + case 'g': + return 'C'; case 'T': - case 't': return 'A'; - default: return base; + case 't': + return 'A'; + default: + return base; } } @Deprecated static public char simpleComplement(char base) { - return (char)simpleComplement((byte)base); + return (char) simpleComplement((byte) base); } /** @@ -349,7 +380,7 @@ public class BaseUtils { /** * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) * - * @param bases the byte array of bases + * @param bases the byte array of bases * @return the complement of the base byte array */ static public byte[] simpleComplement(byte[] bases) { @@ -382,7 +413,7 @@ public class BaseUtils { /** * Complement a char array of bases * - * @param bases the char array of bases + * @param bases the char array of bases * @return the complement of the base char array */ @Deprecated @@ -399,7 +430,7 @@ public class BaseUtils { /** * Reverse complement a String of bases. Preserves ambiguous bases. 
* - * @param bases the String of bases + * @param bases the String of bases * @return the reverse complement of the String */ @Deprecated @@ -407,11 +438,10 @@ public class BaseUtils { return new String(simpleReverseComplement(bases.getBytes())); } - /** * Complement a String of bases. Preserves ambiguous bases. * - * @param bases the String of bases + * @param bases the String of bases * @return the complement of the String */ @Deprecated @@ -451,7 +481,7 @@ public class BaseUtils { /** * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. * - * @param baseCounts counts of a,c,g,t in order. + * @param baseCounts counts of a,c,g,t in order. * @return the most common base */ static public byte mostFrequentSimpleBase(int[] baseCounts) { @@ -461,13 +491,13 @@ public class BaseUtils { /** * For the most frequent base in the sequence, return the percentage of the read it constitutes. * - * @param sequence the read sequence - * @return the percentage of the read that's made up of the most frequent base + * @param sequence the read sequence + * @return the percentage of the read that's made up of the most frequent base */ static public double mostFrequentBaseFraction(byte[] sequence) { int[] baseCounts = new int[4]; - for ( byte base : sequence ) { + for (byte base : sequence) { int baseIndex = simpleBaseToBaseIndex(base); if (baseIndex >= 0) { @@ -477,7 +507,7 @@ public class BaseUtils { int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); - return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length); + return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length); } // -------------------------------------------------------------------------------- @@ -531,50 +561,50 @@ public class BaseUtils { static public byte getRandomBase(char excludeBase) { return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); } - - - /** Computes the smallest 
period >= minPeriod for the specified string. The period is defined as such p, + + /** + * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is + * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period + * of 2 (it has a period of 4 as well), and so does + * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range * or if specified minPeriod is greater than the sequence length. - * + * * @param seq * @return */ public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = ( minPeriod > seq.length ? seq.length : minPeriod ); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for ( int pos = period ; pos < seq.length ; pos++ ) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if ( Character.toUpperCase( seq[pos] ) != - Character.toUpperCase( seq[offset] ) - ) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. 
current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? - // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if ( offset == 0 ) period = pos+1; - else period = pos-- ; - - } - } - return period; + int period = (minPeriod > seq.length ? seq.length : minPeriod); + // we assume that bases [0,period-1] repeat themselves and check this assumption + // until we find correct period + + for (int pos = period; pos < seq.length; pos++) { + + int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' + // if our current hypothesis holds, base[pos] must be the same as base[offset] + + if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { + + // period we have been trying so far does not work. + // two possibilities: + // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; + // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; + // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance + // pos will be autoincremented and we will be checking next base + // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? 
+ // hence we should first check if it matches the first base of the sequence, and to do that + // we set period to pos (thus trying the hypothesis that bases from start up to the current one, + // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base + // on the next loop re-entrance after pos is autoincremented) + if (offset == 0) + period = pos + 1; + else + period = pos--; + + } + } + return period; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java index 4f01f2b7a..597dc4803 100644 --- a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -87,7 +87,7 @@ public enum NGSPlatform { /** * Returns the NGSPlatform corresponding to the PL tag in the read group * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object - * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match + * @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match */ public static final NGSPlatform fromReadGroupPL(final String plFromRG) { if ( plFromRG == null ) return UNKNOWN; @@ -105,4 +105,14 @@ public enum NGSPlatform { return UNKNOWN; } + + /** + * checks whether or not the requested platform is listed in the set (and is not unknown) + * + * @param platform the read group string that describes the platform used + * @return true if the platform is known (i.e. 
it's in the list and is not UNKNOWN) + */ + public static final boolean isKnown (final String platform) { + return fromReadGroupPL(platform) != UNKNOWN; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 82e403842..70ad70f43 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup pileup = new UnifiedPileupElementTracker(); for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important + pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important } return pileup; @@ -204,7 +204,7 @@ public abstract class AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); // -------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 921da2a1f..506442d03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ 
-48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement { public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { - super(read, offset, type == Type.DELETION, false, false); // extended events are slated for removal + super(read, offset, type == Type.DELETION, false, false, false); // extended events are slated for removal this.read = read; this.offset = offset; this.eventLength = eventLength; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index a4830223e..9df22700e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -24,6 +24,7 @@ public class PileupElement implements Comparable { protected final GATKSAMRecord read; protected final int offset; protected final boolean isDeletion; + protected final boolean isBeforeDeletion; protected final boolean isBeforeInsertion; protected final boolean isNextToSoftClip; @@ -33,6 +34,7 @@ public class PileupElement implements Comparable { * @param read the read we are adding to the pileup * @param offset the position in the read for this base. All deletions must be left aligned! 
(-1 is only allowed for reads starting with insertions) * @param isDeletion whether or not this base is a deletion + * @param isBeforeDeletion whether or not this base is before a deletion * @param isBeforeInsertion whether or not this base is before an insertion * @param isNextToSoftClip whether or not this base is next to a soft clipped base */ @@ -40,13 +42,14 @@ public class PileupElement implements Comparable { "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { if (offset < 0 && isDeletion) throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); this.read = read; this.offset = offset; this.isDeletion = isDeletion; + this.isBeforeDeletion = isBeforeDeletion; this.isBeforeInsertion = isBeforeInsertion; this.isNextToSoftClip = isNextToSoftClip; } @@ -55,6 +58,10 @@ public class PileupElement implements Comparable { return isDeletion; } + public boolean isBeforeDeletion() { + return isBeforeDeletion; + } + public boolean isBeforeInsertion() { return isBeforeInsertion; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index df334f557..357195daa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -96,7 +96,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } @Override - protected ExtendedEventPileupElement 
createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 20b100001..7a6ebef21 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -71,7 +71,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false)); + pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false)); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index c9b81a9d3..b3b9ab555 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -30,7 +30,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class GATKReportUnitTest extends BaseTest { - @Test + @Test(enabled = false) public void testParse() throws Exception { String reportPath = validationDataLocation + "exampleGATKReport.eval"; GATKReport report = new GATKReport(reportPath); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 729503f84..520fb7040 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false, false, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false); + PileupElement readp = new PileupElement(read, 0, false, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false); Assert.assertFalse(readp.getRead().isReducedRead()); diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 42a027be1..085e0b008 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils} import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import net.sf.samtools.SAMFileHeader import java.util.Collections @@ -50,6 +50,8 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { IntervalUtils.parseIntervalArguments(parser, intervals) Collections.sort(parsedLocs) Collections.unmodifiableList(parsedLocs) + val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) + Collections.unmodifiableList(mergedLocs) } lazy val contigs = 
locs.map(_.getContig).distinct.toSeq diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index db0d187c9..b23350557 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils class GATKIntervalsUnitTest { private final lazy val hg18Reference = new File(BaseTest.hg18Reference) @@ -60,7 +61,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. 
- //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + Assert.assertEquals(IntervalUtils.splitFixedIntervals(gi.locs, 189894).size(), 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -77,4 +78,17 @@ class GATKIntervalsUnitTest { Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3")) Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) } + + @Test + def testSortAndMergeIntervals() { + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30")) + } + + private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) { + Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + } }