diff --git a/ivy.xml b/ivy.xml index 96c1de844..ee24bc367 100644 --- a/ivy.xml +++ b/ivy.xml @@ -76,7 +76,7 @@ - + diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java new file mode 100644 index 000000000..5005b6265 --- /dev/null +++ b/public/java/src/net/sf/samtools/BAMFileReader.java @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Internal class for reading and querying BAM files. 
+ */ +class BAMFileReader extends SAMFileReader.ReaderImplementation { + // True if reading from a File rather than an InputStream + private boolean mIsSeekable = false; + + // For converting bytes into other primitive types + private BinaryCodec mStream = null; + + // Underlying compressed data stream. + private final BAMInputStream mInputStream; + private SAMFileHeader mFileHeader = null; + + // Populated if the file is seekable and an index exists + private File mIndexFile; + private BAMIndex mIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + + // If true, all SAMRecords are fully decoded as they are read. + private final boolean eagerDecode; + + // For error-checking. + private ValidationStringency mValidationStringency; + + // For creating BAMRecords + private SAMRecordFactory samRecordFactory; + + /** + * Use the caching index reader implementation rather than the disk-hit-per-file model. + */ + private boolean mEnableIndexCaching = false; + + /** + * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. + */ + private boolean mEnableIndexMemoryMapping = true; + + /** + * Add information about the origin (reader and position) to SAM records. + */ + private SAMFileReader mFileReader = null; + + /** + * Prepare to read BAM from a stream (not seekable) + * @param stream source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalid reads or header lines. + */ + BAMFileReader(final InputStream stream, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = false; + mInputStream = stream instanceof BAMInputStream ? 
(BAMInputStream)stream : new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(null); + } + + /** + * Prepare to read BAM from a file (seekable) + * @param file source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalid reads or header lines. + */ + BAMFileReader(final File file, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); + if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { + System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + + " is older than BAM " + file.getAbsolutePath()); + } + } + + BAMFileReader(final SeekableStream strm, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(strm instanceof BAMInputStream ? 
(BAMInputStream)strm : new BlockCompressedInputStream(strm), + indexFile, + eagerDecode, + strm.getSource(), + validationStringency, + factory); + } + + private BAMFileReader(final BAMInputStream inputStream, + final File indexFile, + final boolean eagerDecode, + final String source, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = true; + mInputStream = inputStream; + mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(source); + mFirstRecordPointer = inputStream.getFilePointer(); + } + + /** + * If true, writes the source of every read into the source SAMRecords. + * @param enabled true to write source information into each SAMRecord. + */ + void enableFileSource(final SAMFileReader reader, final boolean enabled) { + this.mFileReader = enabled ? reader : null; + } + + /** + * If true, uses the caching version of the index reader. + * @param enabled true to enable index caching; false to read the index from disk on each access. + */ + public void enableIndexCaching(final boolean enabled) { + if(mIndex != null) + throw new SAMException("Unable to turn on index caching; index file has already been loaded."); + this.mEnableIndexCaching = enabled; + } + + /** + * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). + * This is slower but more scalable when accessing large numbers of BAM files sequentially. + * @param enabled True to use memory mapping, false to use regular I/O. 
+ */ + public void enableIndexMemoryMapping(final boolean enabled) { + if (mIndex != null) { + throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); + } + this.mEnableIndexMemoryMapping = enabled; + } + + @Override void enableCrcChecking(final boolean enabled) { + this.mInputStream.setCheckCrcs(enabled); + } + + @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } + + /** + * @return true if this is a BAM file, and has an index + */ + public boolean hasIndex() { + return (mIndexFile != null); + } + + /** + * Retrieves the index for the given file type. Ensure that the index is of the specified type. + * @return An index of the given type. + */ + public BAMIndex getIndex() { + if(mIndexFile == null) + throw new SAMException("No index is available for this BAM file."); + if(mIndex == null) + mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) + : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); + return mIndex; + } + + void close() { + if (mStream != null) { + mStream.close(); + } + if (mIndex != null) { + mIndex.close(); + } + mStream = null; + mFileHeader = null; + mIndex = null; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Set error-checking level for subsequent SAMRecord reads. + */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + SAMFileReader.ValidationStringency getValidationStringency() { + return this.mValidationStringency; + } + + /** + * Prepare to iterate through the SAMRecords in file order. + * Only a single iterator on a BAM file can be extant at a time. 
If getIterator() or a query method has been called once, + * that iterator must be closed before getIterator() can be called again. + * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to + * getIterator() begins its iteration where the last one left off. That is the best that can be + * done in that situation. + */ + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + @Override + CloseableIterator getIterator(final SAMFileSpan chunks) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!(chunks instanceof BAMFileSpan)) { + throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); + } + + // Create an iterator over the given chunk boundaries. + mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); + return mCurrentIterator; + } + + /** + * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know + * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, + * the rightmost bound is guaranteed to be after the last read in the file. + * @return An unbounded pointer to the first record in the BAM file. 
+ */ + @Override + SAMFileSpan getFilePointerSpanningReads() { + return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); + } + + /** + * Prepare to iterate through the SAMRecords that match the given interval. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. + * A value of zero implies the start of the reference sequence. + * @param end A value of zero implies the end of the reference sequence. + * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval + * specified by start and end. If false, the SAMRecords need only overlap the interval. + * @return Iterator for the matching SAMRecords + */ + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, end, contained? 
QueryType.CONTAINED: QueryType.OVERLAPPING); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords with the given alignment start. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Alignment start sought. + * @return Iterator for the matching SAMRecords. + */ + CloseableIterator queryAlignmentStart(final String sequence, final int start) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); + return mCurrentIterator; + } + + public CloseableIterator queryUnmapped() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + try { + final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); + if (startOfLastLinearBin != -1) { + mInputStream.seek(startOfLastLinearBin); + } else { + // No mapped reads in file, just start 
at the first read in file. + mInputStream.seek(mFirstRecordPointer); + } + mCurrentIterator = new BAMFileIndexUnmappedIterator(); + return mCurrentIterator; + } catch (IOException e) { + throw new RuntimeException("IOException seeking to unmapped reads", e); + } + } + + /** + * Reads the header from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private void readHeader(final String source) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(mValidationStringency); + mFileHeader = headerCodec.decode(new StringLineReader(textHeader), + source); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + mFileHeader.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + source); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For 
sequence " + i + ", text and binary have different lengths in file " + + source); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(source)); + } + mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } + } + + /** + * Reads a single binary sequence record from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private SAMSequenceRecord readSequenceRecord(final String source) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); + } + + /** + * Iterator for non-indexed sequential iteration through all SAMRecords in file. + * Starting point of iteration is wherever current file position is when the iterator is constructed. 
+ */ + private class BAMFileIterator implements CloseableIterator { + private SAMRecord mNextRecord = null; + private final BAMRecordCodec bamRecordCodec; + private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file + + BAMFileIterator() { + this(true); + } + + /** + * @param advance Trick to enable subclass to do more setup before advancing + */ + BAMFileIterator(final boolean advance) { + this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); + this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); + + if (advance) { + advance(); + } + } + + public void close() { + if (mCurrentIterator != null && this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public SAMRecord next() { + final SAMRecord result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + + if (mNextRecord != null) { + ++this.samRecordIndex; + // Because some decoding is done lazily, the record needs to remember the validation stringency. + mNextRecord.setValidationStringency(mValidationStringency); + + if (mValidationStringency != ValidationStringency.SILENT) { + final List validationErrors = mNextRecord.isValid(); + SAMUtils.processValidationErrors(validationErrors, + this.samRecordIndex, BAMFileReader.this.getValidationStringency()); + } + } + if (eagerDecode && mNextRecord != null) { + mNextRecord.eagerDecode(); + } + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + /** + * Read the next record from the input stream. 
+ */ + SAMRecord getNextRecord() throws IOException { + final long startCoordinate = mInputStream.getFilePointer(); + final SAMRecord next = bamRecordCodec.decode(); + final long stopCoordinate = mInputStream.getFilePointer(); + + if(mFileReader != null && next != null) + next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); + + return next; + } + + /** + * @return The record that will be returned by the next call to next() + */ + protected SAMRecord peek() { + return mNextRecord; + } + } + + /** + * Prepare to iterate through SAMRecords matching the target interval. + * @param sequence Desired reference sequence. + * @param start 1-based start of target interval, inclusive. + * @param end 1-based end of target interval, inclusive. + * @param queryType contained, overlapping, or starting-at query. + */ + private CloseableIterator createIndexIterator(final String sequence, + final int start, + final int end, + final QueryType queryType) { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + final SAMFileHeader fileHeader = getFileHeader(); + final int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMIndex fileIndex = getIndex(); + final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end); + filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; + } + + // Create an iterator over the above chunk boundaries. + final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); + + // Add some preprocessing filters for edge-case reads that don't fit into this + // query type. + return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); + } + + enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} + + /** + * Look for BAM index file according to standard naming convention. + * + * @param dataFile BAM file name. 
+ * @return Index file name, or null if not found. + */ + private static File findIndexFile(final File dataFile) { + // If input is foo.bam, look for foo.bai + final String bamExtension = ".bam"; + File indexFile; + final String fileName = dataFile.getName(); + if (fileName.endsWith(bamExtension)) { + final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix; + indexFile = new File(dataFile.getParent(), bai); + if (indexFile.exists()) { + return indexFile; + } + } + + // If foo.bai doesn't exist look for foo.bam.bai + indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + private class BAMFileIndexIterator extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + + /** + * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. + * @param filePointers the block / offset combination, stored in chunk format. + */ + BAMFileIndexIterator(final long[] filePointers) { + super(false); // delay advance() until after construction + mFilePointers = filePointers; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + // Advance to next file block if necessary + while (mInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + return super.getNextRecord(); + } + } + + /** + * A decorating iterator that filters out records that are outside the bounds of the + * given query parameters. 
+ */ + private class BAMQueryFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private final CloseableIterator wrappedIterator; + + /** + * The next record to be returned. Will be null if no such record exists. + */ + private SAMRecord mNextRecord; + + private final int mReferenceIndex; + private final int mRegionStart; + private final int mRegionEnd; + private final QueryType mQueryType; + + public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { + this.wrappedIterator = iterator; + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + mRegionStart = start; + if (queryType == QueryType.STARTING_AT) { + mRegionEnd = mRegionStart; + } else { + mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; + } + mQueryType = queryType; + mNextRecord = advance(); + } + + /** + * Returns true if a next element exists; false otherwise. + */ + public boolean hasNext() { + return mNextRecord != null; + } + + /** + * Gets the next record from the given iterator. + * @return The next SAM record in the iterator. + */ + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); + final SAMRecord currentRead = mNextRecord; + mNextRecord = advance(); + return currentRead; + } + + /** + * Closes down the existing iterator. + */ + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + /** + * @throws UnsupportedOperationException always. 
+ */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + SAMRecord advance() { + while (true) { + // Pull next record from stream + if(!wrappedIterator.hasNext()) + return null; + + final SAMRecord record = wrappedIterator.next(); + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + // If read is unmapped but has a coordinate, return it if the coordinate is within + // the query region, regardless of whether the mapped mate will be returned. + final int alignmentEnd; + if (mQueryType == QueryType.STARTING_AT) { + alignmentEnd = -1; + } else { + alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? 
+ record.getAlignmentEnd(): alignmentStart); + } + + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + return null; + } + // Filter for overlap with region + if (mQueryType == QueryType.CONTAINED) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else if (mQueryType == QueryType.OVERLAPPING) { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } else { + if (alignmentStart == mRegionStart) { + return record; + } + } + } + } + } + + private class BAMFileIndexUnmappedIterator extends BAMFileIterator { + private BAMFileIndexUnmappedIterator() { + while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + advance(); + } + } + } + +} diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index 623f46291..4692c6671 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -25,6 +25,7 @@ package net.sf.samtools; import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan { super(); } + /** + * Create a new GATKBAMFileSpan from an existing BAMFileSpan. + * @param sourceFileSpan + */ + public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) { + if(!(sourceFileSpan instanceof BAMFileSpan)) + throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead"); + BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan; + for(Chunk chunk: sourceBAMFileSpan.getChunks()) + add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk)); + } + /** * Convenience constructor to construct a BAM file span from * a single chunk. 
diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index f590809e2..5d349e72e 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -69,6 +69,22 @@ public class GATKChunk extends Chunk { super.setChunkEnd(value); } + public long getBlockStart() { + return getChunkStart() >>> 16; + } + + public int getBlockOffsetStart() { + return (int)(getChunkStart() & 0xFFFF); + } + + public long getBlockEnd() { + return getChunkEnd() >>> 16; + } + + public int getBlockOffsetEnd() { + return ((int)getChunkEnd() & 0xFFFF); + } + /** * Computes an approximation of the uncompressed size of the * chunk, in bytes. Can be used to determine relative weights diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java new file mode 100644 index 000000000..d825c23d5 --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package net.sf.samtools.util; + +import java.io.IOException; + +/** + * An input stream formulated for use reading BAM files. Supports seeking via virtual file pointers. + */ +public interface BAMInputStream { + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) throws IOException; + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer(); + + /** + * Determines whether or not the inflater will re-calculate the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. 
+ */ + public void setCheckCrcs(final boolean check); + + public int read() throws java.io.IOException; + + public int read(byte[] bytes) throws java.io.IOException; + + public int read(byte[] bytes, int i, int i1) throws java.io.IOException; + + public long skip(long l) throws java.io.IOException; + + public int available() throws java.io.IOException; + + public void close() throws java.io.IOException; + + public void mark(int i); + + public void reset() throws java.io.IOException; + + public boolean markSupported(); +} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java new file mode 100755 index 000000000..fae2fc89b --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java @@ -0,0 +1,483 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package net.sf.samtools.util; + + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import net.sf.samtools.FileTruncatedException; + +/* + * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. + * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. + * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the + * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. + * + * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format + */ +public class BlockCompressedInputStream extends InputStream implements BAMInputStream { + private InputStream mStream = null; + private SeekableStream mFile = null; + private byte[] mFileBuffer = null; + private byte[] mCurrentBlock = null; + private int mCurrentOffset = 0; + private long mBlockAddress = 0; + private int mLastBlockLength = 0; + private final BlockGunzipper blockGunzipper = new BlockGunzipper(); + + + /** + * Note that seek() is not supported if this ctor is used. + */ + public BlockCompressedInputStream(final InputStream stream) { + mStream = IOUtil.toBufferedStream(stream); + mFile = null; + } + + /** + * Use this ctor if you wish to call seek() + */ + public BlockCompressedInputStream(final File file) + throws IOException { + mFile = new SeekableFileStream(file); + mStream = null; + + } + + public BlockCompressedInputStream(final URL url) { + mFile = new SeekableBufferedStream(new SeekableHTTPStream(url)); + mStream = null; + } + + /** + * For providing some arbitrary data source. 
No additional buffering is + * provided, so if the underlying source is not buffered, wrap it in a + * SeekableBufferedStream before passing to this ctor. + */ + public BlockCompressedInputStream(final SeekableStream strm) { + mFile = strm; + mStream = null; + } + + /** + * Determines whether or not the inflater will re-calculated the CRC on the decompressed data + * and check it against the value stored in the GZIP header. CRC checking is an expensive + * operation and should be used accordingly. + */ + public void setCheckCrcs(final boolean check) { + this.blockGunzipper.setCheckCrcs(check); + } + + /** + * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the + * next caller of a method for this input stream. The next caller might be the same thread or another thread. + * Note that although the next caller can read this many bytes without blocking, the available() method call itself + * may block in order to fill an internal buffer if it has been exhausted. + */ + public int available() + throws IOException { + if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { + readBlock(); + } + if (mCurrentBlock == null) { + return 0; + } + return mCurrentBlock.length - mCurrentOffset; + } + + /** + * Closes the underlying InputStream or RandomAccessFile + */ + public void close() + throws IOException { + if (mFile != null) { + mFile.close(); + mFile = null; + } else if (mStream != null) { + mStream.close(); + mStream = null; + } + // Encourage garbage collection + mFileBuffer = null; + mCurrentBlock = null; + } + + /** + * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255. + * If no byte is available because the end of the stream has been reached, the value -1 is returned. + * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown. 
+ + * @return the next byte of data, or -1 if the end of the stream is reached. + */ + public int read() + throws IOException { + return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1; + } + + /** + * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes + * actually read is returned as an integer. This method blocks until input data is available, end of file is detected, + * or an exception is thrown. + * + * read(buf) has the same effect as read(buf, 0, buf.length). + * + * @param buffer the buffer into which the data is read. + * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of + * the stream has been reached. + */ + public int read(final byte[] buffer) + throws IOException { + return read(buffer, 0, buffer.length); + } + + private volatile ByteArrayOutputStream buf = null; + private static final byte eol = '\n'; + private static final byte eolCr = '\r'; + + /** + * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'), + * carriage return ('\r') or carriage return followed by a line feed ("\r\n"). 
+ * + * @return A String containing the contents of the line, excluding the line terminating + * character, or null if the end of the stream has been reached + * + * @exception IOException If an I/O error occurs + * @ + */ + public String readLine() throws IOException { + int available = available(); + if (available == 0) { + return null; + } + if(null == buf){ // lazy initialisation + buf = new ByteArrayOutputStream(8192); + } + buf.reset(); + boolean done = false; + boolean foundCr = false; // \r found flag + while (!done) { + int linetmpPos = mCurrentOffset; + int bCnt = 0; + while((available-- > 0)){ + final byte c = mCurrentBlock[linetmpPos++]; + if(c == eol){ // found \n + done = true; + break; + } else if(foundCr){ // previous char was \r + --linetmpPos; // current char is not \n so put it back + done = true; + break; + } else if(c == eolCr){ // found \r + foundCr = true; + continue; // no ++bCnt + } + ++bCnt; + } + if(mCurrentOffset < linetmpPos){ + buf.write(mCurrentBlock, mCurrentOffset, bCnt); + mCurrentOffset = linetmpPos; + } + available = available(); + if(available == 0){ + // EOF + done = true; + } + } + return buf.toString(); + } + + /** + * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read + * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer. + * + * This method blocks until input data is available, end of file is detected, or an exception is thrown. + * + * @param buffer buffer into which data is read. + * @param offset the start offset in array b at which the data is written. + * @param length the maximum number of bytes to read. + * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of + * the stream has been reached. 
+ */ + public int read(final byte[] buffer, int offset, int length) + throws IOException { + final int originalLength = length; + while (length > 0) { + final int available = available(); + if (available == 0) { + // Signal EOF to caller + if (originalLength == length) { + return -1; + } + break; + } + final int copyLength = Math.min(length, available); + System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); + mCurrentOffset += copyLength; + offset += copyLength; + length -= copyLength; + } + return originalLength - length; + } + + /** + * Seek to the given position in the file. Note that pos is a special virtual file pointer, + * not an actual byte offset. + * + * @param pos virtual file pointer + */ + public void seek(final long pos) + throws IOException { + if (mFile == null) { + throw new IOException("Cannot seek on stream based file"); + } + // Decode virtual file pointer + // Upper 48 bits is the byte offset into the compressed stream of a block. + // Lower 16 bits is the byte offset into the uncompressed stream inside the block. + final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); + final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); + final int available; + if (mBlockAddress == compressedOffset && mCurrentBlock != null) { + available = mCurrentBlock.length; + } else { + mFile.seek(compressedOffset); + mBlockAddress = compressedOffset; + mLastBlockLength = 0; + readBlock(); + available = available(); + } + if (uncompressedOffset > available || + (uncompressedOffset == available && !eof())) { + throw new IOException("Invalid file pointer: " + pos); + } + mCurrentOffset = uncompressedOffset; + } + + private boolean eof() throws IOException { + if (mFile.eof()) { + return true; + } + // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. 
+ return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + } + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer() { + if (mCurrentOffset == mCurrentBlock.length) { + // If current offset is at the end of the current block, file pointer should point + // to the beginning of the next block. + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); + } + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); + } + + public static long getFileBlock(final long bgzfOffset) { + return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); + } + + /** + * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). + * @return true if the given file looks like a valid BGZF file. 
+ */ + public static boolean isValidFile(final InputStream stream) + throws IOException { + if (!stream.markSupported()) { + throw new RuntimeException("Cannot test non-buffered stream"); + } + stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + stream.reset(); + return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); + } + + private static boolean isValidBlockHeader(final byte[] buffer) { + return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && + (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && + (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && + buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && + buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && + buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); + } + + private void readBlock() + throws IOException { + + if (mFileBuffer == null) { + mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; + } + int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + if (count == 0) { + // Handle case where there is no empty gzip block at end. 
+ mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mCurrentBlock = new byte[0]; + return; + } + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + throw new IOException("Premature end of file"); + } + final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; + if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { + throw new IOException("Unexpected compressed block length: " + blockLength); + } + final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; + count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); + if (count != remaining) { + throw new FileTruncatedException("Premature end of file"); + } + inflateBlock(mFileBuffer, blockLength); + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mLastBlockLength = blockLength; + } + + private void inflateBlock(final byte[] compressedBlock, final int compressedLength) + throws IOException { + final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); + byte[] buffer = mCurrentBlock; + mCurrentBlock = null; + if (buffer == null || buffer.length != uncompressedLength) { + try { + buffer = new byte[uncompressedLength]; + } catch (NegativeArraySizeException e) { + throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); + } + } + blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); + mCurrentBlock = buffer; + } + + private int readBytes(final byte[] buffer, final int offset, final int length) + throws IOException { + if (mFile != null) { + return readBytes(mFile, buffer, offset, length); + } else if (mStream != null) { + return readBytes(mStream, buffer, offset, length); + } else { + return 0; + } + } + + private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) + throws IOException { 
+ int bytesRead = 0; + while (bytesRead < length) { + final int count = file.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + private int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } + + public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} + + public static FileTermination checkTermination(final File file) + throws IOException { + final long fileSize = file.length(); + if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { + return FileTermination.DEFECTIVE; + } + final RandomAccessFile raFile = new RandomAccessFile(file, "r"); + try { + raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; + raFile.readFully(buf); + if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { + return FileTermination.HAS_TERMINATOR_BLOCK; + } + final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); + buf = new byte[bufsize]; + raFile.seek(fileSize - bufsize); + raFile.read(buf); + for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; + i >= 0; --i) { + if 
(!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE, + buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) { + continue; + } + final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF; + if (buf.length - i == totalBlockSizeMinusOne + 1) { + return FileTermination.HAS_HEALTHY_LAST_BLOCK; + } else { + return FileTermination.DEFECTIVE; + } + } + return FileTermination.DEFECTIVE; + } finally { + raFile.close(); + } + } + + private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) { + for (int i = 0; i < length; ++i) { + if (preamble[i] != buf[i + startOffset]) { + return false; + } + } + return true; + } +} + + diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index bed1e710e..9e1be5bca 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -331,12 +331,12 @@ public abstract class CommandLineProgram { * used to indicate an error occured * * @param msg the message - * @param e the error + * @param t the error */ - public static void exitSystemWithError(String msg, final Exception e) { + public static void exitSystemWithError(String msg, final Throwable t) { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("stack trace %n"); - e.printStackTrace(); + t.printStackTrace(); errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); @@ -392,10 +392,10 @@ 
public abstract class CommandLineProgram { /** * used to indicate an error occured * - * @param e the exception occured + * @param t the exception that occurred */ - public static void exitSystemWithError(Exception e) { - exitSystemWithError(e.getMessage(), e); + public static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); } /** diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index f920d90ef..9e2c9a818 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -45,7 +45,7 @@ import java.util.*; * * The IntervalBinding is a formal GATK argument that bridges between a walker and * the engine to construct intervals for traversal at runtime. The IntervalBinding can - * either be a RodBinding, a string of one or more intervals, or a file with interval strings. + * either be a RodBinding, a string of one interval, or a file with interval strings. * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. * * Note that this class is immutable. 
@@ -108,4 +108,8 @@ public final class IntervalBinding { return intervals; } + + public String toString() { + return getSource(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index b8488dc9a..b4d337d8d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.Attribution; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -97,13 +96,20 @@ public class CommandLineGATK extends CommandLineExecutable { // lazy loaded, so they aren't caught elsewhere and made into User Exceptions exitSystemWithUserError(e); } catch (net.sf.samtools.SAMException e) { - // Let's try this out and see how it is received by our users + checkForTooManyOpenFilesProblem(e.getMessage()); exitSystemWithSamError(e); - } catch (Exception e) { - exitSystemWithError(e); + } catch (Throwable t) { + checkForTooManyOpenFilesProblem(t.getMessage()); + exitSystemWithError(t); } } + private static void checkForTooManyOpenFilesProblem(String message) { + // Special case the "Too many open files" error because it's a common User Error for which we know what to do + if ( message != null && message.indexOf("Too many open files") != -1 ) + exitSystemWithUserError(new UserException.TooManyOpenFiles()); + } + /** * Creates the a short blurb about the GATK, copyright info, and where to get documentation. 
* diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f8e87aa58..f2e0b5d0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; @@ -126,6 +127,11 @@ public class GenomeAnalysisEngine { */ private Collection filters; + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + /** * A currently hacky unique name for this GATK instance */ @@ -199,6 +205,9 @@ public class GenomeAnalysisEngine { if (this.getArguments().nonDeterministicRandomSeed) resetRandomGenerator(System.currentTimeMillis()); + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + // Prepare the data for traversal. 
initializeDataSources(); @@ -218,7 +227,7 @@ public class GenomeAnalysisEngine { // create the output streams " initializeOutputStreams(microScheduler.getOutputTracker()); - ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); // execute the microscheduler, storing the results return microScheduler.execute(this.walker, shardStrategy); @@ -266,6 +275,16 @@ public class GenomeAnalysisEngine { return Collections.unmodifiableList(filters); } + /** + * Parse out the thread allocation from the given command-line argument. + */ + private void determineThreadAllocation() { + Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + Integer numCPUThreads = tags.containsKey("cpu") ? Integer.parseInt(tags.getValue("cpu")) : null; + Integer numIOThreads = tags.containsKey("io") ? Integer.parseInt(tags.getValue("io")) : null; + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads); + } + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. 
@@ -286,7 +305,7 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); } - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); } protected DownsamplingMethod getDownsamplingMethod() { @@ -397,103 +416,49 @@ public class GenomeAnalysisEngine { * @param intervals intervals * @return the sharding strategy */ - protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original - // sharding system; it's required with the new sharding system only for locus walkers. - if(readsDataSource != null && !readsDataSource.hasIndex() ) { - if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. 
The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - Shard.ShardType shardType; if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - shardType = Shard.ShardType.LOCUS; + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. 
You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); } - else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) - shardType = Shard.ShardType.READ; else - throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files"); - - List region; - if(intervals != null) - region = intervals.toList(); - else { - region = new ArrayList(); - for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences()) - region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength())); - } - - return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region); + throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + final int SHARD_SIZE = walker instanceof RodWalker ? 
100000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); } - - ShardStrategy shardStrategy; - ShardStrategyFactory.SHATTER_STRATEGY shardType; - - long SHARD_SIZE = 100000L; - - if (walker instanceof LocusWalker) { - if (walker instanceof RodWalker) SHARD_SIZE *= 1000; - - if (intervals != null && !intervals.isEmpty()) { - if (readsDataSource == null) - throw new IllegalArgumentException("readsDataSource is null"); - if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE,getGenomeLocParser()); - } else if (walker instanceof ReadWalker || - walker instanceof DuplicateWalker) { - shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; - - if (intervals != null && !intervals.isEmpty()) { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - 
shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } - } else if (walker instanceof ReadPairWalker) { - if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } else - throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName()); - - return shardStrategy; } protected boolean flashbackData() { @@ -751,6 +716,8 @@ public class GenomeAnalysisEngine { return new SAMDataSource( samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, genomeLocParser, argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, @@ -763,8 +730,7 @@ public class GenomeAnalysisEngine { getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? 
argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, - argCollection.defaultBaseQualities, - !argCollection.disableLowMemorySharding); + argCollection.defaultBaseQualities); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8078a1ea4..64b63dcd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -194,10 +194,14 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false) - public int numberOfThreads = 1; + /** How many threads should be allocated to this analysis. 
*/ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) + public Integer numberOfThreads = 1; - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false) + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) + public Integer numberOfBAMFileHandles = null; + + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; // -------------------------------------------------------------------------------------------------------------- @@ -292,9 +296,6 @@ public class GATKArgumentCollection { @Hidden public boolean allowIntervalsWithUnindexedBAM = false; - @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false) - public boolean disableLowMemorySharding = false; - // -------------------------------------------------------------------------------------------------------------- // // methods @@ -365,7 +366,11 @@ public class GATKArgumentCollection { (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { return false; } - if (other.numberOfThreads != this.numberOfThreads) { + if (!other.numberOfThreads.equals(this.numberOfThreads)) { + return false; + } + if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) || + (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) { return false; } if (other.intervalMerging != this.intervalMerging) { @@ -389,9 +394,6 @@ public class GATKArgumentCollection { if 
(allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; - if (disableLowMemorySharding != other.disableLowMemorySharding) - return false; - return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java deleted file mode 100644 index de938e845..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 7, 2011 - * Time: 2:46:34 PM - * To change this template use File | Settings | File Templates. - */ -public class BAMBlockStartIterator implements Iterator { - /** - * How large is a BGZF header? - */ - private static int BGZF_HEADER_SIZE = 18; - - /** - * Where within the header does the BLOCKSIZE actually live? - */ - private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2; - - private FileChannel bamInputChannel; - private ByteBuffer headerByteBuffer; - - private long nextLocation = 0; - - public BAMBlockStartIterator(File bamFile) { - try { - FileInputStream bamInputStream = new FileInputStream(bamFile); - bamInputChannel = bamInputStream.getChannel(); - - headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE); - headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - } - catch(IOException ex) { - throw new StingException("Could not open file",ex); - } - } - - public boolean hasNext() { - return nextLocation != -1; - } - - public Long next() { - long currentLocation = nextLocation; - advance(); - return currentLocation; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator"); - } - - private void advance() { - int readStatus; - - headerByteBuffer.clear(); - try { - readStatus = bamInputChannel.read(headerByteBuffer); - } - catch(IOException ex) { - throw new StingException("Could not read header data",ex); - } - - if(readStatus == -1) { - nextLocation = -1; - try { - bamInputChannel.close(); - } - catch(IOException ex) { - throw new StingException("Could not close input file",ex); - } 
- return; - } - - headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION); - int blockSize = headerByteBuffer.getShort(); - - try { - bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1); - nextLocation = bamInputChannel.position(); - } - catch(IOException ex) { - throw new StingException("Could not reposition input stream",ex); - } - } - - public static void main(String argv[]) throws IOException { - BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam")); - int i = 0; - while(blockStartIterator.hasNext()) - System.out.printf("%d -> %d%n",i++,blockStartIterator.next()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java deleted file mode 100644 index 4d91fb45f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKChunk; -import net.sf.samtools.LinearIndex; - -import java.util.*; - -/** - * Represents the contents of a bam index file for one reference. - * A BAM index (.bai) file contains information for all references in the bam file. - * This class describes the data present in the index file for one of these references; - * including the bins, chunks, and linear index. - */ -class BAMIndexContent { - /** - * The reference sequence for the data currently loaded. - */ - private final int mReferenceSequence; - - /** - * A list of all bins in the above reference sequence. - */ - private final BinList mBinList; - - /** - * The linear index for the reference sequence above. - */ - private final LinearIndex mLinearIndex; - - - /** - * @param referenceSequence Content corresponds to this reference. - * @param bins Array of bins represented by this content, possibly sparse - * @param numberOfBins Number of non-null bins - * @param linearIndex Additional index used to optimize queries - */ - BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) { - this.mReferenceSequence = referenceSequence; - this.mBinList = new BinList(bins, numberOfBins); - this.mLinearIndex = linearIndex; - } - - /** - * Reference for this Content - */ - public int getReferenceSequence() { - return mReferenceSequence; - } - - /** - * Does this content have anything in this bin? 
- */ - public boolean containsBin(final GATKBin bin) { - return mBinList.getBin(bin.getBinNumber()) != null; - } - - /** - * @return iterable list of bins represented by this content - */ - public BinList getBins() { - return mBinList; - } - - /** - * @return the number of non-null bins represented by this content - */ - int getNumberOfNonNullBins() { - return mBinList.getNumberOfNonNullBins(); - } - - /** - * @return all chunks associated with all bins in this content - */ - public List getAllChunks() { - List allChunks = new ArrayList(); - for (GATKBin b : mBinList) - if (b.getChunkList() != null) { - allChunks.addAll(Arrays.asList(b.getChunkList())); - } - return Collections.unmodifiableList(allChunks); - } - - /** - * @return the linear index represented by this content - */ - public LinearIndex getLinearIndex() { - return mLinearIndex; - } - - /** - * This class is used to encapsulate the list of Bins store in the BAMIndexContent - * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure - */ - class BinList implements Iterable { - - private final GATKBin[] mBinArray; - public final int numberOfNonNullBins; - public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based - - /** - * @param binArray a sparse array representation of the bins. The index into the array is the bin number. - * @param numberOfNonNullBins - */ - BinList(GATKBin[] binArray, int numberOfNonNullBins) { - this.mBinArray = binArray; - this.numberOfNonNullBins = numberOfNonNullBins; - this.maxBinNumber = mBinArray.length - 1; - } - - GATKBin getBin(int binNumber) { - if (binNumber > maxBinNumber) return null; - return mBinArray[binNumber]; - } - - int getNumberOfNonNullBins() { - return numberOfNonNullBins; - } - - /** - * Gets an iterator over all non-null bins. - * - * @return An iterator over all bins. 
- */ - public Iterator iterator() { - return new BinIterator(); - } - - private class BinIterator implements Iterator { - /** - * Stores the bin # of the Bin currently in use. - */ - private int nextBin; - - public BinIterator() { - nextBin = 0; - } - - /** - * Are there more bins in this set, waiting to be returned? - * - * @return True if more bins are remaining. - */ - public boolean hasNext() { - while (nextBin <= maxBinNumber) { - if (getBin(nextBin) != null) return true; - nextBin++; - } - return false; - } - - /** - * Gets the next bin in the provided BinList. - * - * @return the next available bin in the BinList. - */ - public GATKBin next() { - if (!hasNext()) - throw new NoSuchElementException("This BinIterator is currently empty"); - GATKBin result = getBin(nextBin); - nextBin++; - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a bin iterator"); - } - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java deleted file mode 100644 index 15a372ca6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; - -import java.util.HashMap; -import java.util.Map; - -/** - * Models a bin at which all BAM files in the merged input stream overlap. 
- */ -class BAMOverlap { - public final int start; - public final int stop; - - private final Map bins = new HashMap(); - - public BAMOverlap(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - public void addBin(final SAMReaderID id, final Bin bin) { - bins.put(id,bin); - } - - public Bin getBin(final SAMReaderID id) { - return bins.get(id); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index 521bcd5a3..762722fcd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator { /** * Create a new BAM schedule based on the given index. - * @param indexFiles Index files. + * @param dataSource The SAM data source to use. * @param intervals List of */ - public BAMSchedule(final Map indexFiles, final List intervals) { + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); - referenceSequence = intervals.get(0).getContigIndex(); + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); createScheduleFile(); - readerIDs.addAll(indexFiles.keySet()); + readerIDs.addAll(dataSource.getReaderIDs()); for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = indexFiles.get(reader); + final GATKBAMIndex index = dataSource.getIndex(reader); final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); @@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator { if(selectedIterators.isEmpty()) return; 
+ // Create the target schedule entry BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { PeekableIterator scheduleIterator = scheduleIterators.get(reader); BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); @@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator { scheduleIterator.next(); } + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + nextScheduleEntry = mergedScheduleEntry; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 47eb55b28..dca4cc771 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.*; @@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator { private FilePointer nextFilePointer = null; - private final GenomeLocSortedSet loci; + private 
GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; - private final PeekableIterator locusIterator; + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); + for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { + // Match only on sequence name; trust startup validation to make sure all the sequences match. + if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) + intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); + } + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } - private GenomeLoc currentLocus; + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } - public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } + + + private BAMScheduler(final SAMDataSource dataSource) { this.dataSource = dataSource; - for(SAMReaderID reader: dataSource.getReaderIDs()) - indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. 
+ * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. + nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the fle. + * @return A file pointer over the specified region. 
+ */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(); + Map currentPosition = dataSource.getCurrentPosition(); + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; } public boolean hasNext() { @@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator { if(!hasNext()) throw new NoSuchElementException("No next element available in interval sharder"); FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; advance(); + return currentFilePointer; } @@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator { if(loci.isEmpty()) return; - nextFilePointer = null; while(nextFilePointer == null && currentLocus != null) { // special case handling of the unmapped shard. if(currentLocus == GenomeLoc.UNMAPPED) { nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); currentLocus = null; continue; } @@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator { int coveredRegionStop = Integer.MAX_VALUE; GenomeLoc coveredRegion = null; - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); // No overlapping data at all. if(scheduleEntry != null) { @@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator { } else { // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. 
- //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); for(SAMReaderID reader: indexFiles.keySet()) nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); } @@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator { // Early exit if no bins were found. if(coveredRegion == null) { // for debugging only: maximum split is 16384. - if(currentLocus.size() > 16384) { - GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - } - else { - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - } + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; continue; } // Early exit if only part of the first interval was found. if(currentLocus.startsBefore(coveredRegion)) { - // for debugging only: maximum split is 16384. int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); GenomeLoc[] splitContigs = currentLocus.split(splitPoint); nextFilePointer.addLocation(splitContigs[0]); @@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator { /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM indices. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. 
+ // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. + final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex(); + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { if(bamScheduleIterator != null) bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentLocus.getContigIndex(); + lastReferenceSequenceLoaded = currentContigIndex; // Naive algorithm: find all elements in current contig for proper schedule creation. List lociInContig = new LinkedList(); for(GenomeLoc locus: loci) { - if(locus.getContigIndex() == lastReferenceSequenceLoaded) + if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); } if(!bamScheduleIterator.hasNext()) @@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator { return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; } + /** + * Create a span from the given start point to the end of the file. + * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). + * @return A file span from the given point to the end of the file. 
+ */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { + return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java new file mode 100644 index 000000000..f468d2020 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.LinkedList; +import java.util.Queue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Preloads BGZF blocks in preparation for unzipping and data processing. + * TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should + * TODO: be replaced with a central thread management strategy. + */ +public class BGZFBlockLoadingDispatcher { + /** + * The file handle cache, used when allocating blocks from the dispatcher. + */ + private final FileHandleCache fileHandleCache; + + private final ExecutorService threadPool; + + private final Queue inputQueue; + + public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { + threadPool = Executors.newFixedThreadPool(numThreads); + fileHandleCache = new FileHandleCache(numFileHandles); + inputQueue = new LinkedList(); + + threadPool.execute(new BlockLoader(this,fileHandleCache,true)); + } + + /** + * Initiates a request for a new block load. + * @param readerPosition Position at which to load. + */ + void queueBlockLoad(final SAMReaderPosition readerPosition) { + synchronized(inputQueue) { + inputQueue.add(readerPosition); + inputQueue.notify(); + } + } + + /** + * Claims the next work request from the queue. + * @return The next work request, or null if none is available. 
+ */ + SAMReaderPosition claimNextWorkRequest() { + synchronized(inputQueue) { + while(inputQueue.isEmpty()) { + try { + inputQueue.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item"); + } + } + return inputQueue.poll(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java new file mode 100644 index 000000000..e377f865d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BAMInputStream; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.RuntimeEOFException; +import net.sf.samtools.util.SeekableStream; +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends SeekableStream implements BAMInputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current position. + */ + private SAMReaderPosition position; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. + */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. + */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Has the buffer been filled since last request? 
+ */ + private boolean bufferFilled = false; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.samFile.length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.samFile); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + if(buffer.remaining() > 0) { + // If there's data in the buffer, figure out from whence it came. + final long blockAddress = blockPositions.size() > 0 ? 
blockPositions.get(0) : 0; + final int blockOffset = buffer.position(); + filePointer = blockAddress << 16 | blockOffset; + } + else { + // Otherwise, find the next position to load. + filePointer = position.getBlockAddress() << 16; + } + } + + if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) + throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", + BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer), + BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()))); + + return filePointer; + } + + public void seek(long target) { + // TODO: Validate the seek point. + //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); + synchronized(lock) { + clearBuffers(); + position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target)); + waitForBufferFill(); + buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target)); + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(target); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + } + + private void clearBuffers() { + this.position.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. + buffer.clear(); + buffer.limit(0); + + blockOffsets.clear(); + blockPositions.clear(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. 
+ return position != null && position.getBlockAddress() >= length; + } + } + + public void setCheckCrcs(final boolean check) { + // TODO: Implement + } + + /** + * Submits a new access plan for the given dataset. + * @param position The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final SAMReaderPosition position) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + synchronized(lock) { + // Assume that the access plan is going to tell us to start where we are and move forward. + // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. + if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) + position.advancePosition(this.position.getBlockAddress()); + } + this.position = position; + } + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first block in the series. + while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) { + bytesToRemove += blockOffsets.remove(); + blockPositions.remove(); + } + + // If we end up with an empty block at the end of the series, compact this as well. + if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + buffer.position(bytesToRemove); + buffer.compact(); + + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. 
+ * @param position target position for the data. + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + synchronized(lock) { + try { + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Advance the position to take the most recent read into account. + long lastReadPosition = position.getBlockAddress(); + + byte[] validBytes = null; + if(validatingInputStream != null) { + validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(lastReadPosition << 16); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + this.position = position; + position.advancePosition(filePosition); + + if(buffer.remaining() < incomingBuffer.remaining()) { + //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + lock.wait(); + //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); + } + + // Queue list of block offsets / block positions. + blockOffsets.add(buffer.position()); + blockPositions.add(lastReadPosition); + + buffer.put(incomingBuffer); + + // Set up the buffer for reading. 
+ buffer.flip(); + bufferFilled = true; + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + read(singleByte); + return singleByte[0]; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available. + if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. 
+ if(length-remaining > 0) + lock.notify(); + } + + if(validatingInputStream != null) { + byte[] validBytes = new byte[length]; + try { + validatingInputStream.read(validBytes,offset,length); + for(int i = offset; i < offset+length; i++) { + if(bytes[i] != validBytes[i]) { + System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId()); + throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); + } + } + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + return length - remaining; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + bufferFilled = false; + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(position); + try { + lock.wait(); + } + catch(InterruptedException ex) { + // TODO: handle me. 
+ throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); + } + + if(bufferFilled && buffer.remaining() == 0) + throw new RuntimeEOFException("No more data left in InputStream"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java new file mode 100644 index 000000000..ab4299802 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +/** + * An engine for loading blocks. + */ +class BlockLoader implements Runnable { + /** + * Coordinates the input queue. + */ + private BGZFBlockLoadingDispatcher dispatcher; + + /** + * A cache from which to retrieve open file handles. + */ + private final FileHandleCache fileHandleCache; + + /** + * Whether asynchronous decompression should happen. + */ + private final boolean decompress; + + /** + * An direct input buffer for incoming data from disk. + */ + private final ByteBuffer inputBuffer; + + public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) { + this.dispatcher = dispatcher; + this.fileHandleCache = fileHandleCache; + this.decompress = decompress; + + this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + inputBuffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public void run() { + for(;;) { + SAMReaderPosition readerPosition = null; + try { + readerPosition = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + + long blockAddress = readerPosition.getBlockAddress(); + //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); + + ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + long nextBlockAddress = position(inputStream); + 
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + + ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock; + int bytesCopied = block.remaining(); + + BlockInputStream bamInputStream = readerPosition.getInputStream(); + bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + + //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); + } + catch(Throwable error) { + if(readerPosition != null && readerPosition.getInputStream() != null) + readerPosition.getInputStream().reportException(error); + } + } + + } + + private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException { + FileChannel channel = inputStream.getChannel(); + + // Read the block header + channel.position(blockAddress); + + int uncompressedDataSize = 0; + int bufferSize = 0; + + do { + inputBuffer.clear(); + inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. + inputBuffer.flip(); + if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) + throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass."); + + // Verify that the file was read at a valid point. 
+ if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 || + unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 || + unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG || + unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN || + unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 || + unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) { + throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position"); + } + + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET); + bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1; + + // Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block. + inputBuffer.limit(bufferSize); + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block. + uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4); + //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize); + } + while(uncompressedDataSize == 0 && channel.position() < channel.size()); + + // Prepare the buffer for reading. + inputBuffer.flip(); + + return inputBuffer; + } + + private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException { + final int compressedBufferSize = bgzfBlock.remaining(); + + // Determine the uncompressed buffer size ( + bgzfBlock.position(bgzfBlock.limit()-4); + int uncompressedBufferSize = bgzfBlock.getInt(); + byte[] uncompressedContent = new byte[uncompressedBufferSize]; + + // Bound the CDATA section of the buffer. 
+ bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH); + bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + byte[] compressedContent = new byte[bgzfBlock.remaining()]; + ByteBuffer.wrap(compressedContent).put(bgzfBlock); + + // Decompress the buffer. + final Inflater inflater = new Inflater(true); + inflater.setInput(compressedContent); + int bytesUncompressed = inflater.inflate(uncompressedContent); + if(bytesUncompressed != uncompressedBufferSize) + throw new ReviewedStingException("Error decompressing block"); + + return ByteBuffer.wrap(uncompressedContent); + } + + private long position(final FileInputStream inputStream) throws IOException { + return inputStream.getChannel().position(); + } + + private int unpackUByte8(final ByteBuffer buffer,final int position) { + return buffer.get(position) & 0xFF; + } + + private int unpackUInt16(final ByteBuffer buffer,final int position) { + // Read out the size of the full BGZF block into a two bit short container, then 'or' that + // value into an int buffer to transfer the bitwise contents into an int. 
+ return buffer.getShort(position) & 0xFFFF; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java new file mode 100644 index 000000000..29de6eb37 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once. + */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The ke + * @return A file input stream from the cache, if available, or otherwise newly opened. + */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. 
+ if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return a cached stream, if available. Otherwise, + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. 
+ * @return + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. 
+ */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index e4141f61c..df7827250 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.SAMFileSpan; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -40,28 +41,25 @@ import java.util.*; */ public class FilePointer { protected final SortedMap fileSpans = new TreeMap(); - protected final BAMOverlap overlap; - protected final List locations; + protected final List locations = new ArrayList(); /** * Does this file pointer point into an unmapped region? 
*/ protected final boolean isRegionUnmapped; - public FilePointer() { - this((BAMOverlap)null); - } - - public FilePointer(final GenomeLoc location) { - this.overlap = null; - this.locations = Collections.singletonList(location); - this.isRegionUnmapped = GenomeLoc.isUnmapped(location); - } - - public FilePointer(final BAMOverlap overlap) { - this.overlap = overlap; - this.locations = new ArrayList(); - this.isRegionUnmapped = false; + public FilePointer(final GenomeLoc... locations) { + this.locations.addAll(Arrays.asList(locations)); + boolean foundMapped = false, foundUnmapped = false; + for(GenomeLoc location: locations) { + if(GenomeLoc.isUnmapped(location)) + foundUnmapped = true; + else + foundMapped = true; + } + if(foundMapped && foundUnmapped) + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + this.isRegionUnmapped = foundUnmapped; } /** @@ -217,4 +215,20 @@ public class FilePointer { fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); combined.addFileSpans(initialElement.getKey(),fileSpan); } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FilePointer:%n"); + builder.append("\tlocations = {"); + builder.append(Utils.join(";",locations)); + builder.append("}%n\tregions = %n"); + for(Map.Entry entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 4ddf28dce..f78693c27 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -25,419 +25,58 @@ package 
org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.AbstractBAMFileIndex; -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.*; +import java.util.Iterator; /** - * Shard intervals based on position within the BAM file. - * - * @author mhanna - * @version 0.1 + * Handles the process of aggregating BAM intervals into individual shards. + * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder. */ -public class IntervalSharder { - private static Logger logger = Logger.getLogger(IntervalSharder.class); +public class IntervalSharder implements Iterator { + /** + * The iterator actually laying out the data for BAM scheduling. + */ + private final PeekableIterator wrappedIterator; - public static Iterator shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - return new IntervalSharder.FilePointerIterator(dataSource,loci); + /** + * The parser, for interval manipulation. 
+ */ + private final GenomeLocParser parser; + + public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); + } + + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + } + + public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser()); + } + + private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) { + wrappedIterator = new PeekableIterator(scheduler); + this.parser = parser; + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); } /** - * A lazy-loading iterator over file pointers. + * Accumulate shards where there's no additional cost to processing the next shard in the sequence. + * @return The next file pointer to process. 
*/ - private static class FilePointerIterator implements Iterator { - final SAMDataSource dataSource; - final GenomeLocSortedSet loci; - final PeekableIterator locusIterator; - final Queue cachedFilePointers = new LinkedList(); - - public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - this.dataSource = dataSource; - this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - advance(); - } - - public boolean hasNext() { - return !cachedFilePointers.isEmpty(); - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("FilePointerIterator iteration is complete"); - FilePointer filePointer = cachedFilePointers.remove(); - if(cachedFilePointers.isEmpty()) - advance(); - return filePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator"); - } - - private void advance() { - GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); - String contig = null; - - // If the next section of the BAM to be processed is unmapped, handle this region separately. 
- while(locusIterator.hasNext() && nextBatch.isEmpty()) { - contig = null; - while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) { - GenomeLoc nextLocus = locusIterator.next(); - contig = nextLocus.getContig(); - nextBatch.add(nextLocus); - } - } - - if(nextBatch.size() > 0) { - cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); - } - } + public FilePointer next() { + FilePointer current = wrappedIterator.next(); + while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); + return current; } - /** - * Merge / split intervals based on an awareness of the structure of the BAM file. - * @param dataSource - * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads. - * @param loci - * @return - */ - private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { - // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs. - if(contig == null) { - FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - filePointer.addFileSpans(id,null); - return Collections.singletonList(filePointer); - } - - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. 
- List filePointers = new ArrayList(); - FilePointer lastFilePointer = null; - BAMOverlap lastBAMOverlap = null; - - Map readerToIndexMap = new HashMap(); - IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator(); - for(SAMReaderID id: dataSource.getReaderIDs()) { - final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); - // If this contig can't be found in the reference, skip over it. - if(referenceSequence == null && contig != null) - continue; - final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id); - binMerger.addReader(id, - index, - referenceSequence.getSequenceIndex(), - index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator()); - // Cache the reader for later data lookup. - readerToIndexMap.put(id,index); - } - - PeekableIterator binIterator = new PeekableIterator(binMerger); - - for(GenomeLoc location: loci) { - if(!location.getContig().equals(contig)) - throw new ReviewedStingException("Location outside bounds of contig"); - - if(!binIterator.hasNext()) - break; - - int locationStart = location.getStart(); - final int locationStop = location.getStop(); - - // Advance to first bin. - while(binIterator.peek().stop < locationStart) - binIterator.next(); - - // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make - // sure the extending bin is not pruned from the list. - List bamOverlaps = new ArrayList(); - while(binIterator.hasNext() && binIterator.peek().stop <= locationStop) - bamOverlaps.add(binIterator.next()); - if(binIterator.hasNext() && binIterator.peek().start <= locationStop) - bamOverlaps.add(binIterator.peek()); - - // Bins found; try to match bins with locations. - Iterator bamOverlapIterator = bamOverlaps.iterator(); - - while(locationStop >= locationStart) { - int binStart = lastFilePointer!=null ? 
lastFilePointer.overlap.start : 0; - int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0; - - while(binStop < locationStart && bamOverlapIterator.hasNext()) { - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - lastBAMOverlap = bamOverlapIterator.next(); - lastFilePointer = new FilePointer(lastBAMOverlap); - binStart = lastFilePointer.overlap.start; - binStop = lastFilePointer.overlap.stop; - } - - if(locationStart < binStart) { - // The region starts before the first bin in the sequence. Add the region occurring before the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - final int regionStop = Math.min(locationStop,binStart-1); - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop); - lastFilePointer = new FilePointer(subset); - - locationStart = regionStop + 1; - } - else if(locationStart > binStop) { - // The region starts after the last bin in the sequence. Add the region occurring after the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.add(new FilePointer(subset)); - - locationStart = locationStop + 1; - } - else { - if(lastFilePointer == null) - throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer."); - - // The start of the region overlaps the bin. Add the overlapping subset. 
- final int regionStop = Math.min(locationStop,binStop); - lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop)); - locationStart = regionStop + 1; - } - } - } - - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - // Lookup the locations for every file pointer in the index. - for(SAMReaderID id: readerToIndexMap.keySet()) { - BrowseableBAMIndex index = readerToIndexMap.get(id); - for(FilePointer filePointer: filePointers) - filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id))); - } - - return filePointers; - } - - private static class BinMergingIterator implements Iterator { - private PriorityQueue binQueue = new PriorityQueue(); - private Queue pendingOverlaps = new LinkedList(); - - public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator bins) { - binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins))); - } - - public boolean hasNext() { - return pendingOverlaps.size() > 0 || !binQueue.isEmpty(); - } - - public BAMOverlap next() { - if(!hasNext()) - throw new NoSuchElementException("No elements left in merging iterator"); - if(pendingOverlaps.isEmpty()) - advance(); - return pendingOverlaps.remove(); - } - - public void advance() { - List bins = new ArrayList(); - int boundsStart, boundsStop; - - // Prime the pump - if(binQueue.isEmpty()) - return; - bins.add(getNextBin()); - boundsStart = bins.get(0).getStart(); - boundsStop = bins.get(0).getStop(); - - // Accumulate all the bins that overlap the current bin, in sorted order. 
- while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) { - ReaderBin bin = getNextBin(); - bins.add(bin); - boundsStart = Math.min(boundsStart,bin.getStart()); - boundsStop = Math.max(boundsStop,bin.getStop()); - } - - List> range = new ArrayList>(); - int start = bins.get(0).getStart(); - int stop = bins.get(0).getStop(); - while(start <= boundsStop) { - // Find the next stopping point. - for(ReaderBin bin: bins) { - stop = Math.min(stop,bin.getStop()); - if(start < bin.getStart()) - stop = Math.min(stop,bin.getStart()-1); - } - - range.add(new Pair(start,stop)); - // If the last entry added included the last element, stop. - if(stop >= boundsStop) - break; - - // Find the next start. - start = stop + 1; - for(ReaderBin bin: bins) { - if(start >= bin.getStart() && start <= bin.getStop()) - break; - else if(start < bin.getStart()) { - start = bin.getStart(); - break; - } - } - } - - // Add the next series of BAM overlaps to the window. - for(Pair window: range) { - BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second); - for(ReaderBin bin: bins) - bamOverlap.addBin(bin.id,bin.bin); - pendingOverlaps.add(bamOverlap); - } - } - - public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } - - private ReaderBin peekNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.peek(); - return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin()); - } - - private ReaderBin getNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.remove(); - ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin()); - if(current.hasNextBin()) - binQueue.add(current); - return readerBin; - } - - } - - /** - * Filters out bins not at 
the lowest level in the tree. - */ - private static class LowestLevelBinFilteringIterator implements Iterator { - private BrowseableBAMIndex index; - private Iterator wrappedIterator; - - private Bin nextBin; - - public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator iterator) { - this.index = index; - this.wrappedIterator = iterator; - advance(); - } - - public boolean hasNext() { - return nextBin != null; - } - - public Bin next() { - Bin bin = nextBin; - advance(); - return bin; - } - - public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); } - - private void advance() { - nextBin = null; - while(wrappedIterator.hasNext() && nextBin == null) { - Bin bin = wrappedIterator.next(); - if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1) - nextBin = bin; - } - } - } + public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } } - -class BinQueueState implements Comparable { - private final SAMReaderID id; - private final BrowseableBAMIndex index; - private final int referenceSequence; - private final PeekableIterator bins; - - private int firstLocusInCurrentBin; - private int lastLocusInCurrentBin; - - public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator bins) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bins = new PeekableIterator(bins); - refreshLocusInBinCache(); - } - - public SAMReaderID getReaderID() { - return id; - } - - public BrowseableBAMIndex getIndex() { - return index; - } - - public int getReferenceSequence() { - return referenceSequence; - } - - public boolean hasNextBin() { - return bins.hasNext(); - } - - public Bin peekNextBin() { - return bins.peek(); - } - - public Bin nextBin() { - Bin nextBin = bins.next(); - refreshLocusInBinCache(); - return nextBin; - } - - public int 
compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) { - if(!this.bins.hasNext() && !other.bins.hasNext()) return 0; - if(!this.bins.hasNext()) return -1; - if(!this.bins.hasNext()) return 1; - - // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid. - if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 || - other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) { - throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid."); - } - - // Straight integer subtraction works here because lhsStart, rhsStart always positive. - if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin) - return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin; - - // Straight integer subtraction works here because lhsStop, rhsStop always positive. - return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin; - } - - private void refreshLocusInBinCache() { - firstLocusInCurrentBin = -1; - lastLocusInCurrentBin = -1; - if(bins.hasNext()) { - Bin bin = bins.peek(); - firstLocusInCurrentBin = index.getFirstLocusInBin(bin); - lastLocusInCurrentBin = index.getLastLocusInBin(bin); - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java new file mode 100644 index 000000000..585b63457 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + 
* copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import java.util.Iterator; + +/** + * Batch granular file pointers into potentially larger shards. + */ +public class LocusShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. 
+ */ + public Iterator iterator() { + return new Iterator() { + public boolean hasNext() { + return filePointers.hasNext(); + } + + public Shard next() { + FilePointer current = filePointers.next(); + while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0) + current = current.combine(parser,filePointers.next()); + return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + }; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java deleted file mode 100755 index a5ca07853..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileSpan; -import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * A sharding strategy for loci based on reading of the index. - */ -public class LocusShardStrategy implements ShardStrategy { - /** - * The data source to use when performing this sharding. - */ - private final SAMDataSource reads; - - /** - * the parser for creating shards - */ - private GenomeLocParser genomeLocParser; - - /** - * An iterator through the available file pointers. - */ - private final Iterator filePointerIterator; - - /** - * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs - * @param reads Data source from which to load index data. - * @param locations List of locations for which to load data. - */ - public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) { - this.reads = reads; - this.genomeLocParser = genomeLocParser; - - if(!reads.isEmpty()) { - GenomeLocSortedSet intervals; - if(locations == null) { - // If no locations were passed in, shard the entire BAM file. 
- SAMFileHeader header = reads.getHeader(); - intervals = new GenomeLocSortedSet(genomeLocParser); - - for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) { - // Check this sequence against the reference sequence dictionary. - // TODO: Do a better job of merging reads + reference. - SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName()); - if(refSequenceRecord != null) { - final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength()); - intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length)); - } - } - } - else - intervals = locations; - - if(reads.isLowMemoryShardingEnabled()) { - /* - Iterator filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - List filePointers = new ArrayList(); - while(filePointerIterator.hasNext()) - filePointers.add(filePointerIterator.next()); - this.filePointerIterator = filePointers.iterator(); - */ - this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - } - else - this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); - } - else { - final int maxShardSize = 100000; - List filePointers = new ArrayList(); - if(locations == null) { - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop))); - } - } - } - else { - for(GenomeLoc interval: locations) { - while(interval.size() > maxShardSize) { - filePointers.add(new 
FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1))); - interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - filePointers.add(new FilePointer(interval)); - } - } - filePointerIterator = filePointers.iterator(); - } - - } - - /** - * returns true if there are additional shards - * - * @return false if we're done processing shards - */ - public boolean hasNext() { - return filePointerIterator.hasNext(); - } - - public long shardNumber = 0; - - /** - * gets the next Shard - * - * @return the next shard - */ - public LocusShard next() { - FilePointer nextFilePointer = filePointerIterator.next(); - Map fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null; - - /* - System.out.printf("Shard %d: interval = {",++shardNumber); - for(GenomeLoc locus: nextFilePointer.locations) - System.out.printf("%s;",locus); - System.out.printf("}; "); - - if(fileSpansBounding == null) - System.out.printf("no shard data%n"); - else { - SortedMap sortedSpans = new TreeMap(fileSpansBounding); - for(Map.Entry entry: sortedSpans.entrySet()) { - System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue()); - } - } - */ - - return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding); - } - - /** we don't support the remove command */ - public void remove() { - throw new UnsupportedOperationException("ShardStrategies don't support remove()"); - } - - /** - * makes the IntervalShard iterable, i.e. usable in a for loop. 
- * - * @return - */ - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java deleted file mode 100644 index bf5f33dc3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.Iterator; - -/** - * Handles the process of aggregating BAM intervals into individual shards. 
- */ -public class LowMemoryIntervalSharder implements Iterator { - /** - * The iterator actually laying out the data for BAM scheduling. - */ - private final PeekableIterator wrappedIterator; - - /** - * The parser, for interval manipulation. - */ - private final GenomeLocParser parser; - - public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); - parser = loci.getGenomeLocParser(); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Accumulate shards where there's no additional cost to processing the next shard in the sequence. - * @return The next file pointer to process. - */ - public FilePointer next() { - FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); - return current; - } - - public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java deleted file mode 100644 index 278eeb898..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.List; - -/** - * A single, monolithic shard bridging all available data. 
- * @author mhanna - * @version 0.1 - */ -public class MonolithicShard extends Shard { - /** - * Creates a new monolithic shard of the given type. - * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic. - * @param locs Intervals that this monolithic shard should process. - */ - public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List locs) { - super(parser, shardType, locs, readsDataSource, null, false); - if(shardType != ShardType.LOCUS && shardType != ShardType.READ) - throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType); - } - - /** - * String representation of this shard. - * @return "entire genome". - */ - @Override - public String toString() { - return "entire genome"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java deleted file mode 100644 index 28b737f28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Create a giant shard representing all the data in the input BAM(s). - * - * @author mhanna - * @version 0.1 - */ -public class MonolithicShardStrategy implements ShardStrategy { - /** - * The single shard associated with this sharding strategy. - */ - private MonolithicShard shard; - - /** - * Create a new shard strategy for shards of the given type. - * @param shardType The shard type. 
- */ - public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List region) { - shard = new MonolithicShard(parser,readsDataSource,shardType,region); - } - - /** - * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator; - * will only return another copy of the active iterator. - * @return A copy of this. - */ - public Iterator iterator() { - return this; - } - - /** - * Returns true if the monolithic shard has not yet been consumed, or false otherwise. - * @return True if shard has been consumed, false otherwise. - */ - public boolean hasNext() { - return shard != null; - } - - /** - * Returns the monolithic shard if it has not already been retrieved. - * @return The monolithic shard. - * @throws NoSuchElementException if no such data exists. - */ - public Shard next() { - if(shard == null) - throw new NoSuchElementException("Monolithic shard has already been retrived."); - - Shard working = shard; - shard = null; - return working; - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a shard strategy"); - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. 
- * @param size adjust the next size to this - */ - public void adjustNextShardSize( long size ) { - throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard."); - } - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 4d9c9092d..5f40c0ea5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -35,10 +35,15 @@ import java.util.Map; * @version 0.1 */ public class ReadShard extends Shard { + /** + * What is the maximum number of reads which should go into a read shard. + */ + public static final int MAX_READS = 10000; + /** * The reads making up this shard. */ - private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS); + private final Collection reads = new ArrayList(MAX_READS); public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); @@ -66,7 +71,7 @@ public class ReadShard extends Shard { * @return True if this shard's buffer is full (and the shard can buffer reads). 
*/ public boolean isBufferFull() { - return reads.size() > ReadShardStrategy.MAX_READS; + return reads.size() > ReadShard.MAX_READS; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java new file mode 100644 index 000000000..fa8a7d454 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + */ +public class ReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + if(!fileSpan.isEmpty()) + selectedReaders.put(id,fileSpan); + } + + if(selectedReaders.size() > 0) { + Shard shard = new 
ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java deleted file mode 100755 index 5ea75dbb0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.SAMFileSpan; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.*; - -/** - * The sharding strategy for reads using a simple counting mechanism. Each read shard - * has a specific number of reads (default to 10K) which is configured in the constructor. - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - */ -public class ReadShardStrategy implements ShardStrategy { - /** - * What is the maximum number of reads which should go into a read shard. - */ - protected static final int MAX_READS = 10000; - - /** - * The data source used to shard. - */ - private final SAMDataSource dataSource; - - /** - * The intervals to be processed. - */ - private final GenomeLocSortedSet locations; - - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** our storage of the genomic locations they'd like to shard over */ - private final List filePointers = new ArrayList(); - - /** - * Iterator over the list of file pointers. - */ - private final Iterator filePointerIterator; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentFilePointer; - - /** - * Ending position of the last shard in the file. - */ - private Map position; - - /** - * An indicator whether the strategy has sharded into the unmapped region. - */ - private boolean isIntoUnmappedRegion = false; - - private final GenomeLocParser parser; - - /** - * Create a new read shard strategy, loading read shards from the given BAM file. - * @param dataSource Data source from which to load shards. - * @param locations intervals to use for sharding. 
- */ - public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) { - this.dataSource = dataSource; - this.parser = parser; - this.position = this.dataSource.getCurrentPosition(); - this.locations = locations; - - if(locations != null) - filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations); - else - filePointerIterator = filePointers.iterator(); - - if(filePointerIterator.hasNext()) - currentFilePointer = filePointerIterator.next(); - - advance(); - } - - /** - * do we have another read shard? - * @return True if any more data is available. False otherwise. - */ - public boolean hasNext() { - return nextShard != null; - } - - /** - * Retrieves the next shard, if available. - * @return The next shard, if available. - * @throws java.util.NoSuchElementException if no such shard is available. - */ - public Shard next() { - if(!hasNext()) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - public void advance() { - Map shardPosition = new HashMap(); - nextShard = null; - - if(locations != null) { - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; - - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); - } - - if(selectedReaders.size() > 0) { - Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - dataSource.fillShard(shard); - - if(!shard.isBufferEmpty()) { - nextShard = shard; - break; - } - } - - selectedReaders.clear(); - currentFilePointer = filePointerIterator.hasNext() ? 
filePointerIterator.next() : null; - } - } - else { - // todo -- this nulling of intervals is a bit annoying since readwalkers without - // todo -- any -L values need to be special cased throughout the code. - Shard shard = new ReadShard(parser,dataSource,position,null,false); - dataSource.fillShard(shard); - nextShard = !shard.isBufferEmpty() ? shard : null; - } - - this.position = dataSource.getCurrentPosition(); - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - - /** - * Convenience method for using ShardStrategy in an foreach loop. - * @return A iterator over shards. - */ - public Iterator iterator() { - return this; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java deleted file mode 100644 index c76c1d8ae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 2, 2011 - * Time: 4:36:40 PM - * To change this template use File | Settings | File Templates. 
- */ -class ReaderBin { - public final SAMReaderID id; - public final BrowseableBAMIndex index; - public final int referenceSequence; - public final Bin bin; - - public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bin = bin; - } - - public int getStart() { - return index.getFirstLocusInBin(bin); - } - - public int getStop() { - return index.getLastLocusInBin(bin); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 8452aadfd..0a1eb0563 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -37,8 +37,10 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -71,7 +73,7 @@ public class SAMDataSource { /** * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. */ - private final GenomeLocParser genomeLocParser; + protected final GenomeLocParser genomeLocParser; /** * Identifiers for the readers driving this data source. @@ -91,13 +93,18 @@ public class SAMDataSource { /** * How far along is each reader? 
*/ - private final Map readerPositions = new HashMap(); + private final Map readerPositions = new HashMap(); /** * The merged header. */ private final SAMFileHeader mergedHeader; + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + /** * The sort order of the BAM files. Files without a sort order tag are assumed to be * in coordinate order. @@ -131,17 +138,24 @@ public class SAMDataSource { private final SAMResourcePool resourcePool; /** - * Whether to enable the new low-memory sharding mechanism. + * Asynchronously loads BGZF blocks. */ - private boolean enableLowMemorySharding = false; + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. */ - public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocParser) { + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, false, SAMFileReader.ValidationStringency.STRICT, @@ -150,8 +164,7 @@ public class SAMDataSource { new ValidationExclusion(), new ArrayList(), false, - false, - true); + false); } /** @@ -159,6 +172,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -167,9 +182,10 @@ public class SAMDataSource { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, - boolean enableLowMemorySharding) { + boolean generateExtendedEvents) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, 
useOriginalBaseQualities, strictness, @@ -182,8 +198,7 @@ public class SAMDataSource { BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ - (byte) -1, - enableLowMemorySharding); + (byte) -1); } /** @@ -205,6 +220,8 @@ public class SAMDataSource { */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -217,13 +234,19 @@ public class SAMDataSource { BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, - byte defaultBaseQualities, - boolean enableLowMemorySharding) { - this.enableLowMemorySharding(enableLowMemorySharding); + byte defaultBaseQualities) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + else + dispatcher = null; + validationStringency = strictness; for (SAMReaderID readerID : samFiles) { if (!readerID.samFile.canRead()) @@ -235,10 +258,13 @@ public class SAMDataSource { SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. - for(SAMFileReader reader: readers.values()) { + for(SAMReaderID readerID: readerIDs) { // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); + headers.put(readerID,header); + if ( header.getReadGroups().isEmpty() ) { throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, "SAM file doesn't have any read groups defined in the header. 
The GATK no longer supports SAM files without read groups"); @@ -275,7 +301,7 @@ public class SAMDataSource { qmode, refReader, defaultBaseQualities); - + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -296,12 +322,10 @@ public class SAMDataSource { originalToMergedReadGroupMappings.put(id,mappingToMerged); } - if(enableLowMemorySharding) { - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); } resourcePool.releaseReaders(readers); @@ -314,22 +338,6 @@ public class SAMDataSource { */ public ReadProperties getReadsInfo() { return readProperties; } - /** - * Enable experimental low-memory sharding. - * @param enable True to enable sharding. False otherwise. - */ - public void enableLowMemorySharding(final boolean enable) { - enableLowMemorySharding = enable; - } - - /** - * Returns whether low-memory sharding is enabled. - * @return True if enabled, false otherwise. - */ - public boolean isLowMemoryShardingEnabled() { - return enableLowMemorySharding; - } - /** * Checks to see whether any reads files are supplying data. * @return True if no reads files are supplying data to the traversal; false otherwise. @@ -368,7 +376,7 @@ public class SAMDataSource { * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. 
*/ - public Map getCurrentPosition() { + public Map getCurrentPosition() { return readerPositions; } @@ -381,7 +389,7 @@ public class SAMDataSource { } public SAMFileHeader getHeader(SAMReaderID id) { - return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); + return headers.get(id); } /** @@ -404,45 +412,21 @@ public class SAMDataSource { return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); } - /** - * No read group collisions at this time because only one SAM file is currently supported. - * @return False always. - */ - public boolean hasReadGroupCollisions() { - return hasReadGroupCollisions; - } - /** * True if all readers have an index. * @return True if all readers have an index. */ public boolean hasIndex() { - if(enableLowMemorySharding) - return readerIDs.size() == bamIndices.size(); - else { - for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { - if(!reader.hasIndex()) - return false; - } - return true; - } + return readerIDs.size() == bamIndices.size(); } /** * Gets the index for a particular reader. Always preloaded. - * TODO: Should return object of type GATKBAMIndex, but cannot because there - * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new - * TODO: sharding system goes live. * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ - public Object getIndex(final SAMReaderID id) { - if(enableLowMemorySharding) - return bamIndices.get(id); - else { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - return readers.getReader(id).getBrowseableIndex(); - } + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); } /** @@ -454,7 +438,7 @@ public class SAMDataSource { } /** - * Gets the cumulative read metrics for shards already processed. + * Gets the cumulative read metrics for shards already processed. * @return Cumulative read metrics. 
*/ public ReadMetrics getCumulativeReadMetrics() { @@ -507,10 +491,6 @@ public class SAMDataSource { } public StingSAMIterator seek(Shard shard) { - // todo: refresh monolithic sharding implementation - if(shard instanceof MonolithicShard) - return seekMonolithic(shard); - if(shard.buffersReads()) { return shard.iterator(); } @@ -540,7 +520,7 @@ public class SAMDataSource { */ private void initializeReaderPositions(SAMReaders readers) { for(SAMReaderID id: getReaderIDs()) - readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads()); + readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); } /** @@ -548,7 +528,6 @@ public class SAMDataSource { * @param readers Readers from which to load data. * @param shard The shard specifying the data limits. * @param enableVerification True to verify. For compatibility with old sharding strategy. - * TODO: Collapse this flag when the two sharding systems are merged. * @return An iterator over the selected data. */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { @@ -559,14 +538,20 @@ public class SAMDataSource { for(SAMReaderID id: getReaderIDs()) { CloseableIterator iterator = null; - if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null) - continue; - iterator = shard.getFileSpans().get(id) != null ? - readers.getReader(id).iterator(shard.getFileSpans().get(id)) : - readers.getReader(id).queryUnmapped(); + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. 
+ if(shard.getFileSpans().get(id) == null) + throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + } + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(readProperties.getReadBufferSize() != null) iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); - if(shard.getGenomeLocs() != null) + if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); mergingIterator.addIterator(readers.getReader(id),iterator); } @@ -584,33 +569,6 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } - /** - * A stopgap measure to handle monolithic sharding - * @param shard the (monolithic) shard. - * @return An iterator over the monolithic shard. - */ - private StingSAMIterator seekMonolithic(Shard shard) { - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. 
- SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); - for(SAMReaderID id: getReaderIDs()) - mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator()); - - return applyDecoratingIterators(shard.getReadMetrics(), - shard instanceof ReadShard, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getDownsamplingMethod().toFraction, - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.defaultBaseQualities()); - } - /** * Adds this read to the given shard. * @param shard The shard to which to add the read. @@ -618,7 +576,7 @@ public class SAMDataSource { * @param read The read to add to the shard. */ private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing(); + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); shard.addRead(read); readerPositions.put(id,endChunk); } @@ -689,19 +647,6 @@ public class SAMDataSource { this.maxEntries = maxEntries; } - /** - * Dangerous internal method; retrieves any set of readers, whether in iteration or not. - * Used to handle non-exclusive, stateless operations, such as index queries. - * @return Any collection of SAMReaders, whether in iteration or not. 
- */ - protected SAMReaders getReadersWithoutLocking() { - synchronized(this) { - if(allResources.size() == 0) - createNewResource(); - } - return allResources.get(0); - } - /** * Choose a set of readers from the pool to use for this query. When complete, * @return @@ -753,6 +698,11 @@ public class SAMDataSource { */ private final Map readers = new LinkedHashMap(); + /** + * The inptu streams backing + */ + private final Map inputStreams = new LinkedHashMap(); + /** * Derive a new set of readers from the Reads metadata. * @param readerIDs reads to load. @@ -760,12 +710,20 @@ public class SAMDataSource { */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = new SAMFileReader(readerID.samFile); + File indexFile = findIndexFile(readerID.samFile); + + SAMFileReader reader = null; + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + inputStreams.put(readerID,blockInputStream); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); - reader.enableIndexMemoryMapping(false); - if(!enableLowMemorySharding) - reader.enableIndexCaching(true); reader.setValidationStringency(validationStringency); final SAMFileHeader header = reader.getFileHeader(); @@ -786,6 +744,15 @@ public class SAMDataSource { return readers.get(id); } + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + /** * Searches for the reader id of this reader. * @param reader Reader for which to search. 
@@ -883,7 +850,7 @@ public class SAMDataSource { * Filters out reads that do not overlap the current GenomeLoc. * Note the custom implementation: BAM index querying returns all reads that could * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. + * down performance, this implementation is highly customized to its task. */ private class IntervalOverlapFilteringIterator implements CloseableIterator { /** @@ -903,7 +870,7 @@ public class SAMDataSource { /** * Custom representation of interval bounds. - * Makes it simpler to track current position. + * Makes it simpler to track current position. */ private int[] intervalContigIndices; private int[] intervalStarts; @@ -941,7 +908,7 @@ public class SAMDataSource { i++; } } - + advance(); } @@ -1070,6 +1037,40 @@ public class SAMDataSource { return indexFile; } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. + */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any + * read that has been assigned + * @return + */ + public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. 
+ * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. + */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); + return shardBalancer; + } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java new file mode 100644 index 000000000..f9f6539a7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; + +/** +* Created by IntelliJ IDEA. +* User: mhanna +* Date: 10/14/11 +* Time: 10:47 PM +* To change this template use File | Settings | File Templates. +*/ +class SAMReaderPosition { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state. 
+ */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = filePosition; + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + try { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + positionIterator.next(); + // Check to see if the iterator has more data available. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) { + nextBlockAddress = positionIterator.peek().getBlockStart(); + break; + } + } + } + catch(Exception ex) { + throw new ReviewedStingException(""); + } + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java new file mode 100644 index 000000000..962208086 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Iterator; + +/** + * Balances maximally granular file pointers into shards of 
reasonable size. + */ +public abstract class ShardBalancer implements Iterable { + protected SAMDataSource readsDataSource; + protected PeekableIterator filePointers; + protected GenomeLocParser parser; + + public void initialize(final SAMDataSource readsDataSource, final Iterator filePointers, final GenomeLocParser parser) { + this.readsDataSource = readsDataSource; + this.filePointers = new PeekableIterator(filePointers); + this.parser = parser; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java deleted file mode 100644 index 989cf9fce..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import java.util.Iterator; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 4:55:37 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

- * Interface ShardStrategy - *

- * The base interface for the sharding strategy; before we had a base abstract - * class, but not this will be an interface to accomidate read based sharding - */ -public interface ShardStrategy extends Iterator, Iterable { -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java deleted file mode 100644 index 780b41ef7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * - * User: aaron - * Date: Apr 6, 2009 - * Time: 7:09:22 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 6, 2009 - *

- * Class ShardStrategyFactory - *

- * The Shard Strategy Factory, use this class to create and transfer shard strategies - * between different approaches. - */ -public class ShardStrategyFactory { - public enum SHATTER_STRATEGY { - MONOLITHIC, // Put all of the available data into one shard. - LOCUS_EXPERIMENTAL, - READS_EXPERIMENTAL - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L); - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. 
- */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser,readsDataSource,null); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); - } - - } - - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l); - - } - - /** - * get a new shatter strategy - * - * @param readsDataSource The reads used to shatter this file. - * @param referenceDataSource The reference used to shatter this file. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return A strategy for shattering this data. 
- */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser, readsDataSource,lst); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented"); - } - - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 673df6dfa..577db0965 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -30,10 +30,12 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; -import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; +import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -92,7 +94,7 @@ public class FindLargeShards extends 
CommandLineProgram { // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals GenomeLocSortedSet intervalSortedSet = null; @@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram { logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); @@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram { logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index c8c79bb14..2c33a19b8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -29,6 +29,14 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.CreateSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import 
org.broadinstitute.sting.gatk.datasources.reads.FilePointer; +import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -36,13 +44,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * Loads reference data from fasta file * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile index; + private IndexedFastaSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -173,7 +185,7 @@ public class ReferenceDataSource { logger.info("Treating existing index file as complete."); } - index = new CachingIndexedFastaSequenceFile(fastaFile); + reference = new CachingIndexedFastaSequenceFile(fastaFile); } catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. 
The FASTA must have either a .fasta or .fa extension", e); @@ -192,6 +204,52 @@ public class ReferenceDataSource { * @return IndexedFastaSequenceFile that was created from file */ public IndexedFastaSequenceFile getReference() { - return this.index; + return this.reference; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param maxShardSize The maximum shard size which can be used to create this list. 
+ * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + return shards; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 162baed00..b0043e68c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; @@ -88,7 +87,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } - public Object 
execute( Walker walker, ShardStrategy shardStrategy ) { + public Object execute( Walker walker, Iterable shardStrategy ) { // Fast fail for walkers not supporting TreeReducible interface. if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index deafcd0cc..ff5e1064b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; @@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler { * @param walker Computation to perform over dataset. * @param shardStrategy A strategy for sharding the data. 
*/ - public Object execute(Walker walker, ShardStrategy shardStrategy) { + public Object execute(Walker walker, Iterable shardStrategy) { walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index e731b9864..d013db7e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -30,11 +30,11 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads the informations associated with the reads * @param reference the reference file * @param rods the rods to include in the traversal - * @param nThreadsToUse Number of threads to utilize. + * @param threadAllocation Number of threads to utilize. * * @return The best-fit microscheduler. 
*/ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (walker instanceof TreeReducible && nThreadsToUse > 1) { + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse)); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads()); } else { - if(nThreadsToUse > 1) + if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods); } @@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * * @return the return type of the walker */ - public abstract Object execute(Walker walker, ShardStrategy shardStrategy); + public abstract Object execute(Walker walker, Iterable shardStrategy); /** * Retrieves the object responsible for tracking and managing output. diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 7bf518fd5..09ae02bd9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -6,10 +6,9 @@ import org.broad.tribble.annotation.Strand; import org.broad.tribble.dbsnp.OldDbSNPFeature; import org.broad.tribble.gelitext.GeliTextFeature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -187,30 +186,23 @@ public class VariantContextAdaptors { } Map attributes = new HashMap(); - attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; if ( index < 0 ) return null; // we weren't given enough reference context to create the VariantContext Byte refBaseForIndel = new Byte(ref.getBases()[index]); - Map genotypes = null; - VariantContext vc = new VariantContext(name, dbsnp.getChr(), 
dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel); - return vc; + final VariantContextBuilder builder = new VariantContextBuilder(); + builder.source(name).id(dbsnp.getRsID()); + builder.loc(dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0)); + builder.alleles(alleles); + builder.referenceBaseForIndel(refBaseForIndel); + return builder.make(); } else return null; // can't handle anything else } } - public static VCFHeader createVCFHeader(Set hInfo, VariantContext vc) { - HashSet names = new LinkedHashSet(); - for ( Genotype g : vc.getGenotypesSortedByName() ) { - names.add(g.getSampleName()); - } - - return new VCFHeader(hInfo == null ? new HashSet() : hInfo, names); - } - // -------------------------------------------------------------------------------------------------------------- // // GELI to VariantContext @@ -257,20 +249,15 @@ public class VariantContextAdaptors { else genotypeAlleles.add(refAllele); } - Map attributes = new HashMap(); + Map attributes = new HashMap(); Collection genotypes = new ArrayList(); - MutableGenotype call = new MutableGenotype(name, genotypeAlleles); - - // set the likelihoods, depth, and RMS mapping quality values - //call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.getLikelihoods()); - //call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaximumMappingQual()); - //call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.getDepthOfCoverage()); + Genotype call = new Genotype(name, genotypeAlleles); // add the call to the genotype list, and then use this list to create a VariantContext genotypes.add(call); alleles.add(refAllele); - VariantContext vc = VariantContextUtils.toVC(name, ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()), alleles, genotypes, 
geli.getLODBestToReference(), null, attributes); - return vc; + GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()); + return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make(); } else return null; // can't handle anything else } @@ -329,7 +316,7 @@ public class VariantContextAdaptors { String[] samples = hapmap.getSampleIDs(); String[] genotypeStrings = hapmap.getGenotypes(); - Map genotypes = new HashMap(samples.length); + GenotypesContext genotypes = GenotypesContext.create(samples.length); for ( int i = 0; i < samples.length; i++ ) { // ignore bad genotypes if ( genotypeStrings[i].contains("N") ) @@ -358,16 +345,13 @@ public class VariantContextAdaptors { } Genotype g = new Genotype(samples[i], myAlleles); - genotypes.put(samples[i], g); + genotypes.add(g); } - HashMap attrs = new HashMap(1); - attrs.put(VariantContext.ID_KEY, hapmap.getName()); - long end = hapmap.getEnd(); if ( deletionLength > 0 ) end += deletionLength; - VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel); + VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).referenceBaseForIndel(refBaseForIndel).make(); return vc; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 347e870c8..6452c7b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -6,9 +6,10 @@ import java.util.TreeMap; * Holds values for a column in a GATK report table */ public class GATKReportColumn extends TreeMap 
{ - private String columnName; - private Object defaultValue; - private boolean display; + final private String columnName; + final private Object defaultValue; + final private String format; + final private boolean display; /** * Construct the column object, specifying the column name, default value, and whether or not the column should be displayed @@ -18,11 +19,17 @@ public class GATKReportColumn extends TreeMap { * @param display if true, the column will be displayed in the final output */ public GATKReportColumn(String columnName, Object defaultValue, boolean display) { + this(columnName, defaultValue, display, null); + } + + public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) { this.columnName = columnName; this.defaultValue = defaultValue; this.display = display; + this.format = format == null ? null : (format.equals("") ? null : format); } + /** * Initialize an element in the column with a default value * @@ -55,7 +62,7 @@ public class GATKReportColumn extends TreeMap { * @return the string value at the specified position in the column, or the default value if the element is not set */ public String getStringValue(Object primaryKey) { - return toString(getWithoutSideEffects(primaryKey)); + return formatValue(getWithoutSideEffects(primaryKey)); } /** @@ -77,7 +84,7 @@ public class GATKReportColumn extends TreeMap { for (Object obj : this.values()) { if (obj != null) { - int width = toString(obj).length(); + int width = formatValue(obj).length(); if (width > maxWidth) { maxWidth = width; @@ -93,10 +100,12 @@ public class GATKReportColumn extends TreeMap { * @param obj The object to convert to a string * @return The string representation of the column */ - private static String toString(Object obj) { + private String formatValue(Object obj) { String value; if (obj == null) { value = "null"; + } else if ( format != null ) { + value = String.format(format, obj); } else if (obj instanceof Float) { value = 
String.format("%.8f", (Float) obj); } else if (obj instanceof Double) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 2fd5ad7e3..95c2a14fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -250,13 +250,12 @@ public class GATKReportTable { * @param defaultValue the default value for the column */ public void addColumn(String columnName, Object defaultValue) { - if (!isValidName(columnName)) { - throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed."); - } - - addColumn(columnName, defaultValue, true); + addColumn(columnName, defaultValue, null); } + public void addColumn(String columnName, Object defaultValue, String format) { + addColumn(columnName, defaultValue, true, format); + } /** * Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file. * @@ -265,7 +264,14 @@ public class GATKReportTable { * @param display if true - the column will be displayed; if false - the column will be hidden */ public void addColumn(String columnName, Object defaultValue, boolean display) { - columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display)); + addColumn(columnName, defaultValue, display, null); + } + + public void addColumn(String columnName, Object defaultValue, boolean display, String format) { + if (!isValidName(columnName)) { + throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. 
GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed."); + } + columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format)); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java new file mode 100644 index 000000000..0c81af07b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.resourcemanagement; + +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Models how threads are distributed between various components of the GATK. 
+ */ +public class ThreadAllocation { + /** + * The number of CPU threads to be used by the GATK. + */ + private final int numCPUThreads; + + /** + * Number of threads to devote exclusively to IO. Default is 0. + */ + private final int numIOThreads; + + public int getNumCPUThreads() { + return numCPUThreads; + } + + public int getNumIOThreads() { + return numIOThreads; + } + + /** + * Construct the default thread allocation. + */ + public ThreadAllocation() { + this(1,null,null); + } + + /** + * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. + * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). + * @param totalThreads Complete number of threads to allocate. + * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numIOThreads Total number of threads allocated exclusively to IO. + */ + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { + // If no allocation information is present, allocate all threads to CPU + if(numCPUThreads == null && numIOThreads == null) { + this.numCPUThreads = totalThreads; + this.numIOThreads = 0; + } + // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). + else if(numIOThreads == null) { + if(numCPUThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = totalThreads - numCPUThreads; + } + // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). + else if(numCPUThreads == null) { + if(numIOThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. 
User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); + this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); + this.numIOThreads = numIOThreads; + } + else { + if(numCPUThreads + numIOThreads != totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = numIOThreads; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 3a21e97a4..833107bd3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -54,18 +55,18 @@ public class AlleleBalance extends InfoFieldAnnotation { if ( !vc.isBiallelic() ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( !vc.hasGenotypes() ) return null; double ratio = 0.0; double totalWeights = 0.0; - for ( Map.Entry genotype : genotypes.entrySet() ) { + for ( Genotype genotype : genotypes ) { // we care only about het calls - if ( !genotype.getValue().isHet() ) + if ( !genotype.isHet() ) continue; - AlignmentContext context = 
stratifiedContexts.get(genotype.getKey()); + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; @@ -84,8 +85,8 @@ public class AlleleBalance extends InfoFieldAnnotation { continue; // weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much - ratio += genotype.getValue().getNegLog10PError() * ((double)refCount / (double)(refCount + altCount)); - totalWeights += genotype.getValue().getNegLog10PError(); + ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount)); + totalWeights += genotype.getLog10PError(); } else if ( vc.isIndel() && context.hasExtendedEventPileup() ) { final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); if ( indelPileup == null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 5ed2a6761..0acd3e841 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -59,10 +59,8 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( ! 
vc.hasGenotypes() ) return null; - - Map map = new HashMap(); - VariantContextUtils.calculateChromosomeCounts(vc, map, true); - return map; + + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } public List getKeyNames() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8098de5b1..ab38b69cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 94b0636f4..551f8e2cf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -89,9 +89,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { - final Set> genotypes = vc.getGenotypes().entrySet(); - for ( final Map.Entry genotype : genotypes ) { - final AlignmentContext thisContext = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : 
vc.getGenotypes()) { + final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); if ( thisContext != null ) { final ReadBackedPileup thisPileup; if (thisContext.hasExtendedEventPileup()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index f068ed895..795cdbeb5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -26,20 +27,18 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; - private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; + private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; int refCount = 0; int hetCount = 0; int homCount = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { - Genotype g = genotype.getValue(); - + for ( final Genotype g : genotypes ) { if ( g.isNoCall() ) continue; @@ -47,7 +46,7 @@ public class HardyWeinberg extends InfoFieldAnnotation 
implements WorkInProgress // Right now we just ignore genotypes that are not confident, but this throws off // our HW ratios. More analysis is needed to determine the right thing to do when // the genotyper cannot decide whether a given sample is het or hom var. - if ( g.getNegLog10PError() < MIN_NEG_LOG10_PERROR ) + if ( g.getLog10PError() > MIN_LOG10_PERROR ) continue; if ( g.isHomRef() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 8728e5aa4..640ab036b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -32,7 +33,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; @@ -51,8 +52,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno double hetCount = 0.0; double homCount = 0.0; int N = 0; // number of samples that have likelihoods - for ( final Map.Entry genotypeMap : genotypes.entrySet() ) { - Genotype g = genotypeMap.getValue(); + for ( final Genotype g : genotypes ) { if ( 
g.isNoCall() || !g.hasLikelihoods() ) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index b942d9817..d555463bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -28,19 +29,19 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( stratifiedContexts.size() == 0 ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; int depth = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { + for ( final Genotype genotype : genotypes ) { // we care only about variant calls with likelihoods - if ( genotype.getValue().isHomRef() ) + if ( genotype.isHomRef() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; @@ -50,7 +51,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double QD = 10.0 * vc.getNegLog10PError() / (double)depth; + double QD = -10.0 * vc.getLog10PError() / (double)depth; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 93e093248..c5a2df1fd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; @@ -32,7 +33,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if ( stratifiedContexts.size() == 0 ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; @@ -42,8 +43,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (vc.isSNP() && vc.isBiallelic()) { // todo - no current support for multiallelic snps - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { continue; } @@ -52,8 +53,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar } else if (vc.isIndel() || vc.isMixed()) { - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + 
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { continue; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ee08cfa3b..cbf536e4f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -47,11 +47,11 @@ import java.util.Map; public class SampleList extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( vc.isMonomorphic() || !vc.hasGenotypes() ) + if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; StringBuffer samples = new StringBuffer(); - for ( Genotype genotype : vc.getGenotypesSortedByName() ) { + for ( Genotype genotype : vc.getGenotypesOrderedByName() ) { if ( genotype.isCalled() && !genotype.isHomRef() ){ if ( samples.length() > 0 ) samples.append(","); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 85977bf8e..1956dac6c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String 
SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public enum InfoFieldKey { EFFECT_KEY ("SNPEFF_EFFECT", -1), IMPACT_KEY ("SNPEFF_IMPACT", 0), - CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), - AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), - GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), - GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), - TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), - EXON_ID_KEY ("SNPEFF_EXON_ID", 7), - FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1); + FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7), + EXON_ID_KEY ("SNPEFF_EXON_ID", 8); // Actual text of the key private final String keyName; @@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // are validated against this list. 
public enum EffectType { // High-impact effects: - FRAME_SHIFT (EffectFunctionalClass.NONE, false), - STOP_GAINED (EffectFunctionalClass.NONSENSE, false), - START_LOST (EffectFunctionalClass.NONE, false), - SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false), - SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false), - EXON_DELETED (EffectFunctionalClass.NONE, false), - STOP_LOST (EffectFunctionalClass.NONE, false), + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + EXON_DELETED, + FRAME_SHIFT, + STOP_GAINED, + STOP_LOST, // Moderate-impact effects: - NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false), - CODON_CHANGE (EffectFunctionalClass.NONE, false), - CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_DELETION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false), - UTR_5_DELETED (EffectFunctionalClass.NONE, false), - UTR_3_DELETED (EffectFunctionalClass.NONE, false), + NON_SYNONYMOUS_CODING, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + UTR_5_DELETED, + UTR_3_DELETED, // Low-impact effects: - SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - START_GAINED (EffectFunctionalClass.NONE, false), + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + START_GAINED, + SYNONYMOUS_CODING, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, // Modifiers: - NONE (EffectFunctionalClass.NONE, true), - CHROMOSOME (EffectFunctionalClass.NONE, true), - INTERGENIC (EffectFunctionalClass.NONE, true), - UPSTREAM (EffectFunctionalClass.NONE, true), - UTR_5_PRIME (EffectFunctionalClass.NONE, true), - CDS 
(EffectFunctionalClass.NONE, true), - GENE (EffectFunctionalClass.NONE, true), - TRANSCRIPT (EffectFunctionalClass.NONE, true), - EXON (EffectFunctionalClass.NONE, true), - INTRON (EffectFunctionalClass.NONE, true), - UTR_3_PRIME (EffectFunctionalClass.NONE, true), - DOWNSTREAM (EffectFunctionalClass.NONE, true), - INTRON_CONSERVED (EffectFunctionalClass.NONE, true), - INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true), - REGULATION (EffectFunctionalClass.NONE, true), - CUSTOM (EffectFunctionalClass.NONE, true), - WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true); - - private final EffectFunctionalClass functionalClass; - private final boolean isModifier; - - EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) { - this.functionalClass = functionalClass; - this.isModifier = isModifier; - } - - public EffectFunctionalClass getFunctionalClass() { - return functionalClass; - } - - public boolean isModifier() { - return isModifier; - } + NONE, + CHROMOSOME, + CUSTOM, + CDS, + GENE, + TRANSCRIPT, + EXON, + INTRON_CONSERVED, + UTR_5_PRIME, + UTR_3_PRIME, + DOWNSTREAM, + INTRAGENIC, + INTERGENIC, + INTERGENIC_CONSERVED, + UPSTREAM, + REGULATION, + INTRON } - // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of - // classifying some of the LOW impact effects as MODIFIERs. + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER. public enum EffectImpact { MODIFIER (0), LOW (1), @@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio UNKNOWN } - // We assign a functional class to each SnpEff effect. + // SnpEff assigns a functional class to each effect. 
public enum EffectFunctionalClass { NONE (0), SILENT (1), @@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public List getKeyNames() { return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), InfoFieldKey.GENE_NAME_KEY.getKeyName(), InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), - InfoFieldKey.EXON_ID_KEY.getKeyName(), - InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName() + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } @@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return Arrays.asList( new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())), new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect 
resulting from the current variant (in HGVS style)"), new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())) + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio protected static class SnpEffEffect { private EffectType effect; private EffectImpact impact; + private EffectFunctionalClass functionalClass; private String codonChange; private String aminoAcidChange; private String geneName; @@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private String parseError = null; private boolean isWellFormed = true; - private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9; + private static final int 
NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11; - // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, - // errors come after warnings, not vice versa: - private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; - private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + // If there is either a warning OR an error, it will be in the last field. If there is both + // a warning AND an error, the warning will be in the second-to-last field, and the error will + // be in the last field. + private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1; + private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2; + private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1; - private static final int SNPEFF_CODING_FIELD_INDEX = 5; + // Position of the field indicating whether the effect is coding or non-coding. This field is used + // in selecting the most significant effect, but is not included in the annotations we return + // since it can be deduced from the SNPEFF_GENE_BIOTYPE field. 
+ private static final int SNPEFF_CODING_FIELD_INDEX = 6; public SnpEffEffect ( String effectName, String[] effectMetadata ) { parseEffectName(effectName); @@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private void parseEffectMetadata ( String[] effectMetadata ) { if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { - if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { - parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) { + parseError(String.format("SnpEff issued the following warning or error: \"%s\"", + effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR])); } - else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { - parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) { + parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"", + effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR], + effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR])); } else { parseError(String.format("Wrong number of effect metadata fields. 
Expected %d but found %d", @@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return; } - if ( effect != null && effect.isModifier() ) { - impact = EffectImpact.MODIFIER; + // The impact field will never be empty, and should always contain one of the enumerated values: + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } - else { + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + // The functional class field will be empty when the effect has no functional class associated with it: + if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) { try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()])); } } + else { + functionalClass = EffectFunctionalClass.NONE; + } codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + // The coding field will be empty when SnpEff has no coding info for the effect: if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); @@ -534,7 +536,7 @@ public class SnpEff 
extends InfoFieldAnnotation implements RodRequiringAnnotatio return true; } else if ( impact.isSameImpactAs(other.impact) ) { - return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass()); + return functionalClass.isHigherPriorityThan(other.functionalClass); } return false; @@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString()); addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); - addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString()); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index ea11391d9..c9ea7a3b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -222,8 +222,33 @@ public class VariantAnnotator extends RodWalker implements Ann if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } - for ( String expression : expressionsToUse ) - hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, 
VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + // for the expressions, pull the info header line from the header of the resource rod + for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + // special case the ID field + if ( expression.fieldName.equals("ID") ) { + hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); + continue; + } + VCFInfoHeaderLine targetHeaderLine = null; + for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + if ( line instanceof VCFInfoHeaderLine ) { + VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + if ( infoline.getName().equals(expression.fieldName) ) { + targetHeaderLine = infoline; + break; + } + } + } + + if ( targetHeaderLine != null ) { + if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER ) + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + else + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + } else { + hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + } + } engine.invokeAnnotationInitializationMethods(hInfo); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index e4bc0d5d5..d4442dc5d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -34,7 +34,9 @@ import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.util.*; @@ -49,20 +51,20 @@ public class VariantAnnotatorEngine { private AnnotatorCompatibleWalker walker; private GenomeAnalysisEngine toolkit; - private static class VAExpression { + protected static class VAExpression { public String fullName, fieldName; public RodBinding binding; - public VAExpression(String fullEpression, List> bindings) { - int indexOfDot = fullEpression.lastIndexOf("."); + public VAExpression(String fullExpression, List> bindings) { + int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) - throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); + throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); - fullName = fullEpression; - fieldName = fullEpression.substring(indexOfDot+1); + fullName = fullExpression; + fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullEpression.substring(0, indexOfDot); + String bindingName = fullExpression.substring(0, indexOfDot); for ( RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; @@ -97,6 +99,8 @@ public class VariantAnnotatorEngine { requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } + protected List getRequestedExpressions() { return requestedExpressions; } + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { 
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); @@ -160,11 +164,10 @@ public class VariantAnnotatorEngine { } public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref, vc, infoAnnotations); // annotate expressions where available annotateExpressions(tracker, ref, infoAnnotations); @@ -177,20 +180,20 @@ public class VariantAnnotatorEngine { } // generate a new annotated VC - final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations); + VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc)); + return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } - private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); // annotate dbsnp id if available and not already there - if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) ) - infoAnnotations.put(VariantContext.ID_KEY, rsID); + if ( rsID != null && 
vc.emptyID() ) + vc = new VariantContextBuilder(vc).id(rsID).make(); } else { boolean overlapsComp = false; for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { @@ -202,6 +205,8 @@ public class VariantAnnotatorEngine { infoAnnotations.put(dbSet.getValue(), overlapsComp); } } + + return vc; } private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { @@ -211,21 +216,25 @@ public class VariantAnnotatorEngine { continue; VariantContext vc = VCs.iterator().next(); - if ( vc.hasAttribute(expression.fieldName) ) + // special-case the ID field + if ( expression.fieldName.equals("ID") ) { + if ( vc.hasID() ) + infoAnnotations.put(expression.fullName, vc.getID()); + } else if ( vc.hasAttribute(expression.fieldName) ) { infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } - private Map annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( requestedGenotypeAnnotations.size() == 0 ) return vc.getGenotypes(); - Map genotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry g : vc.getGenotypes().entrySet() ) { - Genotype genotype = g.getValue(); - AlignmentContext context = stratifiedContexts.get(g.getKey()); + GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { - genotypes.put(g.getKey(), genotype); + genotypes.add(genotype); continue; } @@ -235,7 +244,7 @@ public class VariantAnnotatorEngine { if ( result != null ) genotypeAnnotations.putAll(result); } - genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), 
genotypeAnnotations, genotype.isPhased())); + genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased())); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 7f6dabeec..f827856be 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -36,10 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -125,7 +122,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { protected static String line = null; private final double MIN_PROB_ERROR = 0.000001; - private final double MAX_GENOTYPE_QUALITY = 6.0; + private final double MAX_GENOTYPE_QUALITY = -6.0; public void initialize() { @@ -181,8 +178,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { // ignore places where we don't have a variant if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null) { - vcfWriter.add(vc_input); - return 1; + vcfWriter.add(vc_input); + return 1; } @@ -190,8 +187,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { byte refByte = ref.getBase(); // make new Genotypes based on Beagle results - Map genotypes = new 
HashMap(vc_input.getGenotypes().size()); - + GenotypesContext genotypes = GenotypesContext.create(vc_input.getGenotypes().size()); // for each genotype, create a new object with Beagle information on it @@ -200,15 +196,13 @@ public class BeagleOutputToVCFWalker extends RodWalker { Double alleleFrequencyH = 0.0; int beagleVarCounts = 0; - Map hapmapGenotypes = null; + GenotypesContext hapmapGenotypes = null; if (vc_comp != null) { hapmapGenotypes = vc_comp.getGenotypes(); } - for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { - - Genotype g = originalGenotypes.getValue(); + for ( final Genotype g : vc_input.getGenotypes() ) { Set filters = new LinkedHashSet(g.getFilters()); boolean genotypeIsPhased = true; @@ -218,7 +212,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { // use sample as key into genotypes structure if (vc_comp != null) { - if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) { + if (vc_input.getGenotypes().containsSample(sample) && hapmapGenotypes.containsSample(sample)) { Genotype hapmapGenotype = hapmapGenotypes.get(sample); if (hapmapGenotype.isCalled()){ @@ -255,9 +249,9 @@ public class BeagleOutputToVCFWalker extends RodWalker { Allele bglAlleleA, bglAlleleB; if (alleleA.matches(refString)) - bglAlleleA = Allele.create(alleleA,true); + bglAlleleA = Allele.create(alleleA,true); else - bglAlleleA = Allele.create(alleleA,false); + bglAlleleA = Allele.create(alleleA,false); if (alleleB.matches(refString)) bglAlleleB = Allele.create(alleleB,true); @@ -286,7 +280,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { // deal with numerical errors coming from limited formatting value on Beagle output files if (probWrongGenotype > 1 - MIN_PROB_ERROR) probWrongGenotype = 1 - MIN_PROB_ERROR; - + if (1-probWrongGenotype < noCallThreshold) { // quality is bad: don't call genotype alleles.clear(); @@ -298,7 +292,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { if 
(probWrongGenotype < MIN_PROB_ERROR) genotypeQuality = MAX_GENOTYPE_QUALITY; else - genotypeQuality = -log10(probWrongGenotype); + genotypeQuality = log10(probWrongGenotype); HashMap originalAttributes = new HashMap(g.getAttributes()); @@ -329,47 +323,40 @@ public class BeagleOutputToVCFWalker extends RodWalker { else { originalAttributes.put("OG","."); } - Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); + Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) { beagleVarCounts++; } - genotypes.put(originalGenotypes.getKey(), imputedGenotype); - + genotypes.add(imputedGenotype); } - VariantContext filteredVC; - if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) - filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); - else { + final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes); + if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) { Set removedFilters = vc_input.filtersWereApplied() ? 
new HashSet(vc_input.getFilters()) : new HashSet(1); removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0))); - filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes()); + builder.alleles(new HashSet(Arrays.asList(vc_input.getReference()))).filters(removedFilters); } - HashMap attributes = new HashMap(filteredVC.getAttributes()); // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false); + VariantContextUtils.calculateChromosomeCounts(builder, false); // Get Hapmap AC and AF if (vc_comp != null) { - attributes.put("ACH", alleleCountH.toString() ); - attributes.put("ANH", chrCountH.toString() ); - attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) ); + builder.attribute("ACH", alleleCountH.toString() ); + builder.attribute("ANH", chrCountH.toString() ); + builder.attribute("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) ); } - attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle ); + builder.attribute("NumGenotypesChanged", numGenotypesChangedByBeagle ); if( !beagleR2Feature.getR2value().equals(Double.NaN) ) { - attributes.put("R2", beagleR2Feature.getR2value().toString() ); + builder.attribute("R2", beagleR2Feature.getR2value().toString() ); } - - vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes)); - + vcfWriter.add(builder.make()); return 1; - } public Integer reduceInit() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index b722220f9..aa71f4399 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.PrintStream; @@ -204,7 +201,7 @@ public class ProduceBeagleInputWalker extends RodWalker { logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER)); + bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make()); } bootstrapSetSize++; return true; @@ -245,18 +242,18 @@ public class ProduceBeagleInputWalker extends RodWalker { } if ( markers != null ) markers.append("\n"); - Map preferredGenotypes = preferredVC.getGenotypes(); - Map otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; + GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); + GenotypesContext otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; for ( String sample : samples ) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure - if ( preferredGenotypes.keySet().contains(sample) ) { + if ( preferredGenotypes.containsSample(sample) ) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; - } else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) { + } else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) { genotype = otherGenotypes.get(sample); isValidation = ! isValidationSite; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index a447d17af..efa57c0aa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; +import org.apache.log4j.Logger; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -46,6 +47,8 @@ import java.util.Map; * Class implementing diffnode reader for VCF */ public class VCFDiffableReader implements DiffableReader { + private static Logger logger = Logger.getLogger(VCFDiffableReader.class); + @Override public String getName() { return "VCF"; } @@ -68,7 +71,10 @@ public class VCFDiffableReader implements DiffableReader { String key = headerLine.getKey(); if ( headerLine instanceof VCFNamedHeaderLine ) key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); - root.add(key, headerLine.toString()); + if ( root.hasElement(key) ) + logger.warn("Skipping duplicate header line: file=" + file + " line=" + 
headerLine.toString()); + else + root.add(key, headerLine.toString()); } String line = lineReader.readLine(); @@ -90,22 +96,22 @@ public class VCFDiffableReader implements DiffableReader { // add fields vcRoot.add("CHROM", vc.getChr()); vcRoot.add("POS", vc.getStart()); - vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("ID", vc.getID()); vcRoot.add("REF", vc.getReference()); vcRoot.add("ALT", vc.getAlternateAlleles()); - vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4); vcRoot.add("FILTER", vc.getFilters()); // add info fields for (Map.Entry attribute : vc.getAttributes().entrySet()) { - if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY)) + if ( ! attribute.getKey().startsWith("_") ) vcRoot.add(attribute.getKey(), attribute.getValue()); } - for (Genotype g : vc.getGenotypes().values() ) { + for (Genotype g : vc.getGenotypes() ) { DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); gRoot.add("GT", g.getGenotypeString()); - gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 ); + gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 ); for (Map.Entry attribute : g.getAttributes().entrySet()) { if ( ! 
attribute.getKey().startsWith("_") ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index bf3606b54..8278dbab7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -36,9 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -224,7 +222,7 @@ public class VariantFiltrationWalker extends RodWalker { (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied Set filters = new LinkedHashSet(vc.getFilters()); filters.add(MASK_NAME); - vc = VariantContext.modifyFilters(vc, filters); + vc = new VariantContextBuilder(vc).filters(filters).make(); } FiltrationContext varContext = new FiltrationContext(ref, vc); @@ -267,7 +265,7 @@ public class VariantFiltrationWalker extends RodWalker { (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied Set filters = new LinkedHashSet(vc.getFilters()); filters.add(MASK_NAME); - vc = VariantContext.modifyFilters(vc, filters); + vc = new VariantContextBuilder(vc).filters(filters).make(); } return vc; @@ -279,20 +277,15 @@ public class VariantFiltrationWalker extends RodWalker { if ( context == null ) return; - VariantContext vc = context.getVariantContext(); + final VariantContext vc = 
context.getVariantContext(); + final VariantContextBuilder builder = new VariantContextBuilder(vc); // make new Genotypes based on filters - Map genotypes; - if ( genotypeFilterExps.size() == 0 ) { - genotypes = null; - } else { - genotypes = new HashMap(vc.getGenotypes().size()); + if ( genotypeFilterExps.size() > 0 ) { + GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); // for each genotype, check filters then create a new object - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - - Genotype g = genotype.getValue(); - + for ( final Genotype g : vc.getGenotypes() ) { if ( g.isCalled() ) { Set filters = new LinkedHashSet(g.getFilters()); @@ -300,11 +293,13 @@ public class VariantFiltrationWalker extends RodWalker { if ( VariantContextUtils.match(vc, g, exp) ) filters.add(exp.name); } - genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.isPhased())); + genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased())); } else { - genotypes.put(genotype.getKey(), g); + genotypes.add(g); } } + + builder.genotypes(genotypes); } // make a new variant context based on filters @@ -324,14 +319,9 @@ public class VariantFiltrationWalker extends RodWalker { filters.add(exp.name); } } + builder.filters(filters); - VariantContext filteredVC; - if ( genotypes == null ) - filteredVC = VariantContext.modifyFilters(vc, filters); - else - filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes()); - - writer.add(filteredVC); + writer.add(builder.make()); } public Integer reduce(Integer value, Integer sum) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 35a9fe31d..a8ce98945 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.List; -import java.util.Map; -import java.util.Set; /** @@ -47,8 +43,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ EXACT, - /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. 
*/ - GRID_SEARCH } protected int N; @@ -73,7 +67,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @param log10AlleleFrequencyPriors priors * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results */ - protected abstract void getLog10PNonRef(Map GLs, List Alleles, + protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors); @@ -85,7 +79,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * * @return calls */ - protected abstract Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood); + protected abstract GenotypesContext assignGenotypes(VariantContext vc, + double[] log10AlleleFrequencyPosteriors, + int AFofMaxLikelihood); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index b5987963f..106bb1982 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils; * Time: 6:46:09 PM * To change this template use File | Settings | File Templates. 
*/ -enum DiploidGenotype { +public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), AG ('A', 'G'), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 666fe88a3..295cf8688 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); if ( elt.isReducedRead() ) { // reduced read representation - byte qual = elt.getQual(); - if ( BaseUtils.isRegularBase( elt.getBase() )) { + if ( BaseUtils.isRegularBase( obsBase )) { add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods return elt.getRepresentativeCount(); // we added nObs bases here - } else // odd bases or deletions => don't use them - return 0; - } else { - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? 
add(obsBase, qual, (byte)0, (byte)0, 1) : 0; + } + + // odd bases or deletions => don't use them + return 0; } + + return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { @@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @return */ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) return 0; - } else { - byte qual = p.getQual(); - if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); - if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); - if ( (int)qual < minBaseQual ) - qual = (byte)0; + byte qual = p.getQual(); - return qual; - } + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; } // ----------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 1c2d82ab7..5d0b6f0a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -26,14 +26,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; @@ -46,12 +42,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final boolean SIMPLE_GREEDY_GENOTYPER = false; private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we 
treat GL's as non-informative and will force a no-call. + private final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(Map GLs, List alleles, + public void getLog10PNonRef(GenotypesContext GLs, List alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { final int numAlleles = alleles.size(); @@ -95,11 +92,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - private static final ArrayList getGLs(Map GLs) { + private static final ArrayList getGLs(GenotypesContext GLs) { ArrayList genotypeLikelihoods = new ArrayList(); genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.values() ) { + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); @@ -155,7 +152,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - public int linearExact(Map GLs, + public int linearExact(GenotypesContext GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -268,14 +265,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { * * @return calls */ - public Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood) { + public GenotypesContext assignGenotypes(VariantContext vc, + double[] log10AlleleFrequencyPosteriors, + int AFofMaxLikelihood) { if ( !vc.isVariant() ) throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - Map GLs = vc.getGenotypes(); + GenotypesContext GLs = vc.getGenotypes(); 
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1]; @@ -291,16 +288,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // todo = can't deal with optimal dynamic programming solution with multiallelic records if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - sampleIndices.addAll(GLs.keySet()); + sampleIndices.addAll(GLs.getSampleNamesOrderedByName()); sampleIdx = GLs.size(); } else { - for ( Map.Entry sample : GLs.entrySet() ) { - if ( !sample.getValue().hasLikelihoods() ) + for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { + if ( !genotype.hasLikelihoods() ) continue; - double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + double[] likelihoods = genotype.getLikelihoods().getAsVector(); if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { //System.out.print(sample.getKey()+":"); @@ -312,7 +309,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { continue; } - sampleIndices.add(sample.getKey()); + sampleIndices.add(genotype.getSampleName()); for (int k=0; k <= AFofMaxLikelihood; k++) { @@ -342,7 +339,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - HashMap calls = new HashMap(); + GenotypesContext calls = GenotypesContext.create(); int startIdx = AFofMaxLikelihood; for (int k = sampleIdx; k > 0; k--) { @@ -355,11 +352,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // and will add no-call genotype to GL's in a second pass ArrayList myAlleles = new ArrayList(); - double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); + bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); } else { int newIdx = tracebackArray[k][startIdx];; 
@@ -367,20 +363,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { startIdx = newIdx; } - /* System.out.format("Sample: %s GL:",sample); - for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f, ",likelihoods[i]); - */ - - for (int i=0; i < likelihoods.length; i++) { - if (i==bestGTguess) - continue; - if (likelihoods[i] >= qual) - qual = likelihoods[i]; - } - // qual contains now max(likelihoods[k]) for all k != bestGTguess - qual = likelihoods[bestGTguess] - qual; - // likelihoods are stored row-wise in lower triangular matrix. IE // for 2 alleles they have ordering AA,AB,BB // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC @@ -408,37 +390,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { break; } - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. - // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on - double[] normalized = MathUtils.normalizeFromLog10(likelihoods); - double chosenGenotype = normalized[bestGTguess]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } + final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods); //System.out.println(myAlleles.toString()); - calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); - + calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); } - for ( Map.Entry sample : GLs.entrySet() ) { - - if ( !sample.getValue().hasLikelihoods() ) + for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { + if ( !genotype.hasLikelihoods() ) continue; - Genotype g = GLs.get(sample.getKey()); - double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + final Genotype g = GLs.get(genotype.getSampleName()); + final double[] likelihoods = genotype.getLikelihoods().getAsVector(); if (MathUtils.sum(likelihoods) <= 
SUM_GL_THRESH_NOCALL) continue; // regular likelihoods - ArrayList myAlleles = new ArrayList(); - - double qual = Genotype.NO_NEG_LOG_10PERROR; - myAlleles.add(Allele.NO_CALL); - myAlleles.add(Allele.NO_CALL); - //System.out.println(myAlleles.toString()); - calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false)); + final double qual = Genotype.NO_LOG10_PERROR; + calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false)); } + return calls; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 489e963e8..74c55dbfe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param priors priors to use for GLs * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not 
set - * - * @param useBAQedPileup + * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @return genotype likelihoods per sample for AA, AB, BB */ public abstract Allele getLikelihoods(RefMetaDataTracker tracker, @@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse, boolean useBAQedPileup); + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java deleted file mode 100755 index 27842a8bf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.*; - -public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { - - // for use in optimizing the P(D|AF) calculations: - // how much off from the max likelihoods do we need to be before we can quit calculating? 
- protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0; - - private AlleleFrequencyMatrix AFMatrix; - - protected GridSearchAFEstimation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - AFMatrix = new AlleleFrequencyMatrix(N); - } - - protected void getLog10PNonRef(Map GLs, List alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { - initializeAFMatrix(GLs); - - // first, calculate for AF=0 (no change to matrix) - log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0]; - double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0]; - - int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2; - - // for each minor allele frequency, calculate log10PofDgivenAFi - for (int i = 1; i <= maxAlleleFrequencyToTest; i++) { - // add one more alternate allele - AFMatrix.incrementFrequency(); - - // calculate new likelihoods - log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i]; - - // an optimization to speed up the calculation: if we are beyond the local maximum such - // that subsequent likelihoods won't factor into the confidence score, just quit - if ( maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) - return; - - if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen ) - maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i]; - } - } - - /** - * Overrides the super class - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - protected Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood) { - if ( !vc.isVariant() ) - throw new UserException("The VCF record passed in does not contain an ALT 
allele at " + vc.getChr() + ":" + vc.getStart()); - - Allele refAllele = vc.getReference(); - Allele altAllele = vc.getAlternateAllele(0); - HashMap calls = new HashMap(); - - // first, the potential alt calls - for ( String sample : AFMatrix.getSamples() ) { - Genotype g = vc.getGenotype(sample); - - // set the genotype and confidence - Pair AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample); - ArrayList myAlleles = new ArrayList(); - if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(refAllele); - } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(altAllele); - } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() ) - myAlleles.add(altAllele); - myAlleles.add(altAllele); - } - - calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, g.getAttributes(), false)); - } - - return calls; - } - - private void initializeAFMatrix(Map GLs) { - AFMatrix.clear(); - - for ( Genotype g : GLs.values() ) { - if ( g.hasLikelihoods() ) - AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName()); - } - } - - protected static class AlleleFrequencyMatrix { - - private double[][] matrix; // allele frequency matrix - private int[] indexes; // matrix to maintain which genotype is active - private int maxN; // total possible frequencies in data - private int frequency; // current frequency - - // data structures necessary to maintain a list of the best genotypes and their scores - private ArrayList samples = new ArrayList(); - private HashMap>> samplesToGenotypesPerAF = new HashMap>>(); - - public AlleleFrequencyMatrix(int N) { - maxN = N; - matrix = new double[N][3]; - indexes = new int[N]; - clear(); - } - - public List getSamples() { return samples; } - - public void clear() { - frequency = 0; - for (int i = 0; i < maxN; i++) - indexes[i] = 0; - samples.clear(); - samplesToGenotypesPerAF.clear(); - } - - 
public void setLikelihoods(double[] GLs, String sample) { - int index = samples.size(); - samples.add(sample); - matrix[index][GenotypeType.AA.ordinal()] = GLs[0]; - matrix[index][GenotypeType.AB.ordinal()] = GLs[1]; - matrix[index][GenotypeType.BB.ordinal()] = GLs[2]; - } - - public void incrementFrequency() { - int N = samples.size(); - if ( frequency == 2 * N ) - throw new ReviewedStingException("Frequency was incremented past N; how is this possible?"); - frequency++; - - double greedy = VALUE_NOT_CALCULATED; - int greedyIndex = -1; - for (int i = 0; i < N; i++) { - - if ( indexes[i] == GenotypeType.AB.ordinal() ) { - if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()]; - greedyIndex = i; - } - } - else if ( indexes[i] == GenotypeType.AA.ordinal() ) { - if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()]; - greedyIndex = i; - } - // note that we currently don't bother with breaking ties between samples - // (which would be done by looking at the HOM_VAR value) because it's highly - // unlikely that a collision will both occur and that the difference will - // be significant at HOM_VAR... 
- } - // if this person is already hom var, he can't add another alternate allele - // so we can ignore that case - } - if ( greedyIndex == -1 ) - throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?"); - - if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() ) - indexes[greedyIndex] = GenotypeType.BB.ordinal(); - else - indexes[greedyIndex] = GenotypeType.AB.ordinal(); - } - - public double getLikelihoodsOfFrequency() { - double likelihoods = 0.0; - int N = samples.size(); - for (int i = 0; i < N; i++) - likelihoods += matrix[i][indexes[i]]; - - /* - System.out.println(frequency); - for (int i = 0; i < N; i++) { - System.out.print(samples.get(i)); - for (int j=0; j < 3; j++) { - System.out.print(String.valueOf(matrix[i][j])); - System.out.print(indexes[i] == j ? "* " : " "); - } - System.out.println(); - } - System.out.println(likelihoods); - System.out.println(); - */ - - recordGenotypes(); - - return likelihoods; - } - - public Pair getGenotype(int frequency, String sample) { - return samplesToGenotypesPerAF.get(frequency).get(sample); - } - - private void recordGenotypes() { - HashMap> samplesToGenotypes = new HashMap>(); - - int index = 0; - for ( String sample : samples ) { - int genotype = indexes[index]; - - double score; - - int maxEntry = MathUtils.maxElementIndex(matrix[index]); - // if the max value is for the most likely genotype, we can compute next vs. 
next best - if ( genotype == maxEntry ) { - if ( genotype == GenotypeType.AA.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); - else if ( genotype == GenotypeType.AB.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); - else // ( genotype == GenotypeType.HOM.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]); - } - // otherwise, we need to calculate the probability of the genotype - else { - double[] normalized = MathUtils.normalizeFromLog10(matrix[index]); - double chosenGenotype = normalized[genotype]; - score = -1.0 * Math.log10(1.0 - chosenGenotype); - } - - samplesToGenotypes.put(sample, new Pair(genotype, Math.abs(score))); - index++; - } - - samplesToGenotypesPerAF.put(frequency, samplesToGenotypes); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index d88e55687..97f7b21eb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -35,9 +35,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -108,9 +106,9 @@ public class UGCallVariants extends RodWalker { return 
sum; try { - Map attrs = new HashMap(value.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(value, attrs, true); - writer.add(VariantContext.modifyAttributes(value, attrs)); + VariantContextBuilder builder = new VariantContextBuilder(value); + VariantContextUtils.calculateChromosomeCounts(builder, true); + writer.add(builder.make()); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } @@ -128,27 +126,27 @@ public class UGCallVariants extends RodWalker { return null; VariantContext variantVC = null; - Map genotypes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(); for ( VariantContext vc : VCs ) { if ( variantVC == null && vc.isVariant() ) variantVC = vc; - genotypes.putAll(getGenotypesWithGLs(vc.getGenotypes())); + genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes())); } if ( variantVC == null ) { VariantContext vc = VCs.get(0); throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart()); } - return new VariantContext("VCwithGLs", variantVC.getChr(), variantVC.getStart(), variantVC.getEnd(), variantVC.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null); + + return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make(); } - private static Map getGenotypesWithGLs(Map genotypes) { - Map genotypesWithGLs = new HashMap(); - for ( Map.Entry g : genotypes.entrySet() ) { - if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null ) - genotypesWithGLs.put(g.getKey(), g.getValue()); + private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) { + GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size()); + for ( final Genotype g : genotypes ) { + if ( g.hasLikelihoods() && 
g.getLikelihoods().getAsVector() != null ) + genotypesWithGLs.add(g); } - return genotypesWithGLs; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index bdd4e2c65..369c2d0c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -258,7 +258,7 @@ public class UnifiedGenotyper extends LocusWalker result = new HashSet(); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); return result; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index cee128a6a..c38bb5b42 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -229,8 +229,7 @@ public class UnifiedGenotyperEngine { VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, 
rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return null; - vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); - + vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).make(); } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) @@ -238,7 +237,7 @@ public class UnifiedGenotyperEngine { Set alleles = new HashSet(); alleles.add(Allele.create(ref.getBase(), true)); - vc = new VariantContext("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles); + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); } if ( annotationEngine != null ) { @@ -265,7 +264,7 @@ public class UnifiedGenotyperEngine { alleles.add(refAllele); boolean addedAltAlleles = false; - HashMap genotypes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(); for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) { if ( !addedAltAlleles ) { addedAltAlleles = true; @@ -281,22 +280,13 @@ public class UnifiedGenotyperEngine { attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); + genotypes.add(new Genotype(GL.getSample(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } GenomeLoc loc = refContext.getLocus(); int endLoc = calculateEndPos(alleles, refAllele, loc); - return new VariantContext("UG_call", - loc.getContig(), - loc.getStart(), - endLoc, - alleles, - genotypes, - VariantContext.NO_NEG_LOG_10PERROR, - null, - null, - refContext.getBase()); + return new 
VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make(); } // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine @@ -354,7 +344,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); // print out stats if we have a writer if ( verboseWriter != null ) @@ -420,8 +410,14 @@ public class UnifiedGenotyperEngine { myAlleles = new HashSet(1); myAlleles.add(vc.getReference()); } - VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, refContext.getBase()); + + VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles); + builder.genotypes(genotypes); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! 
passesCallThreshold(phredScaledConfidence) ) builder.filters(filter); + builder.attributes(attributes); + builder.referenceBaseForIndel(refContext.getBase()); + VariantContext vcCall = builder.make(); if ( annotationEngine != null ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations @@ -491,7 +487,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); // *** note that calculating strand bias involves overwriting data structures, so we do that last HashMap attributes = new HashMap(); @@ -504,10 +500,15 @@ public class UnifiedGenotyperEngine { myAlleles = new HashSet(1); myAlleles.add(vc.getReference()); } - VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, vc.getReferenceBaseForIndel()); - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); + VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles); + builder.genotypes(genotypes); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! 
passesCallThreshold(phredScaledConfidence) ) builder.filters(filter); + builder.attributes(attributes); + builder.referenceBaseForIndel(vc.getReferenceBaseForIndel()); + + return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF)); } private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { @@ -811,9 +812,6 @@ public class UnifiedGenotyperEngine { case EXACT: afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter); break; - case GRID_SEARCH: - afcm = new GridSearchAFEstimation(UAC, N, logger, verboseWriter); - break; default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index f531cadd4..09968f47e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -556,40 +556,51 @@ public class PairHMMIndelErrorModel { long indStart = start - haplotype.getStartPosition(); long indStop = stop - haplotype.getStartPosition(); - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), - (int)indStart, (int)indStop); - double readLikelihood; - if (matchMetricArray == null) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; + if (DEBUG) + System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString()); - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new 
double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - if (previousHaplotypeSeen == null) - startIdx = 0; - else { - final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); - final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); - startIdx = Math.min(Math.min(s1, s2), s3); - } - previousHaplotypeSeen = haplotypeBases.clone(); - previousGOP = currentContextGOP.clone(); - previousGCP = currentContextGCP.clone(); + if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { + // read spanned more than allowed reference context: we currently can't deal with this + readLikelihood =0; + } else + { + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), + (int)indStart, (int)indStop); + + if (matchMetricArray == null) { + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + } + final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); + final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + if (previousHaplotypeSeen == null) + startIdx = 0; + else { + final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); + final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); + startIdx = 
Math.min(Math.min(s1, s2), s3); + } + previousHaplotypeSeen = haplotypeBases.clone(); + previousGOP = currentContextGOP.clone(); + previousGCP = currentContextGCP.clone(); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, - currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); - if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIdx); + readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, + currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); + + if (DEBUG) { + System.out.println("H:"+new String(haplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + System.out.format("StPos:%d\n", startIdx); + } } readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 414ffa09c..aa9ae1517 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -58,9 +58,7 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import 
org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -1057,16 +1055,15 @@ public class SomaticIndelDetectorWalker extends ReadWalker { stop += event_length; } - Map genotypes = new HashMap(); - + GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - Map attrs = call.makeStatsAttributes(null); + Map attrs = call.makeStatsAttributes(null); if ( call.isCall() ) // we made a call - put actual het genotype here: - genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); } Set filters = null; @@ -1074,8 +1071,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker { filters = new HashSet(); filters.add("NoCall"); } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, null, refBases[(int)start-1]); + VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) + .genotypes(genotypes).filters(filters).referenceBaseForIndel(refBases[(int)start-1]).make(); vcf.add(vc); } @@ -1147,14 +1144,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker { homRefAlleles.add( alleles.get(0)); homRefAlleles.add( alleles.get(0)); - Map genotypes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); + genotypes.add(new Genotype(sample, homRefN ? 
homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false)); } for ( String sample : tumorSamples ) { - genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) ); + genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) ); } Set filters = null; @@ -1171,8 +1168,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker { filters.add("TCov"); } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, attrs, refBases[(int)start-1]); + VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) + .genotypes(genotypes).filters(filters).attributes(attrs).referenceBaseForIndel(refBases[(int)start-1]).make(); vcf.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java index 5a32479ab..54838b55e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java @@ -29,7 +29,7 @@ import java.util.Arrays; import java.util.LinkedList; import java.util.List; -public abstract class BaseArray implements Comparable { +abstract class BaseArray implements Comparable { protected Byte[] bases; public BaseArray(byte[] bases) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java index 06f4d3ab2..45a1ab04c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java @@ -30,7 +30,7 @@ import java.util.Iterator; /* * CardinalityCounter 
object allows user to iterate over all assignment of arbitrary-cardinality variables. */ -public class CardinalityCounter implements Iterator, Iterable { +class CardinalityCounter implements Iterator, Iterable { private int[] cards; private int[] valList; private boolean hasNext; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java index 4ec940f4f..e88a7104d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java @@ -30,7 +30,7 @@ import java.util.NoSuchElementException; It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned to save the current pointer for a later time (while the original iterator can continue to iterate). */ -public class CloneableIteratorLinkedList { +class CloneableIteratorLinkedList { private CloneableIteratorDoublyLinkedNode first; private CloneableIteratorDoublyLinkedNode last; private int size; diff --git a/public/java/src/org/broadinstitute/sting/utils/DisjointSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/utils/DisjointSet.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java index 52c18e6d6..c054af5d6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/DisjointSet.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java @@ -21,13 +21,13 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.gatk.walkers.phasing; import java.util.Collection; import java.util.Set; import java.util.TreeSet; -public class DisjointSet { +class DisjointSet { private ItemNode[] nodes; public DisjointSet(int numItems) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java index 3c20a311e..61d5a725e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -27,7 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; -public class Haplotype extends BaseArray implements Cloneable { +class Haplotype extends BaseArray implements Cloneable { public Haplotype(byte[] bases) { super(bases); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java deleted file mode 100644 index 809772c05..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - - -/** - * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}) -@By(DataSource.REFERENCE_ORDERED_DATA) - -public class MergeMNPsWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; - - @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false) - protected int maxGenomicDistanceForMNP = 1; - - @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) - public RodBinding variants; - - public void initialize() { - initializeVcfWriter(); - } - - private void initializeVcfWriter() { - // false <-> don't take control of writer, since didn't create it: - vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false); - writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site, send it to be (possibly) merged with previously observed sites. 
- * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return dummy Integer - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - for (VariantContext vc : tracker.getValues(variants, context.getLocation())) - writeVCF(vc); - - return 0; - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, vcMergerWriter, logger); - } - - public Integer reduce(Integer result, Integer total) { - if (result == null) - return total; - - return total + result; - } - - /** - * Release any VariantContexts not yet processed. - * - * @param result Empty for now... - */ - public void onTraversalDone(Integer result) { - vcMergerWriter.close(); - - System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge()); - System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule()); - System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords()); - System.out.println(vcMergerWriter.getAltAlleleStats()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index 53cfaa3a9..2f15c165f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2011, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining 
a copy of this software and associated documentation @@ -33,10 +33,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; @@ -44,7 +41,7 @@ import java.util.*; // Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts -public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { +class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private VCFWriter innerWriter; private GenomeLocParser genomeLocParser; @@ -52,7 +49,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private ReferenceSequenceFile referenceFileForMNPmerging; private VariantContextMergeRule vcMergeRule; - private VariantContextUtils.AlleleMergeRule alleleMergeRule; + private PhasingUtils.AlleleMergeRule alleleMergeRule; private String useSingleSample = null; @@ -71,7 +68,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { // Should we call innerWriter.close() in close() private boolean takeOwnershipOfInner; - public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, 
boolean trackAltAlleleStats) { + public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, PhasingUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { this.innerWriter = innerWriter; this.genomeLocParser = genomeLocParser; try { @@ -122,7 +119,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (useSingleSample != null) { // only want to output context for one sample Genotype sampGt = vc.getGenotype(useSingleSample); if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP! - vc = vc.subContextFromGenotypes(sampGt); + vc = vc.subContextFromSample(sampGt.getSampleName()); else // asked for a sample that this vc does not contain, so ignore this vc: return; } @@ -179,14 +176,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { boolean mergedRecords = false; if (shouldAttemptToMerge) { numRecordsSatisfyingMergeRule++; - VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule); + VariantContext mergedVc = PhasingUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule); if (mergedVc != null) { mergedRecords = true; Map addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc); addedAttribs.putAll(mergedVc.getAttributes()); - mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); + mergedVc = new VariantContextBuilder(mergedVc).attributes(addedAttribs).make(); vcfrWaitingToMerge = new VCFRecord(mergedVc, true); numMergedRecords++; @@ -218,26 +215,6 @@ public class 
MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { filteredVcfrList.clear(); } - public int getNumRecordsAttemptToMerge() { - return numRecordsAttemptToMerge; - } - - public int getNumRecordsSatisfyingMergeRule() { - return numRecordsSatisfyingMergeRule; - } - - public int getNumMergedRecords() { - return numMergedRecords; - } - - public VariantContextMergeRule getVcMergeRule() { - return vcMergeRule; - } - - public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() { - return alleleMergeRule; - } - /** * Gets a string representation of this object. * @@ -248,13 +225,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { return getClass().getName(); } - public String getAltAlleleStats() { - if (altAlleleStats == null) - return ""; - - return "\n" + altAlleleStats.toString(); - } - private static class VCFRecord { public VariantContext vc; public boolean resultedFromMerge; @@ -373,7 +343,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (shouldAttemptToMerge) { aas.numSuccessiveGenotypesAttemptedToBeMerged++; - if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) { + if (!PhasingUtils.alleleSegregationIsKnown(gt1, gt2)) { aas.segregationUnknown++; logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2)); } @@ -498,9 +468,9 @@ class DistanceMergeRule extends VariantContextMergeRule { } -class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule { +class ExistsDoubleAltAlleleMergeRule extends PhasingUtils.AlleleMergeRule { public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { - return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2); + return PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2); } public String toString() { @@ -515,7 +485,7 @@ class 
SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule { public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { // Must be interesting AND consistent: - return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2); + return super.allelesShouldBeMerged(vc1, vc2) && PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2); } public String toString() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java deleted file mode 100644 index 96d5c471f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}) -@By(DataSource.REFERENCE_ORDERED_DATA) - -public class MergeSegregatingAlternateAllelesWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; - - @Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false) - protected int maxGenomicDistance = 1; - - @Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false) - protected String useSingleSample = null; - - @Hidden - @Argument(fullName = "emitOnlyMergedRecords", shortName = "emitOnlyMerged", doc = "Only output records that resulted from merging [For DEBUGGING purposes only - DO NOT USE, since it disregards the semantics of '|' as 'phased relative to previous non-filtered VC']; [default:false]", required = false) - protected boolean emitOnlyMergedRecords = false; - - @Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false) - protected boolean disablePrintAlternateAlleleStatistics = false; - - public final static String IGNORE_REFSEQ = "IGNORE"; - public final static String UNION_REFSEQ = "UNION"; - public final static String INTERSECT_REFSEQ = "INTERSECT"; - - @Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false) - 
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ; - - @Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false) - protected boolean dontRequireSomeSampleHasDoubleAltAllele = false; - - @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) - public RodBinding variants; - - public void initialize() { - initializeVcfWriter(); - } - - private void initializeVcfWriter() { - GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser(); - - VariantContextMergeRule vcMergeRule; - if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ)) - vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser); - else - vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation); - - VariantContextUtils.AlleleMergeRule alleleMergeRule; - if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them if there is a trailing prefix of polymorphisms (i.e., upstream polymorphic site): - alleleMergeRule = new PrefixPolymorphismMergeAllelesRule(); - else - alleleMergeRule = new ExistsDoubleAltAlleleMergeRule(); - - // false <-> don't take control of writer, since didn't create it: - vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics); - writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new 
VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site, send it to be (possibly) merged with previously observed sites. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return dummy Integer - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - for (VariantContext vc : tracker.getValues(variants, context.getLocation())) - writeVCF(vc); - - return 0; - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, vcMergerWriter, logger); - } - - public Integer reduce(Integer result, Integer total) { - if (result == null) - return total; - - return total + result; - } - - /** - * Release any VariantContexts not yet processed. - * - * @param result Empty for now... 
- */ - public void onTraversalDone(Integer result) { - vcMergerWriter.close(); - - if (useSingleSample != null) - System.out.println("Only considered single sample: " + useSingleSample); - - System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge()); - System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule()); - System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords()); - System.out.println(vcMergerWriter.getAltAlleleStats()); - } -} - - -enum MergeBasedOnRefSeqAnnotation { - UNION_WITH_DIST, INTERSECT_WITH_DIST -} - -class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule { - private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation; - - public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) { - super(maxGenomicDistanceForMNP, genomeLocParser); - - if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ)) - this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST; - else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ)) - this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST; - else - throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!"); - } - - public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) { - boolean withinDistance = super.shouldAttemptToMerge(vc1, vc2); - - if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST) - return 
withinDistance || sameGene(vc1, vc2); - else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST - return withinDistance && sameGene(vc1, vc2); - } - - private boolean sameGene(VariantContext vc1, VariantContext vc2) { - Set names_vc1 = RefSeqDataParser.getRefSeqNames(vc1); - Set names_vc2 = RefSeqDataParser.getRefSeqNames(vc2); - names_vc1.retainAll(names_vc2); - - if (!names_vc1.isEmpty()) - return true; - - // Check refseq.name2: - Set names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true); - Set names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true); - names2_vc1.retainAll(names2_vc2); - - return !names2_vc1.isEmpty(); - } - - public String toString() { - return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene"; - } - - public Map addToMergedAttributes(VariantContext vc1, VariantContext vc2) { - Map addedAttribs = super.addToMergedAttributes(vc1, vc2); - addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2)); - return addedAttribs; - } -} - - - -class PrefixPolymorphismMergeAllelesRule extends VariantContextUtils.AlleleMergeRule { - public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { - return vc1.isPolymorphic(); - } - - public String toString() { - return super.toString() + ", there exists a polymorphism at the start of the merged allele"; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 847165e3e..cea7dd007 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -13,10 +13,7 @@ import org.broadinstitute.sting.utils.MathUtils; import 
org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; @@ -135,7 +132,7 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Allele NO_CALL = Allele.create(".",false); private final String DUMMY_NAME = "DummySample"; - private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); private ArrayList getAlleles(Genotype.Type genotype){ ArrayList alleles = new ArrayList(2); @@ -165,10 +162,10 @@ public class PhaseByTransmission extends RodWalker, HashMa //Homozygous genotypes will be set as phased, heterozygous won't be private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true)); } else - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false)); } //Find the phase for a parent/child pair @@ -176,8 +173,8 @@ public class PhaseByTransmission extends RodWalker, HashMa //Special case for Het/Het as it is ambiguous 
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); return; } @@ -189,23 +186,23 @@ public class PhaseByTransmission extends RodWalker, HashMa //If there is a possible phasing between the mother and child => phase int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); if(childTransmittedAlleleIndex > -1){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); } else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - parentPhasedAlleles.add(parentAlleles.get(1)); - parentPhasedAlleles.add(parentAlleles.get(0)); - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, 
true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); } //This is a Mendelian Violation => Do not phase else{ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); } } @@ -239,7 +236,7 @@ public class PhaseByTransmission extends RodWalker, HashMa motherPhasedAlleles.add(motherAlleles.get(0)); else motherPhasedAlleles.add(motherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); //Create father's genotype ArrayList fatherPhasedAlleles = new ArrayList(2); @@ -248,10 +245,10 @@ public class PhaseByTransmission extends RodWalker, HashMa 
fatherPhasedAlleles.add(fatherAlleles.get(0)); else fatherPhasedAlleles.add(fatherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); //Create child's genotype - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); //Once a phased combination is found; exit return; @@ -259,9 +256,9 @@ public class PhaseByTransmission extends RodWalker, HashMa } //If this is reached then no phasing could be found - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false)); } /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. @@ -301,26 +298,26 @@ public class PhaseByTransmission extends RodWalker, HashMa } } - /** - * Applies the trio genotype combination to the given trio. 
- * @param ref: Reference allele - * @param alt: Alternate allele - * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination - * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination - * @param childGenotype: Genotype of the child to phase using this trio genotype combination - * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) - * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child - */ + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); } - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double 
transmissionProb, Genotype phasedGenotype){ + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ - int phredScoreTransmission = -1; - if(transmissionProb != NO_TRANSMISSION_PROB) + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); //Handle null, missing and unavailable genotypes @@ -336,27 +333,27 @@ public class PhaseByTransmission extends RodWalker, HashMa if(transmissionProb>NO_TRANSMISSION_PROB) genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); - ArrayList phasedAlleles = new ArrayList(2); - for(Allele allele : phasedGenotype.getAlleles()){ - if(allele.isReference()) - phasedAlleles.add(refAllele); - else if(allele.isNonReference()) - phasedAlleles.add(altAllele); - //At this point there should not be any other alleles left - else - throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); - } + } - //Compute the new Log10Error if the genotype is different from the original genotype - double negLog10Error; - if(genotype.getType() == phasedGenotype.getType()) - negLog10Error = genotype.getNegLog10PError(); - else - negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + //Compute the new Log10Error if the genotype is different from the original genotype + double log10Error; + if(genotype.getType() == phasedGenotype.getType()) + log10Error = genotype.getLog10PError(); + else + log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); - return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); - } + return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased()); + } } @@ -404,14 +401,14 @@ public class PhaseByTransmission extends RodWalker, HashMa } else{ for(Sample familyMember : family){ - parents = familyMember.getParents(); - if(parents.size()>0){ + parents = familyMember.getParents(); + if(parents.size()>0){ if(family.containsAll(parents)) this.trios.add(familyMember); else logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. 
Family skipped.",familyID)); break; - } + } } } @@ -426,11 +423,11 @@ public class PhaseByTransmission extends RodWalker, HashMa mvCountMatrix = new EnumMap>>(Genotype.Type.class); transmissionMatrix = new EnumMap>>(Genotype.Type.class); for(Genotype.Type mother : Genotype.Type.values()){ - mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); - transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); - for(Genotype.Type father : Genotype.Type.values()){ - mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); - transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); for(Genotype.Type child : Genotype.Type.values()){ mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); @@ -671,9 +668,9 @@ public class PhaseByTransmission extends RodWalker, HashMa else phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return bestMVCount.get(configuration_index); + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); } @@ -682,14 +679,14 @@ public class PhaseByTransmission extends RodWalker, HashMa //Increment metrics counters 
if(parent.isCalled() && child.isCalled()){ - counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); - if(parent.isPhased()) - counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); - else{ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); if(parent.isHet() && child.isHet()) counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); - } + } }else{ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); } @@ -700,21 +697,21 @@ public class PhaseByTransmission extends RodWalker, HashMa //Increment metrics counters if(mother.isCalled() && father.isCalled() && child.isCalled()){ - counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); - if(mother.isPhased()) - counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); - else{ - if(mvCount > 0){ - if(mvCount >1) + else{ + if(mvCount > 0){ + if(mvCount >1) counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); - else - counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); - } - else if(mother.isHet() && father.isHet() && child.isHet()) - counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - } + } }else{ counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); } 
@@ -749,11 +746,9 @@ public class PhaseByTransmission extends RodWalker, HashMa if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + VariantContextBuilder builder = new VariantContextBuilder(vc); - Map genotypeMap = vc.getGenotypes(); - - int mvCount; - + GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); for (Sample sample : trios) { Genotype mother = vc.getGenotype(sample.getMaternalID()); Genotype father = vc.getGenotype(sample.getPaternalID()); @@ -764,18 +759,18 @@ public class PhaseByTransmission extends RodWalker, HashMa continue; ArrayList trioGenotypes = new ArrayList(3); - mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); + final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); //Fill the genotype map with the new genotypes and increment metrics counters - genotypeMap.put(phasedChild.getSampleName(),phasedChild); + genotypesContext.replace(phasedChild); if(mother != null){ - genotypeMap.put(phasedMother.getSampleName(), phasedMother); + genotypesContext.replace(phasedMother); if(father != null){ - genotypeMap.put(phasedFather.getSampleName(), phasedFather); + genotypesContext.replace(phasedFather); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = 
String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) @@ -789,24 +784,21 @@ public class PhaseByTransmission extends RodWalker, HashMa } } else{ - genotypeMap.put(phasedFather.getSampleName(),phasedFather); + genotypesContext.replace(phasedFather); updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); - } + } //Report violation if set so 
//TODO: ADAPT FOR PAIRS TOO!! if(mvCount>0 && mvFile != null) mvFile.println(mvfLine); - } - - VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); - - vcfWriter.add(newvc); + builder.genotypes(genotypesContext); + vcfWriter.add(builder.make()); } return metricsCounters; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java index fe2792475..8f980ad72 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java @@ -23,12 +23,10 @@ */ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.sting.utils.DisjointSet; - import java.util.*; // Represents an undirected graph with no self-edges: -public class PhasingGraph implements Iterable { +class PhasingGraph implements Iterable { private Neighbors[] adj; public PhasingGraph(int numVertices) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java index 56197a85f..053b09439 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; /* Edge class for PhasingGraph */ -public class PhasingGraphEdge implements Comparable { +class PhasingGraphEdge implements Comparable { protected int v1; protected int v2; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index 63fb33295..a95b13d68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; -public class PhasingRead extends BaseArray { +class PhasingRead extends BaseArray { private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java new file mode 100644 index 000000000..75d0773f1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +class PhasingUtils { + static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { + if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) + return null; + + // Check that it's logically possible to merge the VCs: + if (!allSamplesAreMergeable(vc1, vc2)) + return null; + + // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) + if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) + return null; + + return reallyMergeIntoMNP(vc1, vc2, referenceFile); + } + + static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { + int startInter = vc1.getEnd() + 1; + int endInter = vc2.getStart() - 1; + byte[] intermediateBases = null; + if (startInter <= endInter) { + intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); + StringUtil.toUpperCase(intermediateBases); + } + MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added + + GenotypesContext mergedGenotypes = GenotypesContext.create(); + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + List mergedAllelesForSample = new LinkedList(); + + /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, + we preserve phase information (if any) relative to whatever precedes vc1: + */ + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); + mergedAllelesForSample.add(mergedAllele); + } + + 
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError()); + Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered + + Map mergedGtAttribs = new HashMap(); + PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); + if (phaseQual.PQ != null) + mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); + + Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); + mergedGenotypes.add(mergedGt); + } + + String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource()); + double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError()); + Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered + Map mergedAttribs = mergeVariantContextAttributes(vc1, vc2); + + // ids + List mergedIDs = new ArrayList(); + if ( vc1.hasID() ) mergedIDs.add(vc1.getID()); + if ( vc2.hasID() ) mergedIDs.add(vc2.getID()); + String mergedID = mergedIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs); + + VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs); + VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true); + return mergedBuilder.make(); + } + + static String mergeVariantContextNames(String name1, String name2) { + return name1 + "_" + name2; + } + + static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { + Map mergedAttribs = new HashMap(); + + List vcList = new LinkedList(); + vcList.add(vc1); + vcList.add(vc2); + + String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; + for (String orAttrib : MERGE_OR_ATTRIBS) { + boolean attribVal = false; + for (VariantContext vc : vcList) { + attribVal = vc.getAttributeAsBoolean(orAttrib, false); + if (attribVal) // already true, so no reason to continue: + break; + } + mergedAttribs.put(orAttrib, attribVal); + } + + return mergedAttribs; + } + + static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { + GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); + GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); + + if (!loc1.onSameContig(loc2)) + throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); + + if (!loc1.isBefore(loc2)) + throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); + + if (vc1.isFiltered() || vc2.isFiltered()) + return false; + + if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets + return false; + + if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) + return false; + + return true; + } + + static boolean 
allGenotypesAreUnfilteredAndCalled(VariantContext vc) { + for (final Genotype gt : vc.getGenotypes()) { + if (gt.isNoCall() || gt.isFiltered()) + return false; + } + + return true; + } + + static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { + // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom + return false; + } + + return true; + } + + static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + if (gt1.getPloidy() != gt2.getPloidy()) + return false; + + /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. + + HOWEVER, EVEN if this is not the case, but gt1.isHom(), + it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. + */ + return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); + } + + static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { + if (gt2.isPhased() || gt2.isHom()) + return new PhaseAndQuality(gt1); // maintain the phase of gt1 + + if (!gt1.isHom()) + throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); + + /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype + + For example, if we're merging the third Genotype with the second one: + 0/1 + 1|1 + 0/1 + + Then, we want to output: + 0/1 + 1/2 + */ + return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] + } + + static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); 
+ + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate + return true; + } + } + + return false; + } + + static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { + // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): + Map allele1ToAllele2 = new HashMap(); + Map allele2ToAllele1 = new HashMap(); + + // Note the segregation of the alleles for the reference genome: + allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); + allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); + + // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). 
+ for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); + + Allele all1To2 = allele1ToAllele2.get(all1); + if (all1To2 == null) + allele1ToAllele2.put(all1, all2); + else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 + return false; + + Allele all2To1 = allele2ToAllele1.get(all2); + if (all2To1 == null) + allele2ToAllele1.put(all2, all1); + else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 + return false; + } + } + + return true; + } + + abstract static class AlleleMergeRule { + // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): + abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); + + public String toString() { + return "all samples are mergeable"; + } + } + + static class AlleleOneAndTwo { + private Allele all1; + private Allele all2; + + public AlleleOneAndTwo(Allele all1, Allele all2) { + this.all1 = all1; + this.all2 = all2; + } + + public int hashCode() { + return all1.hashCode() + all2.hashCode(); + } + + public boolean equals(Object other) { + if (!(other instanceof AlleleOneAndTwo)) + return false; + + AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; + return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); + } + } + + static class MergedAllelesData { + private Map mergedAlleles; + private byte[] intermediateBases; + private int intermediateLength; + + public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { + this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo + this.intermediateBases = intermediateBases; + 
this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; + + this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); + } + + public Allele ensureMergedAllele(Allele all1, Allele all2) { + return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor + } + + private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { + AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); + Allele mergedAllele = mergedAlleles.get(all12); + + if (mergedAllele == null) { + byte[] bases1 = all1.getBases(); + byte[] bases2 = all2.getBases(); + + byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; + System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); + if (intermediateBases != null) + System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); + System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); + + mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); + mergedAlleles.put(all12, mergedAllele); + } + + return mergedAllele; + } + + public Set getAllMergedAlleles() { + return new HashSet(mergedAlleles.values()); + } + } + + static class PhaseAndQuality { + public boolean isPhased; + public Double PQ = null; + + public PhaseAndQuality(Genotype gt) { + this.isPhased = gt.isPhased(); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java index 99446705e..b68739b48 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; /* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values with precision (prevents underflow by representing in log10 space). */ -public class PreciseNonNegativeDouble implements Comparable { +class PreciseNonNegativeDouble implements Comparable { private static final double EQUALS_THRESH = 1e-6; private static final double INFINITY = Double.POSITIVE_INFINITY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 68fbe8ce2..9470ce2f4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -34,17 +34,13 @@ import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.DisjointSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import 
java.util.*; @@ -125,7 +121,8 @@ public class ReadBackedPhasingWalker extends RodWalker samplesToPhase = null; + protected Set + samplesToPhase = null; private GenomeLoc mostDownstreamLocusReached = null; @@ -275,10 +272,10 @@ public class ReadBackedPhasingWalker extends RodWalker KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); - private VariantContext reduceVCToSamples(VariantContext vc, List samplesToPhase) { + private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { // for ( String sample : samplesToPhase ) // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values()); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); // logger.debug("original VC = " + vc); // logger.debug("sub VC = " + subvc); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); @@ -355,17 +352,16 @@ public class ReadBackedPhasingWalker extends RodWalker sampGenotypes = vc.getGenotypes(); + GenotypesContext sampGenotypes = vc.getGenotypes(); Map samplePhaseStats = new TreeMap(); - for (Map.Entry sampGtEntry : sampGenotypes.entrySet()) { - String samp = sampGtEntry.getKey(); - Genotype gt = sampGtEntry.getValue(); + for (final Genotype gt : sampGenotypes) { + String samp = gt.getSampleName(); if (DEBUG) logger.debug("sample = " + samp); if (isUnfilteredCalledDiploidGenotype(gt)) { if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: // true <-> can trivially phase a hom site relative to ANY previous site: - Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true); + Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), 
gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true); uvc.setGenotype(samp, phasedGt); } else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype @@ -401,7 +397,7 @@ public class ReadBackedPhasingWalker extends RodWalker gtAttribs = new HashMap(gt.getAttributes()); gtAttribs.put(PQ_KEY, pr.phaseQuality); - Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); + Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); uvc.setGenotype(samp, phasedGt); } @@ -421,7 +417,7 @@ public class ReadBackedPhasingWalker extends RodWalker handledGtAttribs = new HashMap(handledGt.getAttributes()); handledGtAttribs.put(PQ_KEY, pr.phaseQuality); - Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); + Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); interiorUvc.setGenotype(samp, phasedHomGt); } } @@ -1055,7 +1051,7 @@ public class ReadBackedPhasingWalker extends RodWalker alleles; - private Map genotypes; - private double negLog10PError; + private Map genotypes; + private double log10PError; private Set filters; private Map attributes; + private String id; public UnfinishedVariantContext(VariantContext vc) { this.name = vc.getSource(); + this.id = vc.getID(); this.contig = vc.getChr(); this.start = vc.getStart(); this.stop = vc.getEnd(); this.alleles = vc.getAlleles(); - this.genotypes = new HashMap(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable - this.negLog10PError = vc.getNegLog10PError(); + + this.genotypes = new HashMap(); + for ( final Genotype g : 
vc.getGenotypes() ) { + this.genotypes.put(g.getSampleName(), g); + } + + this.log10PError = vc.getLog10PError(); this.filters = vc.filtersWereApplied() ? vc.getFilters() : null; this.attributes = new HashMap(vc.getAttributes()); } public VariantContext toVariantContext() { - return new VariantContext(name, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); + GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); + return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) + .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); } public GenomeLoc getLocation() { @@ -1156,7 +1161,7 @@ public class ReadBackedPhasingWalker extends RodWalker { +class ReadBasesAtPosition implements Iterable { // list of: private LinkedList bases; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java deleted file mode 100644 index f94140814..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields: - */ -public class RefSeqDataParser { - private static String REFSEQ_PREFIX = "refseq."; - - private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords"; - private static String NAME_KEY = REFSEQ_PREFIX + "name"; - private static String NAME2_KEY = REFSEQ_PREFIX + "name2"; - - private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY}; - - private static Map getRefSeqEntriesToNames(VariantContext vc, boolean getName2) { - String nameKeyToUse = getName2 ? 
NAME2_KEY : NAME_KEY; - String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; - - Map entriesToNames = new HashMap(); - int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); - if (numRecords != -1) { - boolean done = false; - - if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsString(nameKeyToUse, null); - if (name != null) { - entriesToNames.put(nameKeyToUse, name); - done = true; - } - } - - if (!done) { - for (int i = 1; i <= numRecords; i++) { - String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsString(key, null); - if (name != null) - entriesToNames.put(key, name); - } - } - } - else { // no entry with the # of records: - String name = vc.getAttributeAsString(nameKeyToUse, null); - if (name != null) { - entriesToNames.put(nameKeyToUse, name); - } - else { // Check all INFO fields for a match (if there are multiple entries): - for (Map.Entry entry : vc.getAttributes().entrySet()) { - String key = entry.getKey(); - if (key.startsWith(nameKeyToUseMultiplePrefix)) - entriesToNames.put(key, entry.getValue().toString()); - } - } - } - return entriesToNames; - } - - private static Map getRefSeqEntriesToNames(VariantContext vc) { - return getRefSeqEntriesToNames(vc, false); - } - - public static Set getRefSeqNames(VariantContext vc, boolean getName2) { - return new TreeSet(getRefSeqEntriesToNames(vc, getName2).values()); - } - - public static Set getRefSeqNames(VariantContext vc) { - return getRefSeqNames(vc, false); - } - - public static Map getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) { - Map refSeqNameAttribs = new HashMap(); - - Map entriesMap1 = getAllRefSeqEntriesByName(vc1); - Map entriesMap2 = getAllRefSeqEntriesByName(vc2); - - Set commonNames = entriesMap1.keySet(); - commonNames.retainAll(entriesMap2.keySet()); - boolean addSuffix = commonNames.size() > 1; - int nextCount = 1; - - for (String name : commonNames) { - RefSeqEntry 
refseq1 = entriesMap1.get(name); - RefSeqEntry refseq2 = entriesMap2.get(name); - - String keySuffix = ""; - if (addSuffix) - keySuffix = "_" + nextCount; - - boolean added = false; - for (String key : NAME_KEYS) { - Object obj1 = refseq1.info.get(key); - Object obj2 = refseq2.info.get(key); - if (obj1 != null && obj2 != null && obj1.equals(obj2)) { - added = true; - String useKey = key + keySuffix; - refSeqNameAttribs.put(useKey, obj1); - } - } - if (added) - nextCount++; - } - int totalCount = nextCount - 1; // since incremented count one extra time - if (totalCount > 1) - refSeqNameAttribs.put(NUM_RECORDS_KEY, totalCount); - - return refSeqNameAttribs; - } - - public static Map removeRefSeqAttributes(Map attributes) { - Map removedRefSeqAttributes = new HashMap(attributes); - - Iterator> attrIt = removedRefSeqAttributes.entrySet().iterator(); - while (attrIt.hasNext()) { - String key = attrIt.next().getKey(); - if (key.startsWith(REFSEQ_PREFIX)) - attrIt.remove(); - } - - return removedRefSeqAttributes; - } - - private static Map getAllRefSeqEntriesByName(VariantContext vc) { - Map nameToEntries = new TreeMap(); - - List allEntries = getAllRefSeqEntries(vc); - for (RefSeqEntry entry : allEntries) { - Object name = entry.info.get(NAME_KEY); - if (name != null) - nameToEntries.put(name.toString(), entry); - } - - return nameToEntries; - } - - // Returns a List of SEPARATE Map for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc. 
- private static List getAllRefSeqEntries(VariantContext vc) { - List allRefSeq = new LinkedList(); - - for (Map.Entry entryToName : getRefSeqEntriesToNames(vc).entrySet()) { - String entry = entryToName.getKey(); - String entrySuffix = entry.replaceFirst(NAME_KEY, ""); - allRefSeq.add(new RefSeqEntry(vc, entrySuffix)); - } - - return allRefSeq; - } - - private static class RefSeqEntry { - public Map info; - - public RefSeqEntry(VariantContext vc, String entrySuffix) { - this.info = new HashMap(); - - for (Map.Entry attribEntry : vc.getAttributes().entrySet()) { - String key = attribEntry.getKey(); - if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) { - String genericKey = key.replaceAll(entrySuffix, ""); - this.info.put(genericKey, attribEntry.getValue()); - } - } - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java index 153c4a23f..6a2381e29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java @@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; -public class SNPallelePair extends AllelePair { +class SNPallelePair extends AllelePair { public SNPallelePair(Genotype gt) { super(gt); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java deleted file mode 100644 index c10eaa2da..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of 
charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -public class WriteVCF { - public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) { - writer.add(vc); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java deleted file mode 100644 index e770418c1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.ReadPairWalker; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.List; - -/** - * Counts the number of read pairs encountered in a file sorted in - * query name order. Breaks counts down by total pairs and number - * of paired reads. - * - * - *

Input

- *

- * One or more bam files. - *

- * - *

Output

- *

- * Number of pairs seen. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CountPairs \
- *   -o output.txt \
- *   -I input.bam
- * 
- * - * @author mhanna - */ -public class CountPairsWalker extends ReadPairWalker { - @Output - private PrintStream out; - - /** - * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec. - */ - private long firstOfPair = 0; - - /** - * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec. - */ - private long secondOfPair = 0; - - /** - * A breakdown of the total number of reads seen with exactly the same read name. - */ - private List pairCountsByType = new ExpandingArrayList(); - - /** - * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser. - * @param reads Collection of reads having the same name. - * @return Semantics defined by implementer. - */ - @Override - public Integer map(Collection reads) { - if(pairCountsByType.get(reads.size()) != null) - pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1); - else - pairCountsByType.set(reads.size(),1L); - - for(SAMRecord read: reads) { - if(read.getFirstOfPairFlag()) firstOfPair++; - if(read.getSecondOfPairFlag()) secondOfPair++; - } - - return 1; - } - - /** - * No pairs at the beginning of a traversal. - * @return 0 always. - */ - @Override - public Long reduceInit() { - return 0L; - } - - /** - * Combine number of pairs seen in this iteration (always 1) with total number of pairs - * seen in previous iterations. - * @param value Pairs in this iteration (1), from the map function. - * @param sum Count of all pairs in prior iterations. - * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1). - */ - @Override - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Print summary statistics over the entire traversal. - * @param sum A count of all read pairs viewed. 
- */ - @Override - public void onTraversalDone(Long sum) { - out.printf("Total number of pairs : %d%n",sum); - out.printf("Total number of first reads in pair : %d%n",firstOfPair); - out.printf("Total number of second reads in pair: %d%n",secondOfPair); - for(int i = 1; i < pairCountsByType.size(); i++) { - if(pairCountsByType.get(i) == null) - continue; - out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i)); - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index e10334a77..6b4fec04e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.Arrays; import java.util.EnumSet; -import java.util.List; /* * Copyright (c) 2009 The Broad Institute diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index e64d00bf5..f370e2818 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -39,8 +39,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.Map; @@ -466,9 +466,7 @@ public class GenotypeAndValidateWalker extends RodWalker { } } } else /* (mask != null && validate == null ) */ { - if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) { + if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )) { logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); sequenceInvalid = true; @@ -281,7 +281,7 @@ public class ValidationAmplicons extends RodWalker { sequence.append('N'); indelCounter--; rawSequence.append(Character.toUpperCase((char)ref.getBase())); - } else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){ + } else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! 
mask.isMonomorphicInSamples() )){ logger.debug("SNP in mask found at " + ref.getLocus().toString()); if ( lowerCaseSNPs ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 4e4a1550d..10d4651b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; @@ -186,7 +187,7 @@ public class VariantEvalWalker extends RodWalker implements Tr * File containing tribble-readable features for the IntervalStratificiation */ @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false) - protected IntervalBinding intervalsFile = null; + public IntervalBinding intervalsFile = null; // Variables private Set jexlExpressions = new TreeSet(); @@ -264,9 +265,9 @@ public class VariantEvalWalker extends RodWalker implements Tr stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); for ( VariantStratifier vs : getStratificationObjects() ) { - if ( vs.getClass().getSimpleName().equals("Filter") ) + if ( vs.getName().equals("Filter") ) byFilterIsEnabled = true; - else if ( 
vs.getClass().getSimpleName().equals("Sample") ) + else if ( vs.getName().equals("Sample") ) perSampleIsEnabled = true; } @@ -311,16 +312,17 @@ public class VariantEvalWalker extends RodWalker implements Tr String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); // --------- track --------- sample - VariantContexts - - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); - HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for ( final RodBinding evalRod : evals ) { - final HashMap> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap>(0); + final Map> emptyEvalMap = Collections.emptyMap(); + final Map> evalSet = evalVCs.containsKey(evalRod) ? 
evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for ( final String sampleName : sampleNamesForStratification ) { - Set evalSetBySample = evalSet.get(sampleName); + Collection evalSetBySample = evalSet.get(sampleName); if ( evalSetBySample == null ) { evalSetBySample = new HashSet(1); evalSetBySample.add(null); @@ -330,16 +332,14 @@ public class VariantEvalWalker extends RodWalker implements Tr for ( VariantContext eval : evalSetBySample ) { // deal with ancestral alleles if requested if ( eval != null && aastr != null ) { - HashMap newAts = new HashMap(eval.getAttributes()); - newAts.put("ANCESTRALALLELE", aastr); - eval = VariantContext.modifyAttributes(eval, newAts); + eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for ( final RodBinding compRod : comps ) { // no sample stratification for comps - final HashMap> compSetHash = compVCs.get(compRod); - final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); + final HashMap> compSetHash = compVCs.get(compRod); + final Collection compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); @@ -383,7 +383,7 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } - private VariantContext findMatchingComp(final VariantContext eval, final Set comps) { + private VariantContext findMatchingComp(final VariantContext eval, final Collection comps) { // if no comps, return null if ( comps == null || comps.isEmpty() ) return null; @@ -448,20 +448,18 @@ public class VariantEvalWalker extends RodWalker implements Tr TableType t = (TableType) field.get(ve); String subTableName = ve.getClass().getSimpleName() + "." 
+ field.getName(); - String subTableDesc = datamap.get(field).description(); + final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; if (!report.hasTable(subTableName)) { - report.addTable(subTableName, subTableDesc); + report.addTable(subTableName, dataPointAnn.description()); table = report.getTable(subTableName); table.addPrimaryKey("entry", false); table.addColumn(subTableName, subTableName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); - - table.addColumn(columnName, "unknown"); + table.addColumn(vs.getName(), "unknown"); } table.addColumn("row", "unknown"); @@ -485,9 +483,8 @@ public class VariantEvalWalker extends RodWalker implements Tr String r = (String) t.getRowKeys()[row]; for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); - - table.set(stateKey.toString() + r, columnName, stateKey.get(vs.getClass().getSimpleName())); + final String columnName = vs.getName(); + table.set(stateKey.toString() + r, columnName, stateKey.get(columnName)); } for (int col = 0; col < t.getColumnKeys().length; col++) { @@ -508,9 +505,9 @@ public class VariantEvalWalker extends RodWalker implements Tr GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); + String columnName = vs.getName(); - table.set(stateKey.toString(), columnName, stateKey.get(vs.getClass().getSimpleName())); + table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } table.set(stateKey.toString(), field.getName(), field.get(ve)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java deleted file mode 100755 index 925bff9c0..000000000 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -class NewCompEvalGenotypes { - private GenomeLoc loc; - private Genotype compGt; - private Genotype evalGt; - - public NewCompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { - this.loc = loc; - this.compGt = compGt; - this.evalGt = evalGt; - } - - public GenomeLoc getLocus() { - return loc; - } - - public Genotype getCompGenotpye() { - return compGt; - } - public Genotype getEvalGenotype() { - return evalGt; - } - - public void setCompGenotype(Genotype compGt) { - this.compGt = compGt; - } - - public void setEvalGenotype(Genotype evalGt) { - this.evalGt = evalGt; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 9facb11b5..89d137ea9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -28,13 +28,13 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval sites at comp sites") long nVariantsAtComp = 0; - @DataPoint(description = "percentage of eval sites at comp sites") + @DataPoint(description = "percentage of eval sites at comp sites", format = "%.2f" ) double compRate = 0.0; @DataPoint(description = "number of concordant sites") long nConcordant = 0; - @DataPoint(description = "the concordance rate") + @DataPoint(description = "the concordance rate", format = "%.2f") double concordantRate = 0.0; public int getComparisonOrder() { @@ -72,7 +72,7 @@ 
public class CompOverlap extends VariantEvaluator implements StandardEval { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean evalIsGood = eval != null && eval.isPolymorphic(); + boolean evalIsGood = eval != null && eval.isPolymorphicInSamples(); boolean compIsGood = comp != null && comp.isNotFiltered(); if (evalIsGood) nEvalVariants++; // count the number of eval events diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index cba2781d8..c740eb78c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -62,17 +62,17 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nHomDerived = 0; // calculations that get set in the finalizeEvaluation method - @DataPoint(description = "heterozygosity per locus rate") + @DataPoint(description = "heterozygosity per locus rate", format = "%.2e") public double heterozygosity = 0; - @DataPoint(description = "heterozygosity per base pair") + @DataPoint(description = "heterozygosity per base pair", format = "%.2f") public double heterozygosityPerBp = 0; - @DataPoint(description = "heterozygosity to homozygosity ratio") + @DataPoint(description = "heterozygosity to homozygosity ratio", format = "%.2f") public double hetHomRatio = 0; - @DataPoint(description = "indel rate (insertion count + deletion count)") + @DataPoint(description = "indel rate (insertion count + deletion count)", format = "%.2e") public double indelRate = 0; - @DataPoint(description = "indel rate per base pair") + @DataPoint(description = "indel rate per base pair", format = "%.2f") public double indelRatePerBp = 0; - 
@DataPoint(description = "deletion to insertion ratio") + @DataPoint(description = "deletion to insertion ratio", format = "%.2f") public double deletionInsertionRatio = 0; private double perLocusRate(long n) { @@ -103,7 +103,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. - if ( vc1.isMonomorphic() ) { + if ( vc1.isMonomorphicInSamples() ) { nRefLoci++; } else { switch (vc1.getType()) { @@ -157,8 +157,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // A C A // A C C - for (Genotype g : vc1.getGenotypes().values()) { - String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null; + for (final Genotype g : vc1.getGenotypes()) { + final String altStr = vc1.getAlternateAlleles().size() > 0 ? 
vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null; switch (g.getType()) { case NO_CALL: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java deleted file mode 100644 index 3ab618496..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.EnumMap; -import java.util.HashMap; -import java.util.Map; - -@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table") -public class G1KPhaseITable extends VariantEvaluator { - // basic counts on various rates found - @DataPoint(description = "Number of samples") - public long nSamples = 0; - - @DataPoint(description = "Number of processed loci") - public long nProcessedLoci = 0; - - @DataPoint(description = "Number of SNPs") - public long nSNPs = 0; - @DataPoint(description = "SNP Novelty Rate") - public double SNPNoveltyRate = 0; - @DataPoint(description = "Mean number of SNPs per individual") - public long nSNPsPerSample = 0; - - @DataPoint(description = "Number of Indels") - public long nIndels = 0; - @DataPoint(description = "Indel Novelty Rate") - public double IndelNoveltyRate = 0; - @DataPoint(description = "Mean number of Indels per individual") - public long nIndelsPerSample = 0; - - @DataPoint(description = "Number of SVs") - public long nSVs = 0; - @DataPoint(description = "SV Novelty Rate") - public double SVNoveltyRate = 0; - @DataPoint(description = "Mean number of SVs per individual") - public long nSVsPerSample = 0; - - Map allVariantCounts, 
knownVariantCounts; - Map> countsPerSample; - - private final Map makeCounts() { - Map counts = new EnumMap(VariantContext.Type.class); - counts.put(VariantContext.Type.SNP, 0); - counts.put(VariantContext.Type.INDEL, 0); - counts.put(VariantContext.Type.SYMBOLIC, 0); - return counts; - } - - public void initialize(VariantEvalWalker walker) { - countsPerSample = new HashMap>(); - nSamples = walker.getSampleNamesForEvaluation().size(); - - for ( String sample : walker.getSampleNamesForEvaluation() ) { - countsPerSample.put(sample, makeCounts()); - } - - allVariantCounts = makeCounts(); - knownVariantCounts = makeCounts(); - } - - @Override public boolean enabled() { return true; } - - public int getComparisonOrder() { - return 2; // we only need to see each eval track - } - - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval == null || eval.isMonomorphic() ) return null; - - switch (eval.getType()) { -// case NO_VARIATION: -// // shouldn't get here -// break; - case SNP: - case INDEL: - case SYMBOLIC: - allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1); - if ( comp != null ) - knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1); - break; - default: - throw new UserException.BadInput("Unexpected variant context type: " + eval); - } - - // count variants per sample - for (final Genotype g : eval.getGenotypes().values()) { - if ( ! g.isNoCall() && ! 
g.isHomRef() ) { - int count = countsPerSample.get(g.getSampleName()).get(eval.getType()); - countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1); - } - } - - return null; // we don't capture any interesting sites - } - - private final int perSampleMean(VariantContext.Type type) { - long sum = 0; - for ( Map count : countsPerSample.values() ) { - sum += count.get(type); - } - return (int)(Math.round(sum / (1.0 * countsPerSample.size()))); - } - - private final double noveltyRate(VariantContext.Type type) { - int all = allVariantCounts.get(type); - int known = knownVariantCounts.get(type); - int novel = all - known; - return (novel / (1.0 * all)); - } - - public void finalizeEvaluation() { - nSNPs = allVariantCounts.get(VariantContext.Type.SNP); - nIndels = allVariantCounts.get(VariantContext.Type.INDEL); - nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC); - - nSNPsPerSample = perSampleMean(VariantContext.Type.SNP); - nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL); - nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC); - - SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); - IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); - SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java index bbd3f5f54..4f5aeed61 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java @@ -209,7 +209,7 @@ public class GenotypeConcordance extends VariantEvaluator { //public GenotypeConcordance(VariantEvalWalker parent) { // super(parent); - // discordantInteresting = parent.DISCORDANT_INTERESTING; + // 
discordantInteresting = parent.DISCORDANT_INTERESTING; //} public String getName() { @@ -277,8 +277,9 @@ public class GenotypeConcordance extends VariantEvaluator { // determine concordance for eval data if (eval != null) { - for (final String sample : eval.getGenotypes().keySet()) { - final Genotype.Type called = eval.getGenotype(sample).getType(); + for (final Genotype g : eval.getGenotypes() ) { + final String sample = g.getSampleName(); + final Genotype.Type called = g.getType(); final Genotype.Type truth; if (!validationIsValidVC || !validation.hasGenotype(sample)) { @@ -299,9 +300,9 @@ public class GenotypeConcordance extends VariantEvaluator { else { final Genotype.Type called = Genotype.Type.NO_CALL; - for (final String sample : validation.getGenotypes().keySet()) { - final Genotype.Type truth = validation.getGenotype(sample).getType(); - detailedStats.incrValue(sample, truth, called); + for (final Genotype g : validation.getGenotypes()) { + final Genotype.Type truth = g.getType(); + detailedStats.incrValue(g.getSampleName(), truth, called); // print out interesting sites /* @@ -410,8 +411,8 @@ class SampleStats implements TableType { public SampleStats(VariantContext vc, int nGenotypeTypes) { this.nGenotypeTypes = nGenotypeTypes; - for (String sample : vc.getGenotypes().keySet()) - concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); + for (final Genotype g : vc.getGenotypes()) + concordanceStats.put(g.getSampleName(), new long[nGenotypeTypes][nGenotypeTypes]); } public SampleStats(int genotypeTypes) { @@ -444,39 +445,6 @@ class SampleStats implements TableType { } } -/** - * Sample stats, but for AC - */ -class ACStats extends SampleStats { - private String[] rowKeys; - - public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { - super(nGenotypeTypes); - rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; - for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // 
todo -- assuming ploidy 2 here... - concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[i] = String.format("evalAC%d",i); - - } - - for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { - concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - } - - public String getName() { - return "Allele Count Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - throw new StingException("RowKeys is null!"); - } - return rowKeys; - } -} - /** * a table of sample names to genotype concordance summary statistics */ @@ -511,8 +479,8 @@ class SampleSummaryStats implements TableType { public SampleSummaryStats(final VariantContext vc) { concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - concordanceSummary.put(sample, new double[COLUMN_KEYS.length]); + for( final Genotype g : vc.getGenotypes() ) { + concordanceSummary.put(g.getSampleName(), new double[COLUMN_KEYS.length]); } } @@ -636,79 +604,3 @@ class SampleSummaryStats implements TableType { } } -/** - * SampleSummaryStats .. 
but for allele counts - */ -class ACSummaryStats extends SampleSummaryStats { - private String[] rowKeys; - - public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; - rowKeys[0] = ALL_SAMPLES_KEY; - for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[i+1] = String.format("evalAC%d",i); - } - for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - - } - - public String getName() { - return "Allele Count Summary Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null) { - throw new StingException("rowKeys is null!!"); - } - return rowKeys; - } -} - -class CompACNames implements Comparator{ - - final Logger myLogger; - private boolean info = true; - - public CompACNames(Logger l) { - myLogger = l; - } - - public boolean equals(Object o) { - return ( o.getClass() == CompACNames.class ); - } - - public int compare(Object o1, Object o2) { - if ( info ) { - myLogger.info("Sorting AC names"); - info = false; - } - //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); - return getRank(o1) - getRank(o2); - } - - public int getRank(Object o) { - if ( o.getClass() != String.class ) { - return Integer.MIN_VALUE/4; - } else { - String s = (String) o; - if ( s.startsWith("eval") ) { - return Integer.MIN_VALUE/4 + 1 + parseAC(s); - } else if ( s.startsWith("comp") ) { - return 1+ parseAC(s); - } else { - return Integer.MIN_VALUE/4; - } - } - } - - public int parseAC(String s) { - String[] g = s.split("AC"); - return Integer.parseInt(g[1]); 
- } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index e69dbfb28..ea12ada48 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.HashMap; @@ -91,13 +92,13 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { Set allSamples = new HashSet(); - Map compSampGenotypes = null; + GenotypesContext compSampGenotypes = null; if (isRelevantToPhasing(comp)) { allSamples.addAll(comp.getSampleNames()); compSampGenotypes = comp.getGenotypes(); } - Map evalSampGenotypes = null; + GenotypesContext evalSampGenotypes = null; if (isRelevantToPhasing(eval)) { allSamples.addAll(eval.getSampleNames()); evalSampGenotypes = eval.getGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index ffe7c185f..ccec9af12 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -91,7 +91,7 @@ public class IndelLengthHistogram extends VariantEvaluator { public String 
update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( vc1.isIndel() && vc1.isPolymorphic() ) { + if ( vc1.isIndel() && vc1.isPolymorphicInSamples() ) { if ( ! vc1.isBiallelic() ) { //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index f70e6c2de..87b453ae3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -8,11 +8,9 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; import org.broadinstitute.sting.utils.IndelUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.HashMap; /* * Copyright (c) 2010 The Broad Institute @@ -270,7 +268,7 @@ public class IndelStatistics extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (eval != null && eval.isPolymorphic()) { + if (eval != null && eval.isPolymorphicInSamples()) { if ( indelStats == null ) { indelStats = new IndelStats(eval); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index a0cc393d9..0cadf6c0d 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -147,7 +147,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator { } private boolean includeGenotype(Genotype g) { - return g.getNegLog10PError() > getQThreshold() && g.isCalled(); + return g.getLog10PError() > getQThreshold() && g.isCalled(); } public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java deleted file mode 100755 index 2d0163206..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ /dev/null @@ -1,194 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Degeneracy; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Sample; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.StateKey; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import 
org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -/** - * @author depristo - * @since Apr 11, 2010 - */ - -@Analysis(name = "Quality Metrics by allele count", description = "Shows various stats binned by allele count") -public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(description = "TiTv by allele count") - MetricsByAc metrics = null; - - private final static Object[] METRIC_COLUMNS = {"AC", "nTi", "nTv", "n", "TiTv"}; - private int numSamples; - - class MetricsAtAC { - public int ac = -1, nTi = 0, nTv = 0; - - public MetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - if ( VariantContextUtils.isTransition(eval) ) - nTi++; - else - nTv++; - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nTi); - case 2: return String.valueOf(nTv); - case 3: return String.valueOf(nTi + nTv); - case 4: return String.valueOf(ratio(nTi, nTv)); - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class MetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public MetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new MetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "MetricsByAc"; - } - - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - return ""; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = 
eval.getAttributeAsInt("AC", -1); - } - - if ( ac != -1 ) { - metrics.get(ac).update(eval); - } - } - } - - public void initialize(VariantEvalWalker walker) { - numSamples = walker.getNumSamples(); - metrics = new MetricsByAc(2*numSamples); - } - - public String getName() { - return "SimpleMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (numSamples == 0) { - return null; - } - - final String interesting = null; - - if (eval != null) { - if ( metrics == null ) { - int nSamples = numSamples; - - if ( nSamples != -1 ) { - metrics = new MetricsByAc(2 * nSamples); - } - } - - if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - @Override - public boolean stateIsApplicable(StateKey stateKey) { - String sampleClassName = Sample.class.getSimpleName(); - String degeneracyClassName = Degeneracy.class.getSimpleName(); - - //return !(stateKey.containsKey(sampleClassName) && !stateKey.get(sampleClassName).equalsIgnoreCase("all")); - - if (stateKey.containsKey(sampleClassName) && !stateKey.get(sampleClassName).equalsIgnoreCase("all")) { - return false; - } - - if (stateKey.containsKey(degeneracyClassName) && !stateKey.get(degeneracyClassName).equalsIgnoreCase("all")) { - return false; - } - - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index e51623c3c..bb7843361 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -37,7 +37,7 @@ public class ThetaVariantEvaluator extends VariantEvaluator { } public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) { + if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphicInSamples()) { return null; //no interesting sites } @@ -48,7 +48,7 @@ public class ThetaVariantEvaluator extends VariantEvaluator { float numGenosHere = 0; int numIndsHere = 0; - for (Genotype genotype : vc.getGenotypes().values()) { + for (final Genotype genotype : vc.getGenotypes()) { numIndsHere++; if (!genotype.isNoCall()) { //increment stats for heterozygosity diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 9b6e145e6..9de850d82 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -16,19 +16,19 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv long nTi = 0; @DataPoint(description = "number of transversion loci") long nTv = 0; - @DataPoint(description = "the transition to transversion ratio") + @DataPoint(description = "the transition to transversion ratio", format = "%.2f") double tiTvRatio = 0.0; @DataPoint(description = "number of comp transition sites") long nTiInComp = 0; @DataPoint(description = "number of comp transversion sites") long nTvInComp = 0; - @DataPoint(description = "the transition to transversion ratio 
for comp sites") + @DataPoint(description = "the transition to transversion ratio for comp sites", format = "%.2f") double TiTvRatioStandard = 0.0; @DataPoint(description = "number of derived transition loci") long nTiDerived = 0; @DataPoint(description = "number of derived transversion loci") long nTvDerived = 0; - @DataPoint(description = "the derived transition to transversion ratio") + @DataPoint(description = "the derived transition to transversion ratio", format = "%.2f") double tiTvDerivedRatio = 0.0; public boolean enabled() { @@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) { + if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphicInSamples()) { if (VariantContextUtils.isTransition(vc)) { if (updateStandard) nTiInComp++; else nTi++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 3b4967cad..86d3467fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -11,7 +11,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Collection; -import java.util.Set; /** * The Broad Institute @@ -31,10 +30,10 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { @DataPoint(description = "FN") int FN = 0; @DataPoint(description = "TN") int TN = 0; - @DataPoint(description = "Sensitivity") double sensitivity = 0; - @DataPoint(description = "Specificity") double specificity = 0; - @DataPoint(description = "PPV") 
double PPV = 0; - @DataPoint(description = "FDR") double FDR = 0; + @DataPoint(description = "Sensitivity", format = "%.2f") double sensitivity = 0; + @DataPoint(description = "Specificity", format = "%.2f") double specificity = 0; + @DataPoint(description = "PPV", format = "%.2f") double PPV = 0; + @DataPoint(description = "FDR", format = "%.2f") double FDR = 0; @DataPoint(description = "CompMonoEvalNoCall") int CompMonoEvalNoCall = 0; @DataPoint(description = "CompMonoEvalFiltered") int CompMonoEvalFiltered = 0; @@ -118,8 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { public SiteStatus calcSiteStatus(VariantContext vc) { if ( vc == null ) return SiteStatus.NO_CALL; if ( vc.isFiltered() ) return SiteStatus.FILTERED; - if ( vc.isMonomorphic() ) return SiteStatus.MONO; - if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes + if ( vc.isMonomorphicInSamples() ) return SiteStatus.MONO; + if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphicInSamples was false and there are genotypes if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { int ac = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index 263227938..ce9e45c9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -232,14 +232,14 @@ public class VariantQualityScore extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String interesting = null; - if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) 
{ //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) if( titvStats == null ) { titvStats = new TiTvStats(); } titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } int alternateAlleleCount = 0; for (final Allele a : eval.getAlternateAlleles()) { - alternateAlleleCount += eval.getChromosomeCount(a); + alternateAlleleCount += eval.getCalledChrCount(a); } alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java new file mode 100644 index 000000000..ba7164400 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Collection; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; + +@Analysis(description = "1000 Genomes Phase I summary of variants table") +public class VariantSummary extends VariantEvaluator implements StandardEval { + // basic counts on various rates found + @DataPoint(description = "Number of samples") + public long nSamples = 0; + + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public long nSNPs = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + @DataPoint(description 
= "SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SNPs per individual") + public long nSNPsPerSample = 0; + @DataPoint(description = "Mean TiTv ratio per individual", format = "%.2f") + public double TiTvRatioPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at SNPs", format = "%.1f") + public double SNPDPPerSample = 0; + + @DataPoint(description = "Number of Indels") + public long nIndels = 0; + @DataPoint(description = "Indel Novelty Rate") + public String IndelNoveltyRate = "NA"; + @DataPoint(description = "Mean number of Indels per individual") + public long nIndelsPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at Indels", format = "%.1f") + public double IndelDPPerSample = 0; + + @DataPoint(description = "Number of SVs") + public long nSVs = 0; + @DataPoint(description = "SV Novelty Rate") + public String SVNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SVs per individual") + public long nSVsPerSample = 0; + + TypeSampleMap allVariantCounts, knownVariantCounts; + TypeSampleMap countsPerSample; + TypeSampleMap transitionsPerSample, transversionsPerSample; + TypeSampleMap depthPerSample; + + private final static String ALL = "ALL"; + + private class TypeSampleMap extends EnumMap> { + public TypeSampleMap(final Collection samples) { + super(VariantContext.Type.class); + for ( VariantContext.Type type : VariantContext.Type.values() ) { + Map bySample = new HashMap(samples.size()); + for ( final String sample : samples ) { + bySample.put(sample, 0); + } + bySample.put(ALL, 0); + this.put(type, bySample); + } + } + + public final void inc(final VariantContext.Type type, final String sample) { + final int count = this.get(type).get(sample); + get(type).put(sample, count + 1); + } + + public final int all(VariantContext.Type type) { + return get(type).get(ALL); + } + + public final int meanValue(VariantContext.Type type) { + long sum = 0; + 
int n = 0; + for ( final Map.Entry pair : get(type).entrySet() ) { + if ( pair.getKey() != ALL) { + n++; + sum += pair.getValue(); + } + } + return (int)(Math.round(sum / (1.0 * n))); + } + + public final double ratioValue(VariantContext.Type type, TypeSampleMap denoms, boolean allP) { + double sum = 0; + int n = 0; + for ( final String sample : get(type).keySet() ) { + if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { + final long num = get(type).get(sample); + final long denom = denoms.get(type).get(sample); + sum += ratio(num, denom); + n++; + } + } + return Math.round(sum / (1.0 * n)); + } + } + + + public void initialize(VariantEvalWalker walker) { + nSamples = walker.getSampleNamesForEvaluation().size(); + countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transitionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transversionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + allVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + knownVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + depthPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + } + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) return null; + + TypeSampleMap titvTable = null; + + switch (eval.getType()) { + case SNP: + titvTable = VariantContextUtils.isTransition(eval) ? 
transitionsPerSample : transversionsPerSample; + titvTable.inc(eval.getType(), ALL); + case INDEL: + case SYMBOLIC: + allVariantCounts.inc(eval.getType(), ALL); + if ( comp != null ) + knownVariantCounts.inc(eval.getType(), ALL); + if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(eval.getType(), ALL); + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + // per sample metrics + for (final Genotype g : eval.getGenotypes()) { + if ( ! g.isNoCall() && ! g.isHomRef() ) { + countsPerSample.inc(eval.getType(), g.getSampleName()); + + // update transition / transversion ratio + if ( titvTable != null ) titvTable.inc(eval.getType(), g.getSampleName()); + + if ( g.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(eval.getType(), g.getSampleName()); + } + } + + return null; // we don't capture any interesting sites + } + + private final String noveltyRate(VariantContext.Type type) { + final int all = allVariantCounts.all(type); + final int known = knownVariantCounts.all(type); + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ? 
"NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + nSNPs = allVariantCounts.all(VariantContext.Type.SNP); + nIndels = allVariantCounts.all(VariantContext.Type.INDEL); + nSVs = allVariantCounts.all(VariantContext.Type.SYMBOLIC); + + TiTvRatio = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, true); + TiTvRatioPerSample = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, false); + + nSNPsPerSample = countsPerSample.meanValue(VariantContext.Type.SNP); + nIndelsPerSample = countsPerSample.meanValue(VariantContext.Type.INDEL); + nSVsPerSample = countsPerSample.meanValue(VariantContext.Type.SYMBOLIC); + + SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); + IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); + SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); + + SNPDPPerSample = depthPerSample.meanValue(VariantContext.Type.SNP); + IndelDPPerSample = depthPerSample.meanValue(VariantContext.Type.INDEL); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index c7bea93b2..2f342e120 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -47,7 +47,7 @@ public class AlleleCount extends VariantStratifier { AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) - AC = Math.max(AC, eval.getChromosomeCount(allele)); + AC = Math.max(AC, eval.getCalledChrCount(allele)); } else // by default, the site is considered monomorphic AC = 0; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index bf001588a..00a656cc6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -63,8 +63,8 @@ public class IntervalStratification extends VariantStratifier { if ( locs.isEmpty() ) throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?"); - logger.info(String.format("Creating IntervalStratification containing %d intervals covering %d bp", - locs.size(), IntervalUtils.intervalSize(locs))); + logger.info(String.format("Creating IntervalStratification %s containing %d intervals covering %d bp", + getVariantEvalWalker().intervalsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); // set up the map from contig -> interval tree for ( final String contig : getVariantEvalWalker().getContigNames() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java index 5cae2fb15..119a1b83f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java @@ -9,10 +9,15 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -public abstract class VariantStratifier implements Comparable { +public abstract class VariantStratifier implements Comparable { private VariantEvalWalker variantEvalWalker; + final private String name; protected ArrayList 
states = new ArrayList(); + protected VariantStratifier() { + name = this.getClass().getSimpleName(); + } + /** * @return a reference to the parent VariantEvalWalker running this stratification */ @@ -34,8 +39,12 @@ public abstract class VariantStratifier implements Comparable { return null; } - public int compareTo(Object o1) { - return this.getClass().getSimpleName().compareTo(o1.getClass().getSimpleName()); + public int compareTo(VariantStratifier o1) { + return this.getName().compareTo(o1.getName()); + } + + public final String getName() { + return name; } public ArrayList getAllStates() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java index 396843252..90a6b97e0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java @@ -6,4 +6,5 @@ import java.lang.annotation.RetentionPolicy; @Retention(RetentionPolicy.RUNTIME) public @interface DataPoint { String description() default ""; // the description, optional + String format() default ""; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 8112ae97f..c34e44516 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -21,7 +21,7 @@ public class NewEvaluationContext extends HashMap { String value = ""; for ( VariantStratifier key : this.keySet() ) { - value += "\t" + key.getClass().getSimpleName() + ":" + this.get(key) + "\n"; + value += "\t" + key.getName() + ":" + this.get(key) + "\n"; } return value; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 2cccb0d35..96bd9a9b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -1,24 +1,23 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; +import java.util.Map; import java.util.TreeMap; public class StateKey extends TreeMap { - public int hashCode() { - int hashCode = 1; - - for (String key : this.keySet()) { - String value = this.get(key); - - hashCode *= key.hashCode() + value.hashCode(); - } - - return hashCode; - } +// public int hashCode() { +// int hashCode = 1; +// +// for (final Map.Entry pair : this.entrySet()) { +// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); +// } +// +// return hashCode; +// } public String toString() { String value = ""; - for ( String key : this.keySet() ) { + for ( final String key : this.keySet() ) { //value += "\tstate " + key + ":" + this.get(key) + "\n"; value += String.format("%s:%s;", key, this.get(key)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index aa8c6cfb9..cb44ca522 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import 
org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.lang.reflect.Field; @@ -195,7 +196,7 @@ public class VariantEvalUtils { for (VariantStratifier vs : ec.keySet()) { String state = ec.get(vs); - stateKey.put(vs.getClass().getSimpleName(), state); + stateKey.put(vs.getName(), state); } ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); @@ -229,7 +230,7 @@ public class VariantEvalUtils { table.addColumn(tableName, tableName); for (VariantStratifier vs : stratificationObjects) { - String columnName = vs.getClass().getSimpleName(); + String columnName = vs.getName(); table.addColumn(columnName, "unknown"); } @@ -245,7 +246,7 @@ public class VariantEvalUtils { field.setAccessible(true); if (!(field.get(vei) instanceof TableType)) { - table.addColumn(field.getName(), 0.0); + table.addColumn(field.getName(), 0.0, datamap.get(field).format()); } } } catch (InstantiationException e) { @@ -266,7 +267,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested sample */ public VariantContext getSubsetOfVariantContext(VariantContext vc, String sampleName) { - return getSubsetOfVariantContext(vc, Arrays.asList(sampleName)); + return getSubsetOfVariantContext(vc, Collections.singleton(sampleName)); } /** @@ -276,24 +277,19 @@ public class VariantEvalUtils { * @param sampleNames the samples to pull out of the VariantContext * @return a new VariantContext with just the requested samples */ - public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); + public VariantContext getSubsetOfVariantContext(VariantContext vc, Set sampleNames) { + VariantContext vcsub = vc.subContextFromSamples(sampleNames, vc.getAlleles()); + VariantContextBuilder builder = new VariantContextBuilder(vcsub); - 
HashMap newAts = new HashMap(vcsub.getAttributes()); - - int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); - int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); + final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); + final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) { - newAts.put("ISSINGLETON", true); + builder.attribute("ISSINGLETON", true); } - VariantContextUtils.calculateChromosomeCounts(vcsub, newAts, true); - vcsub = VariantContext.modifyAttributes(vcsub, newAts); - - //VariantEvalWalker.logger.debug(String.format("VC %s subset to %s AC%n", vc.getSource(), vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY))); - - return vcsub; + VariantContextUtils.calculateChromosomeCounts(builder, true); + return builder.make(); } /** @@ -301,6 +297,7 @@ public class VariantEvalUtils { * Additional variant contexts per sample are automatically generated and added to the map unless the sample name * matches the ALL_SAMPLE_NAME constant. * + * * @param tracker the metadata tracker * @param ref the reference context * @param tracks the list of tracks to process @@ -312,7 +309,7 @@ public class VariantEvalUtils { * * @return the mapping of track to VC list that should be populated */ - public HashMap, HashMap>> + public HashMap, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, @@ -323,11 +320,11 @@ public class VariantEvalUtils { if ( tracker == null ) return null; - HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + HashMap, HashMap>> bindings = new HashMap, HashMap>>(); RodBinding firstTrack = tracks.isEmpty() ? 
null : tracks.get(0); for ( RodBinding track : tracks ) { - HashMap> mapping = new HashMap>(); + HashMap> mapping = new HashMap>(); for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) { @@ -356,9 +353,9 @@ public class VariantEvalUtils { if ( mergeTracks && bindings.containsKey(firstTrack) ) { // go through each binding of sample -> value and add all of the bindings from this entry - HashMap> firstMapping = bindings.get(firstTrack); - for ( Map.Entry> elt : mapping.entrySet() ) { - Set firstMappingSet = firstMapping.get(elt.getKey()); + HashMap> firstMapping = bindings.get(firstTrack); + for ( Map.Entry> elt : mapping.entrySet() ) { + Collection firstMappingSet = firstMapping.get(elt.getKey()); if ( firstMappingSet != null ) { firstMappingSet.addAll(elt.getValue()); } else { @@ -373,9 +370,9 @@ public class VariantEvalUtils { return bindings; } - private void addMapping(HashMap> mappings, String sample, VariantContext vc) { + private void addMapping(HashMap> mappings, String sample, VariantContext vc) { if ( !mappings.containsKey(sample) ) - mappings.put(sample, new LinkedHashSet()); + mappings.put(sample, new ArrayList(1)); mappings.get(sample).add(vc); } @@ -414,7 +411,7 @@ public class VariantEvalUtils { newStateKey.putAll(stateKey); } - newStateKey.put(vs.getClass().getSimpleName(), state); + newStateKey.put(vs.getName(), state); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 1d5493daf..b1b8fa46d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap; import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.io.File; import java.io.FileNotFoundException; @@ -203,8 +204,9 @@ public class ApplyRecalibration extends RodWalker { for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) { if( vc != null ) { if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); String filterString = null; - final Map attrs = new HashMap(vc.getAttributes()); + final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); if( lod == null ) { @@ -212,8 +214,8 @@ public class ApplyRecalibration extends RodWalker { } // Annotate the new record with its VQSLOD and the worst performing annotation - attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); - attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); @@ -232,11 +234,10 @@ public class ApplyRecalibration extends RodWalker { } if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - final Set filters = new HashSet(); - filters.add(filterString); - vc = VariantContext.modifyFilters(vc, filters); + builder.filters(filterString); } - vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs) ); + + vcfWriter.add( builder.make() ); } else { // valid VC but not compatible 
with this mode, so just emit the variant untouched vcfWriter.add( vc ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e04bfab76..a2782fe34 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; @@ -38,7 +37,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; /** @@ -284,7 +282,7 @@ public class VariantDataManager { private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 573e15971..096085330 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; @@ -221,7 +222,7 @@ public class CombineVariants extends RodWalker { for ( final VariantContext vc : vcs ) { vcfWriter.add(vc); } - + return vcs.isEmpty() ? 0 : 1; } @@ -244,18 +245,17 @@ public class CombineVariants extends RodWalker { SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; - HashMap attributes = new HashMap(mergedVC.getAttributes()); + final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); - VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); + VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) - annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); - vcfWriter.add(annotatedMergedVC); + VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 
0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index c9f330db5..edbfb557a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -38,9 +38,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -160,7 +158,7 @@ public class LeftAlignVariants extends RodWalker { // update if necessary and write if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { int difference = originalIndex - newCigar.getCigarElement(0).getLength(); - VariantContext newVC = VariantContext.modifyLocation(vc, vc.getChr(), vc.getStart()-difference, vc.getEnd()-difference); + VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); int indelIndex = originalIndex-difference; @@ -210,18 +208,18 @@ public class LeftAlignVariants extends RodWalker { } // create new Genotype objects - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { List newAlleles = new ArrayList(); - for ( 
Allele allele : genotype.getValue().getAlleles() ) { + for ( Allele allele : genotype.getAlleles() ) { Allele newA = alleleMap.get(allele); if ( newA == null ) newA = Allele.NO_CALL; newAlleles.add(newA); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), refBaseForIndel); + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index a932d44ed..50fafa202 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; @@ -117,16 +118,15 @@ public class LiftoverVariants extends RodWalker { vc = VariantContextUtils.reverseComplement(vc); } - vc = VariantContext.modifyLocation(vc, toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length); + vc = new VariantContextBuilder(vc).loc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length).make(); if ( RECORD_ORIGINAL_LOCATION ) { - 
HashMap attrs = new HashMap(vc.getAttributes()); - attrs.put("OriginalChr", fromInterval.getSequence()); - attrs.put("OriginalStart", fromInterval.getStart()); - vc = VariantContext.modifyAttributes(vc, attrs); + vc = new VariantContextBuilder(vc) + .attribute("OriginalChr", fromInterval.getSequence()) + .attribute("OriginalStart", fromInterval.getStart()).make(); } - VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + VariantContext newVC = VariantContextUtils.createVariantContextWithPaddedAlleles(vc, false); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 609593acc..b0016ff4b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,16 +24,14 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import 
org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -41,9 +39,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; @@ -275,8 +270,8 @@ public class SelectVariants extends RodWalker { private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable - * number of variants. Use --select_random_fraction for larger numbers of variants. + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory + * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; @@ -493,7 +488,7 @@ public class SelectVariants extends RodWalker { if (outMVFile != null) outMVFileStream.format("MV@%s:%d. 
REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), - vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)), + vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), @@ -525,14 +520,14 @@ public class SelectVariants extends RodWalker { continue; VariantContext sub = subsetRecord(vc, samples); - if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { return 0; } } if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub, ref.getBase()); + randomlyAddVariant(++variantNumber, sub); } else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); @@ -561,8 +556,8 @@ public class SelectVariants extends RodWalker { return (compVCs == null || compVCs.isEmpty()); // check if we find it in the variant rod - Map genotypes = vc.getGenotypes(samples); - for (Genotype g : genotypes.values()) { + GenotypesContext genotypes = vc.getGenotypes(samples); + for (final Genotype g : genotypes) { if (sampleHasVariant(g)) { // There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples. 
if (compVCs == null) @@ -659,19 +654,12 @@ public class SelectVariants extends RodWalker { if ( samples == null || samples.isEmpty() ) return vc; - ArrayList genotypes = new ArrayList(); - for ( Map.Entry genotypePair : vc.getGenotypes().entrySet() ) { - if ( samples.contains(genotypePair.getKey()) ) - genotypes.add(genotypePair.getValue()); - } - - VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); + final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); + VariantContextBuilder builder = new VariantContextBuilder(sub); // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes())); - - HashMap attributes = new HashMap(sub.getAttributes()); + builder.genotypes(VariantContextUtils.stripPLs(vc.getGenotypes())); int depth = 0; for (String sample : sub.getSampleNames()) { @@ -688,24 +676,21 @@ public class SelectVariants extends RodWalker { if (KEEP_ORIGINAL_CHR_COUNTS) { - if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) - attributes.put("AC_Orig",attributes.get(VCFConstants.ALLELE_COUNT_KEY)); - if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) - attributes.put("AF_Orig",attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY)); - if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) - attributes.put("AN_Orig",attributes.get(VCFConstants.ALLELE_NUMBER_KEY)); - + if ( sub.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) + builder.attribute("AC_Orig", sub.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); + if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) + builder.attribute("AF_Orig", sub.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + if ( sub.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) + builder.attribute("AN_Orig", 
sub.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); } - VariantContextUtils.calculateChromosomeCounts(sub,attributes,false); - attributes.put("DP", depth); + VariantContextUtils.calculateChromosomeCounts(builder, false); + builder.attribute("DP", depth); - sub = VariantContext.modifyAttributes(sub, attributes); - - return sub; + return builder.make(); } - private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { + private void randomlyAddVariant(int rank, VariantContext vc) { if (nVariantsAdded < numRandom) variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 4e6cc722d..31aa8963b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; @@ -227,24 +228,24 @@ public class VariantValidationAssessor extends RodWalker numHomVarViolations++; isViolation = true; } - vContext = VariantContext.modifyFilters(vContext, filters); + + VariantContextBuilder builder = new VariantContextBuilder(vContext).filters(filters); numRecords++; // add the info fields - HashMap infoMap = new HashMap(); - infoMap.put("NoCallPct", String.format("%.1f", 100.0*noCallProp)); - infoMap.put("HomRefPct", String.format("%.1f", 100.0*homRefProp)); - infoMap.put("HomVarPct", 
String.format("%.1f", 100.0*homVarProp)); - infoMap.put("HetPct", String.format("%.1f", 100.0*hetProp)); - infoMap.put("HW", String.format("%.2f", hwScore)); + builder.attribute("NoCallPct", String.format("%.1f", 100.0 * noCallProp)); + builder.attribute("HomRefPct", String.format("%.1f", 100.0 * homRefProp)); + builder.attribute("HomVarPct", String.format("%.1f", 100.0 * homVarProp)); + builder.attribute("HetPct", String.format("%.1f", 100.0 * hetProp)); + builder.attribute("HW", String.format("%.2f", hwScore)); Collection altAlleles = vContext.getAlternateAlleles(); - int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getChromosomeCount(altAlleles.iterator().next()); + int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getCalledChrCount(altAlleles.iterator().next()); if ( !isViolation && altAlleleCount > 0 ) numTrueVariants++; - infoMap.put(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount)); - infoMap.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount())); + builder.attribute(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount)); + builder.attribute(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getCalledChrCount())); - return VariantContext.modifyAttributes(vContext, infoMap); + return builder.make(); } private double hardyWeinbergCalculation(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 454909634..f1f61d071 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -315,7 +315,7 @@ public class VariantsToTable extends RodWalker { getters.put("FILTER", new Getter() { public String get(VariantContext vc) { return vc.isNotFiltered() ? 
"PASS" : Utils.join(",", vc.getFilters()); } }); - getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.hasID() ? vc.getID() : "."; } }); + getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.getID(); } }); getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); @@ -326,7 +326,7 @@ public class VariantsToTable extends RodWalker { getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); getters.put("GQ", new Getter() { public String get(VariantContext vc) { if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError()); }}); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 9b33f8537..f5928b723 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -42,10 +42,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.util.*; @@ -124,25 +121,22 @@ public class VariantsToVCF extends RodWalker { Collection contexts = getVariantContexts(tracker, ref); for ( VariantContext vc : contexts ) { - Map attrs = new HashMap(vc.getAttributes()); - if ( rsID != null && !vc.hasID() ) { - attrs.put(VariantContext.ID_KEY, rsID); - vc = VariantContext.modifyAttributes(vc, attrs); + VariantContextBuilder builder = new VariantContextBuilder(vc); + if ( rsID != null && vc.emptyID() ) { + builder.id(rsID).make(); } // set the appropriate sample name if necessary if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) { Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName); - Map genotypes = new HashMap(); - genotypes.put(sampleName, g); - vc = VariantContext.modifyGenotypes(vc, genotypes); + builder.genotypes(g); } if ( fixReferenceBase ) { - vc = VariantContext.modifyReferencePadding(vc, ref.getBase()); + builder.referenceBaseForIndel(ref.getBase()); } - writeRecord(vc, tracker, ref.getLocus()); + writeRecord(builder.make(), tracker, ref.getLocus()); } return 1; @@ -207,7 +201,7 @@ public class VariantsToVCF extends RodWalker { while ( dbsnpIterator.hasNext() ) { GATKFeature feature = dbsnpIterator.next(); VariantContext vc = (VariantContext)feature.getUnderlyingObject(); - if ( vc.hasID() && vc.getID().equals(rsID) ) + if ( vc.getID().equals(rsID) ) return vc; } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e10bcbaa0..4f1df9e7b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -87,12 +87,12 @@ public class 
GenomeLocParser { @Requires("contig != null") public synchronized boolean hasContig(final String contig) { - return lastContig == contig || dict.getSequence(contig) != null; + return contig.equals(lastContig) || dict.getSequence(contig) != null; } @Requires("index >= 0") public synchronized boolean hasContig(final int index) { - return lastIndex == index|| dict.getSequence(index) != null; + return lastIndex == index || dict.getSequence(index) != null; } @Requires("contig != null") @@ -554,4 +554,54 @@ public class GenomeLocParser { return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } + /** + * Creates a loc to the left (ending at the loc start - 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + + int start = loc.getStart() - maxBasePairs; + int stop = loc.getStart() - 1; + + if (start < 1) + start = 1; + if (stop < 1) + return null; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } + + /** + * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. 
+ */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + int contigLength = contig.getSequenceLength(); + + int start = loc.getStop() + 1; + int stop = loc.getStop() + maxBasePairs; + + if (start > contigLength) + return null; + if (stop > contigLength) + stop = contigLength; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 17f458f31..f92d4be78 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -188,6 +188,10 @@ public class MathUtils { return ! Double.isInfinite(val) && ! 
Double.isNaN(val); } + public static double bound(double value, double minBoundary, double maxBoundary) { + return Math.max(Math.min(value, maxBoundary), minBoundary); + } + public static boolean isBounded(double val, double lower, double upper) { return val >= lower && val <= upper; } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 6e4ddddc4..8c1061494 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -171,6 +171,9 @@ public class ReadClipper { clippedRead = op.apply(algorithm, clippedRead); } wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return new GATKSAMRecord( clippedRead.getHeader() ); return clippedRead; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // this should never happen diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3377172dd..7cceaa008 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -10,16 +10,14 @@ import org.broad.tribble.util.BlockCompressedInputStream; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; -public abstract class AbstractVCFCodec 
implements FeatureCodec, NameAwareCodec, VCFParser { +public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -61,6 +59,29 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, protected Map stringCache = new HashMap(); + /** + * Creates a LazyParser for a LazyGenotypesContext to use to decode + * our genotypes only when necessary. We do this instead of eagerly + * decoding the genotypes just to turn around and reencode in the frequent + * case where we don't actually want to manipulate the genotypes + */ + class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { + final List alleles; + final String contig; + final int start; + + LazyVCFGenotypesParser(final List alleles, final String contig, final int start) { + this.alleles = alleles; + this.contig = contig; + this.start = start; + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + //System.out.printf("Loading genotypes... 
%s:%d%n", contig, start); + return createGenotypeMap((String) data, alleles, contig, start); + } + } /** * @param reader the line reader to take header lines from @@ -70,13 +91,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public abstract Map createGenotypeMap(String str, List alleles, String chr, int pos); + public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos); /** @@ -97,7 +119,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, headerStrings.add(line); Set metaData = new TreeSet(); - Set auxTags = new LinkedHashSet(); + Set sampleNames = new LinkedHashSet(); // iterate over all the passed in strings for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { @@ -125,9 +147,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } while ( arrayIndex < strings.length ) - auxTags.add(strings[arrayIndex++]); + sampleNames.add(strings[arrayIndex++]); - if ( sawFormatTag && auxTags.size() == 0 ) + if ( sawFormatTag && sampleNames.size() == 0 ) throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); } else { @@ -151,7 +173,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } } - header = new VCFHeader(metaData, auxTags); + header = new VCFHeader(metaData, sampleNames); + header.buildVCFReaderMaps(new ArrayList(sampleNames)); return header; } @@ -252,29 +275,33 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @return a variant context object */ private VariantContext parseVCFLine(String[] parts) { + VariantContextBuilder builder = new VariantContextBuilder(); + 
builder.source(getName()); + // increment the line count lineNo++; // parse out the required fields - String contig = getCachedString(parts[0]); + final String chr = getCachedString(parts[0]); + builder.chr(chr); int pos = Integer.valueOf(parts[1]); - String id = null; + builder.start(pos); + if ( parts[2].length() == 0 ) generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) - id = VCFConstants.EMPTY_ID_FIELD; + builder.noID(); else - id = new String(parts[2]); + builder.id(parts[2]); + String ref = getCachedString(parts[3].toUpperCase()); String alts = getCachedString(parts[4].toUpperCase()); - Double qual = parseQual(parts[5]); - String filter = getCachedString(parts[6]); - String info = new String(parts[7]); + builder.log10PError(parseQual(parts[5])); + builder.filters(parseFilters(getCachedString(parts[6]))); + builder.attributes(parseInfo(parts[7])); // get our alleles, filters, and setup an attribute map List alleles = parseAlleles(ref, alts, lineNo); - Set filters = parseFilters(filter); - Map attributes = parseInfo(info, id); // find out our current location, and clip the alleles down to their minimum length int loc = pos; @@ -286,23 +313,30 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } + builder.stop(loc); + builder.alleles(alleles); // do we have genotyping data if (parts.length > NUM_STANDARD_FIELDS) { - attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8])); - attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this); + final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); + final int nGenotypes = header.getGenotypeSamples().size(); + LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); + + // did we resort the sample names? 
If so, we need to load the genotype data + if ( !header.samplesWereAlreadySorted() ) + lazy.decode(); + + builder.genotypesNoValidation(lazy); } VariantContext vc = null; try { - vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); + builder.referenceBaseForIndel(ref.getBytes()[0]); + vc = builder.make(); } catch (Exception e) { generateException(e.getMessage()); } - // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) - vc.getGenotypes(); return vc; } @@ -349,10 +383,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, /** * parse out the info fields * @param infoField the fields - * @param id the indentifier * @return a mapping of keys to objects */ - private Map parseInfo(String infoField, String id) { + private Map parseInfo(String infoField) { Map attributes = new HashMap(); if ( infoField.length() == 0 ) @@ -391,8 +424,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } } - if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) ) - attributes.put(VariantContext.ID_KEY, id); return attributes; } @@ -444,16 +475,16 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, protected static Double parseQual(String qualString) { // if we're the VCF 4 missing char, return immediately if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) - return VariantContext.NO_NEG_LOG_10PERROR; + return VariantContext.NO_LOG10_PERROR; Double val = Double.valueOf(qualString); // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. 
check for val < 0 to save some CPU cycles if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) - return VariantContext.NO_NEG_LOG_10PERROR; + return VariantContext.NO_LOG10_PERROR; // scale and return the value - return val / 10.0; + return val / -10.0; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 0da7a100f..ac1da7110 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -25,18 +25,10 @@ package org.broadinstitute.sting.utils.codecs.vcf; import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; -import org.broad.tribble.index.DynamicIndexCreator; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.ParsingUtils; -import org.broad.tribble.util.PositionalStream; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.lang.reflect.Array; @@ -164,10 +156,10 @@ public class StandardVCFWriter extends IndexingVCFWriter { throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); if ( doNotWriteGenotypes ) - vc = VariantContext.modifyGenotypes(vc, null); + vc = new VariantContextBuilder(vc).noGenotypes().make(); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + vc = 
VariantContextUtils.createVariantContextWithPaddedAlleles(vc, false); super.add(vc); Map alleleMap = new HashMap(vc.getAlleles().size()); @@ -182,7 +174,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // ID - String ID = vc.hasID() ? vc.getID() : VCFConstants.EMPTY_ID_FIELD; + String ID = vc.getID(); mWriter.write(ID); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -212,7 +204,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // QUAL - if ( !vc.hasNegLog10PError() ) + if ( !vc.hasLog10PError() ) mWriter.write(VCFConstants.MISSING_VALUE_v4); else mWriter.write(getQualValue(vc.getPhredScaledQual())); @@ -227,9 +219,6 @@ public class StandardVCFWriter extends IndexingVCFWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) - continue; - String outputValue = formatVCFField(field.getValue()); if ( outputValue != null ) infoFields.put(key, outputValue); @@ -237,9 +226,10 @@ public class StandardVCFWriter extends IndexingVCFWriter { writeInfoString(infoFields); // FORMAT - if ( vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) ) { + final GenotypesContext gc = vc.getGenotypes(); + if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) { mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, "")); + mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString()); } else { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { @@ -361,7 +351,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { // some exceptions if ( 
key.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( Math.abs(g.getNegLog10PError() - Genotype.NO_NEG_LOG_10PERROR) < 1e-6) + if ( ! g.hasLog10PError() ) val = VCFConstants.MISSING_VALUE_v4; else { val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); @@ -451,11 +441,11 @@ public class StandardVCFWriter extends IndexingVCFWriter { boolean sawGoodGT = false; boolean sawGoodQual = false; boolean sawGenotypeFilter = false; - for ( Genotype g : vc.getGenotypes().values() ) { + for ( final Genotype g : vc.getGenotypes() ) { keys.addAll(g.getAttributes().keySet()); if ( g.isAvailable() ) sawGoodGT = true; - if ( g.hasNegLog10PError() ) + if ( g.hasLog10PError() ) sawGoodQual = true; if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index e5b1a2de5..6f8e64e55 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -3,12 +3,9 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.util.*; @@ -112,19 +109,20 @@ public class VCF3Codec extends AbstractVCFCodec { /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public Map createGenotypeMap(String str, List 
alleles, String chr, int pos) { + public LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); - Map genotypes = new LinkedHashMap(nParts); + ArrayList genotypes = new ArrayList(nParts); // get the format keys int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); @@ -139,9 +137,9 @@ public class VCF3Codec extends AbstractVCFCodec { for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + double GTQual = VariantContext.NO_LOG10_PERROR; Set genotypeFilters = null; - Map gtAttributes = null; + Map gtAttributes = null; String sampleName = sampleNameIterator.next(); // check to see if the value list is longer than the key list, which is a problem @@ -150,7 +148,7 @@ public class VCF3Codec extends AbstractVCFCodec { int genotypeAlleleLocation = -1; if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); + gtAttributes = new HashMap(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { final String gtKey = new String(genotypeKeyArray[i]); @@ -180,7 +178,7 @@ public class VCF3Codec extends AbstractVCFCodec { // add it to the list try { - genotypes.put(sampleName, new Genotype(sampleName, + genotypes.add(new Genotype(sampleName, parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), GTQual, genotypeFilters, @@ -191,7 +189,7 @@ public class VCF3Codec extends AbstractVCFCodec { } } - return genotypes; + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 42ea05355..407c4bc41 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -3,12 +3,9 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.util.*; @@ -141,17 +138,18 @@ public class VCFCodec extends AbstractVCFCodec { /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @return a mapping of sample name to genotype object */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos) { + public LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); - Map genotypes = new LinkedHashMap(nParts); + ArrayList genotypes = new ArrayList(nParts); // get the format keys int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); @@ -166,9 +164,9 @@ public class VCFCodec extends AbstractVCFCodec { for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - double 
GTQual = VariantContext.NO_NEG_LOG_10PERROR; + double GTQual = VariantContext.NO_LOG10_PERROR; Set genotypeFilters = null; - Map gtAttributes = null; + Map gtAttributes = null; String sampleName = sampleNameIterator.next(); // check to see if the value list is longer than the key list, which is a problem @@ -177,7 +175,7 @@ public class VCFCodec extends AbstractVCFCodec { int genotypeAlleleLocation = -1; if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); + gtAttributes = new HashMap(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { final String gtKey = new String(genotypeKeyArray[i]); @@ -209,19 +207,13 @@ public class VCFCodec extends AbstractVCFCodec { // add it to the list try { - genotypes.put(sampleName, - new Genotype(sampleName, - GTalleles, - GTQual, - genotypeFilters, - gtAttributes, - phased)); + genotypes.add(new Genotype(sampleName, GTalleles, GTQual, genotypeFilters, gtAttributes, phased)); } catch (TribbleException e) { throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); } } - return genotypes; + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 66e11bc1e..5c5df15ab 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.variantcontext.Genotype; import java.util.*; @@ -38,6 +39,10 @@ public class VCFHeader { // were the input samples sorted originally (or are we sorting them)? 
private boolean samplesWereAlreadySorted = true; + // cache for efficient conversion of VCF -> VariantContext + protected ArrayList sampleNamesInOrder = null; + protected HashMap sampleNameToOffset = null; + /** * create a VCF header, given a list of meta data and auxillary tags @@ -69,6 +74,27 @@ public class VCFHeader { samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); } + /** + * Tell this VCF header to use pre-calculated sample name ordering and the + * sample name -> offset map. This assumes that all VariantContext created + * using this header (i.e., read by the VCFCodec) will have genotypes + * occurring in the same order + * + */ + + protected void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { + sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); + sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + + int i = 0; + for ( final String name : genotypeSampleNamesInAppearenceOrder ) { + sampleNamesInOrder.add(name); + sampleNameToOffset.put(name, i++); + } + Collections.sort(sampleNamesInOrder); + } + + /** * Adds a header line to the header metadata. * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java deleted file mode 100755 index 1dba351e2..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.vcf; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -import java.util.List; -import java.util.Map; - - -/** - * All VCF codecs need to implement this interface so that we can perform lazy loading. 
- */ -public interface VCFParser { - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos); - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a208d2dc0..c599d4759 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -100,6 +100,12 @@ public class UserException extends ReviewedStingException { } } + public static class TooManyOpenFiles extends UserException { + public TooManyOpenFiles() { + super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. See the unix ulimit command to adjust this limit")); + } + } + public static class ErrorWritingBamFile extends UserException { public ErrorWritingBamFile(String message) { super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index ef0d9ca42..b4ad81c02 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -25,10 +25,9 @@ package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -70,7 +69,7 @@ public class GCF { alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); } - qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); + qual = (float)vc.getLog10PError(); //qualToByte(vc.getPhredScaledQual()); info = infoFieldString(vc, GCFHeaderBuilder); filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); @@ -140,26 +139,26 @@ public class GCF { public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); - double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); - Set filters = header.getFilters(filterOffset); - Map attributes = new HashMap(); - attributes.put("INFO", info); - Byte refPadByte = refPad == 0 ? 
null : refPad; - Map genotypes = decodeGenotypes(header); - return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); + VariantContextBuilder builder = new VariantContextBuilder(source, contig, start, stop, alleleMap); + builder.genotypes(decodeGenotypes(header)); + builder.log10PError(qual); + builder.filters(header.getFilters(filterOffset)); + builder.attribute("INFO", info); + builder.referenceBaseForIndel(refPad == 0 ? null : refPad); + return builder.make(); } - private Map decodeGenotypes(final GCFHeader header) { + private GenotypesContext decodeGenotypes(final GCFHeader header) { if ( genotypes.isEmpty() ) return VariantContext.NO_GENOTYPES; else { - Map map = new TreeMap(); + GenotypesContext map = GenotypesContext.create(genotypes.size()); for ( int i = 0; i < genotypes.size(); i++ ) { final String sampleName = header.getSample(i); final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap); - map.put(sampleName, g); + map.add(g); } return map; @@ -172,7 +171,7 @@ public class GCF { List genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); - for ( Genotype g : vc.getGenotypes().values() ) { + for ( Genotype g : vc.getGenotypes() ) { int i = GCFHeaderBuilder.encodeSample(g.getSampleName()); genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g)); } @@ -192,8 +191,6 @@ public class GCF { boolean first = true; for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) - continue; int stringIndex = GCFHeaderBuilder.encodeString(key); String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); if ( outputValue != null ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java 
b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java index dd1fb091c..f8fdd9291 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java @@ -84,14 +84,14 @@ public class GCFGenotype { public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) { final List alleles = decodeAlleles(gt, alleleIndex); - final double negLog10PError = gq / 10.0; + final double log10PError = gq / -10.0; final Set filters = Collections.emptySet(); final Map attributes = new HashMap(); attributes.put("DP", dp); attributes.put("AD", ad); attributes.put("PL", pl); - return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false); + return new Genotype(sampleName, alleles, log10PError, filters, attributes, false); } private static int encodeAlleles(List gtList, List allAlleles) { diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f0e164c87..f8655f74a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -56,28 +56,30 @@ public class IntervalUtils { public static List parseIntervalArguments(GenomeLocParser parser, String arg) { List rawIntervals = new ArrayList(); // running list of raw GenomeLocs - // separate argument on semicolon first - for (String fileOrInterval : arg.split(";")) { - // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. 
- if (isUnmapped(fileOrInterval)) - rawIntervals.add(GenomeLoc.UNMAPPED); - // if it's a file, add items to raw interval list - else if (isIntervalFile(fileOrInterval)) { - try { - rawIntervals.addAll(intervalFileToList(parser, fileOrInterval)); - } - catch ( UserException.MalformedGenomeLoc e ) { - throw e; - } - catch ( Exception e ) { - throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); - } - } + if ( arg.indexOf(';') != -1 ) { + throw new UserException.BadArgumentValue("-L " + arg, "The legacy -L \"interval1;interval2\" syntax " + + "is no longer supported. Please use one -L argument for each " + + "interval or an interval file instead."); + } - // otherwise treat as an interval -> parse and add to raw interval list - else { - rawIntervals.add(parser.parseGenomeLoc(fileOrInterval)); + // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. + if (isUnmapped(arg)) + rawIntervals.add(GenomeLoc.UNMAPPED); + // if it's a file, add items to raw interval list + else if (isIntervalFile(arg)) { + try { + rawIntervals.addAll(intervalFileToList(parser, arg)); } + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(arg, "Interval file could not be parsed in any supported format.", e); + } + } + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(parser.parseGenomeLoc(arg)); } return rawIntervals; @@ -233,8 +235,12 @@ public class IntervalUtils { * * Returns a null string if there are no differences, otherwise returns a string describing the difference * (useful for UnitTests). 
Assumes both lists are sorted + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no difference, otherwise a string describing the difference */ - public static final String equateIntervals(List masterArg, List testArg) { + public static String equateIntervals(List masterArg, List testArg) { LinkedList master = new LinkedList(masterArg); LinkedList test = new LinkedList(testArg); @@ -317,23 +323,6 @@ public class IntervalUtils { return lengths; } - /** - * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param locs The genome locs. - * @return The maximum number of parts the intervals can be split into. - */ - public static int countContigIntervals(List locs) { - int maxFiles = 0; - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } - } - return maxFiles; - } - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. 
@@ -373,7 +362,6 @@ public class IntervalUtils { * @return A list of lists of genome locs, split according to splits */ public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; int start = 0; List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { @@ -465,7 +453,7 @@ public class IntervalUtils { @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) @Ensures({"result != null"}) - final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { final List split = new ArrayList(); long size = 0; @@ -579,10 +567,101 @@ public class IntervalUtils { } } - public static final long intervalSize(final List locs) { + public static long intervalSize(final List locs) { long size = 0; for ( final GenomeLoc loc : locs ) size += loc.size(); return size; } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + 
intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (String contig: locsByContig.keySet()) { + List contigLocs = locsByContig.get(contig); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + 
expanded.add(stopLoc); + } + return expanded; + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index daf6606ef..2d13d6e59 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -95,11 +95,12 @@ public class PileupElement implements Comparable { // -------------------------------------------------------------------------- public boolean isReducedRead() { - return ((GATKSAMRecord)read).isReducedRead(); + return read.isReducedRead(); } public int getRepresentativeCount() { - return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1; + // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed + return (!isDeletion() && isReducedRead()) ? 
read.getReducedCount(offset) : 1; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 3fe1060dd..d3a52167a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -188,7 +188,9 @@ public class GATKSAMRecord extends BAMRecord { } public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + byte firstCount = getReducedReadCounts()[0]; + byte offsetCount = getReducedReadCounts()[i]; + return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); } @@ -259,7 +261,7 @@ public class GATKSAMRecord extends BAMRecord { * @return true if the read has no bases */ public boolean isEmpty() { - return this.getReadLength() == 0; + return super.getReadBases() == null || super.getReadLength() == 0; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index e125b8c80..8d9018045 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -243,7 +243,7 @@ public class ReadUtils { public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - GATKSAMRecord result = (GATKSAMRecord)read; + GATKSAMRecord result = read; if ( adaptorBoundaries != null ) { if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java similarity index 78% 
rename from public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java index bf16cd1cf..c0c9f36ce 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java @@ -11,33 +11,24 @@ import java.util.*; * * @author depristo */ -public final class InferredGeneticContext { - public static final double NO_NEG_LOG_10PERROR = -1.0; +final class CommonInfo { + public static final double NO_LOG10_PERROR = 1.0; - private static Set NO_FILTERS = Collections.unmodifiableSet(new HashSet()); + private static Set NO_FILTERS = Collections.emptySet(); private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); - private double negLog10PError = NO_NEG_LOG_10PERROR; + private double log10PError = NO_LOG10_PERROR; private String name = null; - private Set filters = NO_FILTERS; + private Set filters = null; private Map attributes = NO_ATTRIBUTES; -// public InferredGeneticContext(String name) { -// this.name = name; -// } -// -// public InferredGeneticContext(String name, double negLog10PError) { -// this(name); -// setNegLog10PError(negLog10PError); -// } - - public InferredGeneticContext(String name, double negLog10PError, Set filters, Map attributes) { + public CommonInfo(String name, double log10PError, Set filters, Map attributes) { this.name = name; - setNegLog10PError(negLog10PError); - if ( filters != null ) - setFilters(filters); - if ( attributes != null ) - setAttributes(attributes); + setLog10PError(log10PError); + this.filters = filters; + if ( attributes != null && ! 
attributes.isEmpty() ) { + this.attributes = attributes; + } } /** @@ -64,12 +55,20 @@ public final class InferredGeneticContext { // // --------------------------------------------------------------------------------------------------------- + public Set getFiltersMaybeNull() { + return filters; + } + public Set getFilters() { - return Collections.unmodifiableSet(filters); + return filters == null ? NO_FILTERS : Collections.unmodifiableSet(filters); + } + + public boolean filtersWereApplied() { + return filters != null; } public boolean isFiltered() { - return filters.size() > 0; + return filters == null ? false : filters.size() > 0; } public boolean isNotFiltered() { @@ -77,8 +76,8 @@ public final class InferredGeneticContext { } public void addFilter(String filter) { - if ( filters == NO_FILTERS ) // immutable -> mutable - filters = new HashSet(filters); + if ( filters == null ) // immutable -> mutable + filters = new HashSet(); if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); @@ -91,37 +90,30 @@ public final class InferredGeneticContext { addFilter(f); } - public void clearFilters() { - filters = new HashSet(); - } - - public void setFilters(Collection filters) { - clearFilters(); - addFilters(filters); - } - // --------------------------------------------------------------------------------------------------------- // // Working with log error rates // // --------------------------------------------------------------------------------------------------------- - public boolean hasNegLog10PError() { - return getNegLog10PError() != NO_NEG_LOG_10PERROR; + public boolean hasLog10PError() { + return getLog10PError() != NO_LOG10_PERROR; } /** * @return the -1 * log10-based error estimate */ - public double getNegLog10PError() { return negLog10PError; } - public double 
getPhredScaledQual() { return getNegLog10PError() * 10; } + public double getLog10PError() { return log10PError; } + public double getPhredScaledQual() { return getLog10PError() * -10; } - public void setNegLog10PError(double negLog10PError) { - if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); - if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity"); - if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN"); - - this.negLog10PError = negLog10PError; + public void setLog10PError(double log10PError) { + if ( log10PError > 0 && log10PError != NO_LOG10_PERROR) + throw new IllegalArgumentException("BUG: log10PError cannot be > 0 : " + this.log10PError); + if ( Double.isInfinite(this.log10PError) ) + throw new IllegalArgumentException("BUG: log10PError should not be Infinity"); + if ( Double.isNaN(this.log10PError) ) + throw new IllegalArgumentException("BUG: log10PError should not be NaN"); + this.log10PError = log10PError; } // --------------------------------------------------------------------------------------------------------- @@ -157,7 +149,7 @@ public final class InferredGeneticContext { if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable attributes = new HashMap(); - + attributes.put(key, value); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index e2e44e2b9..1691129c9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -12,30 +12,28 @@ import java.util.*; * * @author Mark DePristo */ -public class Genotype { +public class Genotype implements Comparable { public final static 
String PHASED_ALLELE_SEPARATOR = "|"; public final static String UNPHASED_ALLELE_SEPARATOR = "/"; - protected InferredGeneticContext commonInfo; - public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; + protected CommonInfo commonInfo; + public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; protected List alleles = null; // new ArrayList(); protected Type type = null; protected boolean isPhased = false; - protected boolean filtersWereAppliedToContext; - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { - this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null); + public Genotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased) { + this(sampleName, alleles, log10PError, filters, attributes, isPhased, null); } - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { + public Genotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { if ( alleles != null ) this.alleles = Collections.unmodifiableList(alleles); - commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); + commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes); if ( log10Likelihoods != null ) commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); - filtersWereAppliedToContext = filters != null; this.isPhased = isPhased; validate(); } @@ -44,21 +42,26 @@ public class Genotype { * Creates a new Genotype for sampleName with genotype according to alleles. 
* @param sampleName * @param alleles - * @param negLog10PError the confidence in these alleles + * @param log10PError the confidence in these alleles * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known */ - public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) { - this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods); + public Genotype(String sampleName, List alleles, double log10PError, double[] log10Likelihoods) { + this(sampleName, alleles, log10PError, null, null, false, log10Likelihoods); } - public Genotype(String sampleName, List alleles, double negLog10PError) { - this(sampleName, alleles, negLog10PError, null, null, false); + public Genotype(String sampleName, List alleles, double log10PError) { + this(sampleName, alleles, log10PError, null, null, false); } public Genotype(String sampleName, List alleles) { - this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false); + this(sampleName, alleles, NO_LOG10_PERROR, null, null, false); } + public Genotype(String sampleName, Genotype parent) { + this(sampleName, parent.getAlleles(), parent.getLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); + } + + // --------------------------------------------------------------------------------------------------------- // @@ -67,15 +70,15 @@ public class Genotype { // --------------------------------------------------------------------------------------------------------- public static Genotype modifyName(Genotype g, String name) { - return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); + return new Genotype(name, g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? 
g.getFilters() : null, g.getAttributes(), g.isPhased()); } public static Genotype modifyAttributes(Genotype g, Map attributes) { - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); } public static Genotype modifyAlleles(Genotype g, List alleles) { - return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); + return new Genotype(g.getSampleName(), alleles, g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); } /** @@ -328,11 +331,12 @@ public class Genotype { // --------------------------------------------------------------------------------------------------------- public String getSampleName() { return commonInfo.getName(); } public Set getFilters() { return commonInfo.getFilters(); } + public Set getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); } public boolean isFiltered() { return commonInfo.isFiltered(); } public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); } + public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } + public double getLog10PError() { return commonInfo.getLog10PError(); } public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } @@ -347,4 +351,14 @@ public class Genotype { public int getAttributeAsInt(String 
key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } + + /** + * comparable genotypes -> compareTo on the sample names + * @param genotype + * @return + */ + @Override + public int compareTo(final Genotype genotype) { + return getSampleName().compareTo(genotype.getSampleName()); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 8c8e4f257..a5e4e5774 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -116,30 +116,35 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype - public double getNegLog10GQ(Genotype.Type genotype){ + public double getLog10GQ(Genotype.Type genotype){ + return getQualFromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); + } + + public static double getQualFromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ + if(likelihoods == null) + return Double.NEGATIVE_INFINITY; double qual = Double.NEGATIVE_INFINITY; - EnumMap likelihoods = getAsMap(false); - if(likelihoods == null) - return qual; - for(Map.Entry likelihood : likelihoods.entrySet()){ - if(likelihood.getKey() == genotype) + for (int i=0; i < likelihoods.length; i++) { + if (i==iOfChoosenGenotype) continue; - if(likelihood.getValue() > qual) - qual = likelihood.getValue(); - + if (likelihoods[i] >= qual) + qual = likelihoods[i]; } - 
//Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) - qual = likelihoods.get(genotype) - qual; + // qual contains now max(likelihoods[k]) for all k != bestGTguess + qual = likelihoods[iOfChoosenGenotype] - qual; - //Quality of other genotypes 1-P(G) if (qual < 0) { - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); + // QUAL can be negative if the chosen genotype is not the most likely one individually. + // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one + double[] normalized = MathUtils.normalizeFromLog10(likelihoods); + double chosenGenotype = normalized[iOfChoosenGenotype]; + return Math.log10(1.0 - chosenGenotype); + } else { + // invert the size, as this is the probability of making an error + return -1 * qual; } - return qual; } private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java new file mode 100644 index 000000000..845c65c9c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice 
shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Represents an ordered collection of Genotype objects + */ +public class GenotypesContext implements List { + /** + * static constant value for an empty GenotypesContext. Useful since so many VariantContexts have no genotypes + */ + public final static GenotypesContext NO_GENOTYPES = + new GenotypesContext(new ArrayList(0), new HashMap(0), Collections.emptyList()).immutable(); + + /** + *sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical order + */ + List sampleNamesInOrder = null; + + /** + * a map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + */ + Map sampleNameToOffset = null; + + /** + * An ArrayList of genotypes contained in this context + * + * WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY + * ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD. + * + */ + ArrayList notToBeDirectlyAccessedGenotypes; + + /** Are we allowing users to modify the list? 
*/ + boolean immutable = false; + + // --------------------------------------------------------------------------- + // + // private constructors -- you have to use static create methods to make these classes + // + // --------------------------------------------------------------------------- + + /** + * Create an empty GenotypeContext + */ + protected GenotypesContext() { + this(10); + } + + /** + * Create an empty GenotypeContext, with initial capacity for n elements + */ + @Requires("n >= 0") + protected GenotypesContext(final int n) { + this(new ArrayList(n)); + } + + /** + * Create an GenotypeContext containing genotypes + */ + @Requires({"genotypes != null", "noDups(genotypes)"}) + protected GenotypesContext(final ArrayList genotypes) { + this.notToBeDirectlyAccessedGenotypes = genotypes; + this.sampleNameToOffset = null; + } + + /** + * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, + * and sorted sample names + * + * @param genotypes our genotypes in arbitrary + * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical + * order. 
+ */ + @Requires({"genotypes != null", "noDups(genotypes)", + "sampleNameToOffset != null", + "sampleNamesInOrder != null", + "genotypes.size() == sampleNameToOffset.size()", + "genotypes.size() == sampleNamesInOrder.size()"}) + protected GenotypesContext(final ArrayList genotypes, + final Map sampleNameToOffset, + final List sampleNamesInOrder) { + this.notToBeDirectlyAccessedGenotypes = genotypes; + this.sampleNameToOffset = sampleNameToOffset; + this.sampleNamesInOrder = sampleNamesInOrder; + } + + // --------------------------------------------------------------------------- + // + // public static factory methods + // + // --------------------------------------------------------------------------- + + /** + * Basic creation routine + * @return an empty, mutable GenotypeContext + */ + @Ensures({"result != null"}) + public static final GenotypesContext create() { + return new GenotypesContext(); + } + + /** + * Basic creation routine + * @return an empty, mutable GenotypeContext with initial capacity for nGenotypes + */ + @Requires("nGenotypes >= 0") + @Ensures({"result != null"}) + public static final GenotypesContext create(final int nGenotypes) { + return new GenotypesContext(nGenotypes); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, + * and sorted sample names + * + * @param genotypes our genotypes in arbitrary + * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical + * order. 
+ * @return an mutable GenotypeContext containing genotypes with already present lookup data + */ + @Requires({"genotypes != null", + "sampleNameToOffset != null", + "sampleNamesInOrder != null", + "sameSamples(genotypes, sampleNamesInOrder)", + "sameSamples(genotypes, sampleNameToOffset.keySet())"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final ArrayList genotypes, + final Map sampleNameToOffset, + final List sampleNamesInOrder) { + return new GenotypesContext(genotypes, sampleNameToOffset, sampleNamesInOrder); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes + * + * @param genotypes our genotypes in arbitrary + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"genotypes != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final ArrayList genotypes) { + return genotypes == null ? NO_GENOTYPES : new GenotypesContext(genotypes); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes + * + * @param genotypes our genotypes in arbitrary + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"genotypes != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final Genotype... 
genotypes) { + return create(new ArrayList(Arrays.asList(genotypes))); + } + + /** + * Create a freshly allocated GenotypeContext containing the genotypes in toCopy + * + * @param toCopy the GenotypesContext to copy + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"toCopy != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext copy(final GenotypesContext toCopy) { + return create(new ArrayList(toCopy.getGenotypes())); + } + + /** + * Create a GenotypesContext containing the genotypes in iteration order contained + * in toCopy + * + * @param toCopy the collection of genotypes + * @return an mutable GenotypeContext containing genotypes + */ + @Ensures({"result != null"}) + public static final GenotypesContext copy(final Collection toCopy) { + return toCopy == null ? NO_GENOTYPES : create(new ArrayList(toCopy)); + } + + // --------------------------------------------------------------------------- + // + // Mutability methods + // + // --------------------------------------------------------------------------- + + public final GenotypesContext immutable() { + immutable = true; + return this; + } + + public boolean isMutable() { + return ! 
immutable; + } + + public final void checkImmutability() { + if ( immutable ) + throw new IllegalAccessError("GenotypeMap is currently immutable, but a mutator method was invoked on it"); + } + + // --------------------------------------------------------------------------- + // + // caches + // + // --------------------------------------------------------------------------- + + @Ensures({"sampleNameToOffset == null"}) + protected void invalidateSampleNameMap() { + sampleNameToOffset = null; + } + + @Ensures({"sampleNamesInOrder == null"}) + protected void invalidateSampleOrdering() { + sampleNamesInOrder = null; + } + + @Ensures({"sampleNamesInOrder != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)"}) + protected void ensureSampleOrdering() { + if ( sampleNamesInOrder == null ) { + sampleNamesInOrder = new ArrayList(size()); + + for ( int i = 0; i < size(); i++ ) { + sampleNamesInOrder.add(getGenotypes().get(i).getSampleName()); + } + Collections.sort(sampleNamesInOrder); + } + } + + @Ensures({"sampleNameToOffset != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"}) + protected void ensureSampleNameMap() { + if ( sampleNameToOffset == null ) { + sampleNameToOffset = new HashMap(size()); + + for ( int i = 0; i < size(); i++ ) { + sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i); + } + } + } + + // for testing purposes + protected void ensureAll() { + ensureSampleNameMap(); + ensureSampleOrdering(); + } + + // --------------------------------------------------------------------------- + // + // Map methods + // + // --------------------------------------------------------------------------- + + protected ArrayList getGenotypes() { + return notToBeDirectlyAccessedGenotypes; + } + + @Override + public void clear() { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + getGenotypes().clear(); + } + + @Override + public int size() { + return 
getGenotypes().size(); + } + + @Override + public boolean isEmpty() { + return getGenotypes().isEmpty(); + } + + /** + * Adds a single genotype to this context. + * + * There are many constraints on this input, and important + * impacts on the performance of other functions provided by this + * context. + * + * First, the sample name of genotype must be unique within this + * context. However, this is not enforced in the code itself, through + * you will invalid the contract on this context if you add duplicate + * samples and are running with CoFoJa enabled. + * + * Second, adding genotype also updates the sample name -> index map, + * so add() followed by containsSample and related function is an efficient + * series of operations. + * + * Third, adding the genotype invalidates the sorted list of sample names, to + * add() followed by any of the SampleNamesInOrder operations is inefficient, as + * each SampleNamesInOrder must rebuild the sorted list of sample names at + * an O(n log n) cost. + * + * @param genotype + * @return + */ + @Override + @Requires({"genotype != null", "get(genotype.getSampleName()) == null"}) + @Ensures("noDups(getGenotypes())") + public boolean add(final Genotype genotype) { + checkImmutability(); + invalidateSampleOrdering(); + + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + sampleNameToOffset.put(genotype.getSampleName(), size()); + } + + return getGenotypes().add(genotype); + } + + @Override + @Requires("! contains(genotype)") + @Ensures("noDups(getGenotypes())") + public void add(final int i, final Genotype genotype) { + throw new UnsupportedOperationException(); + } + + /** + * Adds all of the genotypes to this context + * + * See {@link #add(Genotype)} for important information about this functions + * constraints and performance costs + * + * @param genotypes + * @return + */ + @Override + @Requires("! 
containsAny(genotypes)") + @Ensures("noDups(getGenotypes())") + public boolean addAll(final Collection genotypes) { + checkImmutability(); + invalidateSampleOrdering(); + + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + int pos = size(); + for ( final Genotype g : genotypes ) { + sampleNameToOffset.put(g.getSampleName(), pos++); + } + } + + return getGenotypes().addAll(genotypes); + } + + @Override + public boolean addAll(final int i, final Collection genotypes) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean contains(final Object o) { + return getGenotypes().contains(o); + } + + @Override + public boolean containsAll(final Collection objects) { + return getGenotypes().containsAll(objects); + } + + private boolean containsAny(final Collection genotypes) { + for ( final Genotype g : genotypes ) { + if ( contains(g) ) return true; + } + return false; + } + + @Override + public Genotype get(final int i) { + return getGenotypes().get(i); + } + + public Genotype get(final String sampleName) { + Integer offset = getSampleI(sampleName); + return offset == null ? 
null : getGenotypes().get(offset); + } + + private Integer getSampleI(final String sampleName) { + ensureSampleNameMap(); + return sampleNameToOffset.get(sampleName); + } + + @Override + public int indexOf(final Object o) { + return getGenotypes().indexOf(o); + } + + @Override + public Iterator iterator() { + return getGenotypes().iterator(); + } + + @Override + public int lastIndexOf(final Object o) { + return getGenotypes().lastIndexOf(o); + } + + @Override + public ListIterator listIterator() { + // todo -- must be immutable + throw new UnsupportedOperationException(); +// return genotypes.listIterator(); + } + + @Override + public ListIterator listIterator(final int i) { + // todo -- must be immutable + throw new UnsupportedOperationException(); +// return genotypes.listIterator(i); + } + + /** + * Note that remove requires us to invalidate our sample -> index + * cache. The loop: + * + * GenotypesContext gc = ... + * for ( sample in samples ) + * if ( gc.containsSample(sample) ) + * gc.remove(sample) + * + * is extremely inefficient, as each call to remove invalidates the cache + * and containsSample requires us to rebuild it, an O(n) operation. + * + * If you must remove many samples from the GC, use either removeAll or retainAll + * to avoid this O(n * m) operation. 
+ * + * @param i + * @return + */ + @Override + public Genotype remove(final int i) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().remove(i); + } + + /** + * See for important warning {@link this.remove(Integer)} + * @param o + * @return + */ + @Override + public boolean remove(final Object o) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().remove(o); + } + + @Override + public boolean removeAll(final Collection objects) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().removeAll(objects); + } + + @Override + public boolean retainAll(final Collection objects) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().retainAll(objects); + } + + @Override + @Ensures("noDups(getGenotypes())") + public Genotype set(final int i, final Genotype genotype) { + checkImmutability(); + final Genotype prev = getGenotypes().set(i, genotype); + + invalidateSampleOrdering(); + if ( sampleNameToOffset != null ) { + // update the name map by removing the old entry and replacing it with the new one + sampleNameToOffset.remove(prev.getSampleName()); + sampleNameToOffset.put(genotype.getSampleName(), i); + } + + return prev; + } + + /** + * Replaces the genotype in this context -- note for efficiency + * reasons we do not add the genotype if it's not present. The + * return value will be null indicating this happened. + * + * Note this operation is preserves the map cache Sample -> Offset but + * invalidates the sorted list of samples. Using replace within a loop + * containing any of the SampleNameInOrder operation requires an O(n log n) + * resorting after each replace operation. 
+ * + * @param genotype a non null genotype to bind in this context + * @return null if genotype was not added, otherwise returns the previous genotype + */ + @Requires("genotype != null") + public Genotype replace(final Genotype genotype) { + checkImmutability(); + Integer offset = getSampleI(genotype.getSampleName()); + if ( offset == null ) + return null; + else + return set(offset, genotype); + } + + @Override + public List subList(final int i, final int i1) { + return getGenotypes().subList(i, i1); + } + + @Override + public Object[] toArray() { + return getGenotypes().toArray(); + } + + @Override + public T[] toArray(final T[] ts) { + return getGenotypes().toArray(ts); + } + + /** + * Iterate over the Genotypes in this context in the order specified by sampleNamesInOrder + * + * @param sampleNamesInOrder a Iterable of String, containing exactly one entry for each Genotype sample name in + * this context + * @return a Iterable over the genotypes in this context. + */ + @Requires("sampleNamesInOrder != null") + public Iterable iterateInSampleNameOrder(final Iterable sampleNamesInOrder) { + return new Iterable() { + @Override + public Iterator iterator() { + return new InOrderIterator(sampleNamesInOrder.iterator()); + } + }; + } + + /** + * Iterate over the Genotypes in this context in their sample name order (A, B, C) + * regardless of the underlying order in the vector of genotypes + * @return a Iterable over the genotypes in this context. 
+ */ + public Iterable iterateInSampleNameOrder() { + return iterateInSampleNameOrder(getSampleNamesOrderedByName()); + } + + private final class InOrderIterator implements Iterator { + final Iterator sampleNamesInOrder; + + private InOrderIterator(final Iterator sampleNamesInOrder) { + this.sampleNamesInOrder = sampleNamesInOrder; + } + + @Override + public boolean hasNext() { + return sampleNamesInOrder.hasNext(); + } + + @Override + public Genotype next() { + return get(sampleNamesInOrder.next()); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * @return The set of sample names for all genotypes in this context, in arbitrary order + */ + @Ensures("result != null") + public Set getSampleNames() { + ensureSampleNameMap(); + return sampleNameToOffset.keySet(); + } + + /** + * @return The set of sample names for all genotypes in this context, in their natural ordering (A, B, C) + */ + @Ensures("result != null") + public List getSampleNamesOrderedByName() { + ensureSampleOrdering(); + return sampleNamesInOrder; + } + + @Requires("sample != null") + public boolean containsSample(final String sample) { + ensureSampleNameMap(); + return sampleNameToOffset.containsKey(sample); + } + + @Requires("samples != null") + public boolean containsSamples(final Collection samples) { + return getSampleNames().containsAll(samples); + } + + /** + * Return a freshly allocated subcontext of this context containing only the samples + * listed in samples. Note that samples can contain names not in this context, they + * will just be ignored. 
+ * + * @param samples + * @return + */ + @Requires("samples != null") + @Ensures("result != null") + public GenotypesContext subsetToSamples( final Set samples ) { + final int nSamples = samples.size(); + final int nGenotypes = size(); + + if ( nSamples == nGenotypes ) + return this; + else if ( nSamples == 0 ) + return NO_GENOTYPES; + else { // nGenotypes < nSamples + final GenotypesContext subset = create(samples.size()); + for ( final String sample : samples ) { + subset.add(get(sample)); + } + return subset; + } + } + + @Override + public String toString() { + final List gS = new ArrayList(); + for ( final Genotype g : this.iterateInSampleNameOrder() ) + gS.add(g.toString()); + return "[" + join(",", gS) + "]"; + } + + // copied from Utils + private static String join(final String separator, final Collection objects) { + if (objects.isEmpty()) { // fast path for empty collection + return ""; + } else { + final Iterator iter = objects.iterator(); + final T first = iter.next(); + + if ( ! iter.hasNext() ) // fast path for singleton collections + return first.toString(); + else { // full path for 2+ collection that actually need a join + final StringBuilder ret = new StringBuilder(first.toString()); + while(iter.hasNext()) { + ret.append(separator); + ret.append(iter.next().toString()); + } + return ret.toString(); + } + } + } + + protected final static boolean noDups(Collection genotypes) { + Set names = new HashSet(genotypes.size()); + for ( final Genotype g : genotypes ) { + if ( names.contains(g.getSampleName()) ) + return false; + names.add(g.getSampleName()); + } + + return true; + } + + protected final static boolean sameSamples(List genotypes, Collection sampleNamesInOrder) { + Set names = new HashSet(sampleNamesInOrder); + if ( names.size() != sampleNamesInOrder.size() ) + return false; + if ( genotypes.size() != names.size() ) + return false; + + for ( final Genotype g : genotypes ) + if ( ! 
names.contains(g.getSampleName()) ) + return false; + + return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java new file mode 100644 index 000000000..ce0422352 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Lazy-loading GenotypesContext. A lazy-loading context has access to the + * VCFParser and a unparsed string of genotype data. 
If the user attempts to manipulate + * the genotypes contained in this context, we decode the data and become a full blown + * GenotypesContext. However, if the user never does this we are spared a lot of expense + * decoding the genotypes unnecessarily. + */ +public class LazyGenotypesContext extends GenotypesContext { + /** The LazyParser we'll use to decode unparsedGenotypeData if necessary */ + final LazyParser parser; + + Object unparsedGenotypeData; + + /** + * nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data + * (known already in the parser). Useful for isEmpty and size() optimizations + */ + final int nUnparsedGenotypes; + + /** + * True if we've already decoded the values in unparsedGenotypeData + */ + boolean loaded = false; + + private final static ArrayList EMPTY = new ArrayList(0); + + /** + * Simple lazy parser interface. Provide an object implementing this + * interface to LazyGenotypesContext, and it's parse method will be called + * when the use of the lazy context requires the underlying genotypes data + * be parsed into Genotype objects. 
The data argument is the data provided + * to the LazyGenotypesContext holding encoded genotypes data + */ + public interface LazyParser { + @Requires("data != null") + @Ensures("result != null") + public LazyData parse(Object data); + } + + /** + * Returns the data used in the full GenotypesContext constructor + * + * {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List)} + */ + public static class LazyData { + final ArrayList genotypes; + final Map sampleNameToOffset; + final List sampleNamesInOrder; + + @Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null", + "sameSamples(genotypes, sampleNamesInOrder)", + "sameSamples(genotypes, sampleNameToOffset.keySet())"}) + public LazyData(final ArrayList genotypes, + final List sampleNamesInOrder, + final Map sampleNameToOffset) { + this.genotypes = genotypes; + this.sampleNamesInOrder = sampleNamesInOrder; + this.sampleNameToOffset = sampleNameToOffset; + } + } + + /** + * Creates a new lazy loading genotypes context using the LazyParser to create + * genotypes data on demand. + * + * @param parser the parser to be used to load on-demand genotypes data + * @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary + * @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data + */ + @Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"}) + public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) { + super(EMPTY); + this.parser = parser; + this.unparsedGenotypeData = unparsedGenotypeData; + this.nUnparsedGenotypes = nUnparsedGenotypes; + } + + /** + * Overrides the genotypes accessor. If we haven't already, decode the genotypes data + * and store the decoded results in the appropriate variables. Otherwise we just + * returned the decoded result directly. 
Note some care needs to be taken here as + * the value in notToBeDirectlyAccessedGenotypes may diverge from what would be produced + * by decode, if after the first decode the genotypes themselves are replaced + * @return + */ + @Override + @Ensures("result != null") + protected ArrayList getGenotypes() { + decode(); + return notToBeDirectlyAccessedGenotypes; + } + + /** + * Force us to decode the genotypes, if not already done + */ + public void decode() { + if ( ! loaded ) { + //System.out.printf("Loading genotypes... %s:%d%n", contig, start); + LazyData parsed = parser.parse(unparsedGenotypeData); + notToBeDirectlyAccessedGenotypes = parsed.genotypes; + sampleNamesInOrder = parsed.sampleNamesInOrder; + sampleNameToOffset = parsed.sampleNameToOffset; + loaded = true; + unparsedGenotypeData = null; // don't hold the unparsed data any longer + + // warning -- this path allows us to create a VariantContext that doesn't run validateGenotypes() + // That said, it's not such an important routine -- it's just checking that the genotypes + // are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec + } + } + + /** + * Overrides the ensure* functionality. If the data hasn't been loaded + * yet and we want to build the cache, just decode it and we're done. If we've + * already decoded the data, though, go through the super class + */ + @Override + protected synchronized void ensureSampleNameMap() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleNameMap(); + } + } + + @Override + protected synchronized void ensureSampleOrdering() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleOrdering(); + } + } + + @Override + protected void invalidateSampleNameMap() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! 
loaded ) decode(); + super.invalidateSampleNameMap(); + } + + @Override + protected void invalidateSampleOrdering() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! loaded ) decode(); + super.invalidateSampleOrdering(); + } + + @Override + public boolean isEmpty() { + // optimization -- we know the number of samples in the unparsed data, so use it here to + // avoid parsing just to know if the genotypes context is empty + return loaded ? super.isEmpty() : nUnparsedGenotypes == 0; + } + + @Override + public int size() { + // optimization -- we know the number of samples in the unparsed data, so use it here to + // avoid parsing just to know the size of the context + return loaded ? super.size() : nUnparsedGenotypes; + } + + public Object getUnparsedGenotypeData() { + return unparsedGenotypeData; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java deleted file mode 100755 index 14419a2a0..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext; - -import java.util.*; - -/** - * This class emcompasses all the basic information about a genotype. It is immutable. 
- * - * @author Mark DePristo - */ -public class MutableGenotype extends Genotype { - public MutableGenotype(Genotype parent) { - super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); - } - - public MutableGenotype(String sampleName, Genotype parent) { - super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); - } - - - public MutableGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { - super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased); - } - - public MutableGenotype(String sampleName, List alleles, double negLog10PError) { - super(sampleName, alleles, negLog10PError); - } - - public MutableGenotype(String sampleName, List alleles) { - super(sampleName, alleles); - } - - public Genotype unmodifiableGenotype() { - return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased()); - } - - - /** - * - * @param alleles list of alleles - */ - public void setAlleles(List alleles) { - this.alleles = new ArrayList(alleles); - validate(); - } - - public void setPhase(boolean isPhased) { - super.isPhased = isPhased; - } - - // --------------------------------------------------------------------------------------------------------- - // - // InferredGeneticContext mutation operators - // - // --------------------------------------------------------------------------------------------------------- - public void setName(String name) { commonInfo.setName(name); } - public void addFilter(String filter) { commonInfo.addFilter(filter); } - public void addFilters(Collection filters) { commonInfo.addFilters(filters); } - public void clearFilters() { commonInfo.clearFilters(); } - public void setFilters(Collection filters) { commonInfo.setFilters(filters); } - public 
void setAttributes(Map map) { commonInfo.setAttributes(map); } - public void clearAttributes() { commonInfo.clearAttributes(); } - public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } - public void removeAttribute(String key) { commonInfo.removeAttribute(key); } - public void putAttributes(Map map) { commonInfo.putAttributes(map); } - public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } - public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java deleted file mode 100755 index a752f4a1b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java +++ /dev/null @@ -1,213 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext; - - -import java.util.Collection; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; - -/** - * Mutable version of VariantContext - * - * @author depristo - */ -public class MutableVariantContext extends VariantContext { - // --------------------------------------------------------------------------------------------------------- - // - // constructors - // - // --------------------------------------------------------------------------------------------------------- - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set 
filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(VariantContext parent) { - super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); - } - - /** - * Sets the alleles segregating in this context to the collect of alleles. Each of which must be unique according - * to equals() in Allele. Validate() should be called when you are done modifying the context. - * - * @param alleles - */ - public void setAlleles(Collection alleles) { - this.alleles.clear(); - for ( Allele a : alleles ) - addAllele(a); - } - - /** - * Adds allele to the segregating allele list in this context to the collection of alleles. The new - * allele must be be unique according to equals() in Allele. - * Validate() should be called when you are done modifying the context. - * - * @param allele - */ - public void addAllele(Allele allele) { - final boolean allowDuplicates = false; // used to be a parameter - - type = null; - - for ( Allele a : alleles ) { - if ( a.basesMatch(allele) && ! 
allowDuplicates ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this); - } - - // we are a novel allele - alleles.add(allele); - } - - public void clearGenotypes() { - genotypes = new TreeMap(); - } - - /** - * Adds this single genotype to the context, not allowing duplicate genotypes to be added - * @param genotype - */ - public void addGenotypes(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype, false); - } - - /** - * Adds these genotypes to the context, not allowing duplicate genotypes to be added - * @param genotypes - */ - public void addGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) { - addGenotype(g); - } - } - - /** - * Adds these genotype to the context, not allowing duplicate genotypes to be added. - * @param genotypes - */ - public void addGenotypes(Map genotypes) { - - for ( Map.Entry elt : genotypes.entrySet() ) { - addGenotype(elt.getValue()); - } - } - - /** - * Adds these genotypes to the context. - * - * @param genotypes - */ - public void putGenotypes(Map genotypes) { - for ( Map.Entry g : genotypes.entrySet() ) - putGenotype(g.getKey(), g.getValue()); - } - - /** - * Adds these genotypes to the context. - * - * @param genotypes - */ - public void putGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) - putGenotype(g); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(Genotype genotype) { - addGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, false); - } - - /** - * Adds this genotype to the context. 
- * - * @param genotype - */ - public void putGenotype(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context. - * - * @param genotype - */ - public void putGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, true); - } - - private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { - if ( hasGenotype(sampleName) && ! allowOverwrites ) - throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this); - - if ( ! sampleName.equals(genotype.getSampleName()) ) - throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype); - - this.genotypes.put(sampleName, genotype); - } - - /** - * Removes the binding from sampleName to genotype. If this doesn't exist, throws an IllegalArgumentException - * @param sampleName - */ - public void removeGenotype(String sampleName) { - if ( ! this.genotypes.containsKey(sampleName) ) - throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes); - - this.genotypes.remove(sampleName); - } - - /** - * Removes genotype from the context. 
If this doesn't exist, throws an IllegalArgumentException - * @param genotype - */ - public void removeGenotype(Genotype genotype) { - removeGenotype(genotype.getSampleName()); - } - - // todo -- add replace genotype routine - - // --------------------------------------------------------------------------------------------------------- - // - // InferredGeneticContext mutation operators - // - // --------------------------------------------------------------------------------------------------------- - - public void setSource(String source) { commonInfo.setName(source); } - public void addFilter(String filter) { commonInfo.addFilter(filter); } - public void addFilters(Collection filters) { commonInfo.addFilters(filters); } - public void clearFilters() { commonInfo.clearFilters(); } - public void setFilters(Collection filters) { commonInfo.setFilters(filters); } - public void setAttributes(Map map) { commonInfo.setAttributes(map); } - public void clearAttributes() { commonInfo.clearAttributes(); } - public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } - public void removeAttribute(String key) { commonInfo.removeAttribute(key); } - public void putAttributes(Map map) { commonInfo.putAttributes(map); } - public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } - public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } - public void setID(String id) { putAttribute(ID_KEY, id, true); } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index f52a7087b..247e412dd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -4,7 +4,6 
@@ import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -130,17 +129,17 @@ import java.util.*; * *
  * vc.hasGenotypes()
- * vc.isMonomorphic()
- * vc.isPolymorphic()
+ * vc.isMonomorphicInSamples()
+ * vc.isPolymorphicInSamples()
  * vc.getSamples().size()
  *
  * vc.getGenotypes()
  * vc.getGenotypes().get("g1")
  * vc.hasGenotype("g1")
  *
- * vc.getChromosomeCount()
- * vc.getChromosomeCount(Aref)
- * vc.getChromosomeCount(T)
+ * vc.getCalledChrCount()
+ * vc.getCalledChrCount(Aref)
+ * vc.getCalledChrCount(T)
  * 
* * === NO_CALL alleles === @@ -162,20 +161,21 @@ import java.util.*; * @author depristo */ public class VariantContext implements Feature { // to enable tribble intergration - protected InferredGeneticContext commonInfo = null; - public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; - public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; - public final static String ID_KEY = "ID"; + protected CommonInfo commonInfo = null; + public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; + + @Deprecated // ID is no longer stored in the attributes map + private final static String ID_KEY = "ID"; private final Byte REFERENCE_BASE_FOR_INDEL; public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ - protected String contig; - protected long start; - protected long stop; + final protected String contig; + final protected long start; + final protected long stop; + private final String ID; /** The type (cached for performance reasons) of this context */ protected Type type = null; @@ -184,12 +184,12 @@ public class VariantContext implements Feature { // to enable tribble intergrati final protected List alleles; /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ - protected Map genotypes = null; + protected GenotypesContext genotypes = null; /** Counts for each of the possible Genotype types in this context */ protected int[] genotypeCounts = null; - public final static Map NO_GENOTYPES = Collections.unmodifiableMap(new HashMap()); + public final static GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES; // a fast cached access point to the ref / alt alleles for biallelic case private Allele REF = null; @@ -197,124 +197,41 @@ public class VariantContext implements Feature { 
// to enable tribble intergrati // set to the alt allele when biallelic, otherwise == null private Allele ALT = null; - // were filters applied? - private boolean filtersWereAppliedToContext; + /* cached monomorphic value: null -> not yet computed, False, True */ + private Boolean monomorphic = null; // --------------------------------------------------------------------------------------------------------- // - // constructors + // validation mode // // --------------------------------------------------------------------------------------------------------- - - /** - * the complete constructor. Makes a complete VariantContext from its arguments - * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + public enum Validation { + REF_PADDING, + ALLELES, + GENOTYPES } - /** - * the complete constructor. 
Makes a complete VariantContext from its arguments - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); - } + private final static EnumSet ALL_VALIDATION = EnumSet.allOf(Validation.class); + private final static EnumSet NO_VALIDATION = EnumSet.noneOf(Validation.class); - /** - * Makes a VariantContext from its arguments without parsing the genotypes. - * Note that this constructor assumes that if there is genotype data, then it's been put into - * the attributes with the UNPARSED_GENOTYPE_MAP_KEY and that the codec has been added with the - * UNPARSED_GENOTYPE_PARSER_KEY. It doesn't validate that this is the case because it's possible - * that there is no genotype data. 
- * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); - } - - /** - * Create a new VariantContext - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes set - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? 
genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); - } - - /** - * Create a new variant context with genotypes but without Perror, filters, and attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } + // --------------------------------------------------------------------------------------------------------- + // + // constructors: see VariantContextBuilder + // + // --------------------------------------------------------------------------------------------------------- /** * Copy constructor * * @param other the VariantContext to copy */ - public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? 
other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); + protected VariantContext(VariantContext other) { + this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(), + other.getAlleles(), other.getGenotypes(), other.getLog10PError(), + other.getFiltersMaybeNull(), + other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, + NO_VALIDATION); } /** @@ -326,40 +243,44 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param stop the stop reference base (one based) * @param alleles alleles * @param genotypes genotypes map - * @param negLog10PError qual + * @param log10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes * @param referenceBaseForIndel padded reference base - * @param genotypesAreUnparsed true if the genotypes have not yet been parsed + * @param validationToPerform set of validation steps to take */ - private VariantContext(String source, String contig, long start, long stop, - Collection alleles, Map genotypes, - double negLog10PError, Set filters, Map attributes, - Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { + protected VariantContext(String source, String ID, + String contig, long start, long stop, + Collection alleles, GenotypesContext genotypes, + double log10PError, Set filters, Map attributes, + Byte referenceBaseForIndel, + EnumSet validationToPerform ) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; this.stop = stop; - if ( !genotypesAreUnparsed && attributes != null ) { - if ( attributes.containsKey(UNPARSED_GENOTYPE_MAP_KEY) ) - attributes.remove(UNPARSED_GENOTYPE_MAP_KEY); - if ( attributes.containsKey(UNPARSED_GENOTYPE_PARSER_KEY) ) - attributes.remove(UNPARSED_GENOTYPE_PARSER_KEY); - } + // intern for efficiency. 
equals calls will generate NPE if ID is inappropriately passed in as null + if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string"); + this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID; - this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); - filtersWereAppliedToContext = filters != null; + this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; + // todo -- remove me when this check is no longer necessary + if ( this.commonInfo.hasAttribute(ID_KEY) ) + throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. Please use provided constructor argument ID"); + if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles this.alleles = makeAlleles(alleles); - - if ( genotypes == null ) { genotypes = NO_GENOTYPES; } - this.genotypes = Collections.unmodifiableMap(genotypes); + if ( genotypes == null || genotypes == NO_GENOTYPES ) { + this.genotypes = NO_GENOTYPES; + } else { + this.genotypes = genotypes.immutable(); + } // cache the REF and ALT alleles int nAlleles = alleles.size(); @@ -371,39 +292,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati } } - validate(); - } - - // --------------------------------------------------------------------------------------------------------- - // - // Partial-cloning routines (because Variant Context is immutable). - // Note that we don't call vc.getGenotypes() because that triggers the lazy loading. - // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it. 
- // - // --------------------------------------------------------------------------------------------------------- - - public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); - } - - public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); - } - - public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); - } - - public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); - } - - public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes(), b, true); - } - - public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); + if ( ! validationToPerform.isEmpty() ) { + validate(validationToPerform); + } } // --------------------------------------------------------------------------------------------------------- @@ -412,55 +303,32 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotype - * genotype and alleles in genotype. This is the right way to test if a single genotype is actually - * variant or not. - * - * @param genotype genotype - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Genotype genotype) { - return subContextFromGenotypes(Arrays.asList(genotype)); + public VariantContext subContextFromSamples(Set sampleNames, Collection alleles) { + VariantContextBuilder builder = new VariantContextBuilder(this); + return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make(); } - - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes and alleles in these genotypes. This is the right way to test if a single genotype is actually - * variant or not. 
- * - * @param genotypes genotypes - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes) { - return subContextFromGenotypes(genotypes, allelesOfGenotypes(genotypes)) ; + public VariantContext subContextFromSamples(Set sampleNames) { + VariantContextBuilder builder = new VariantContextBuilder(this); + GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames); + return builder.genotypes(newGenotypes).alleles(allelesOfGenotypes(newGenotypes)).make(); } - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes. Also, the resulting variant context will contain the alleles provided, not only those found in genotypes - * - * @param genotypes genotypes - * @param alleles the set of allele segregating alleles at this site. Must include those in genotypes, but may be more - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes, Collection alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? 
getFilters() : null, getAttributes(), getReferenceBaseForIndel()); + public VariantContext subContextFromSample(String sampleName) { + return subContextFromSamples(Collections.singleton(sampleName)); } - /** * helper routine for subcontext * @param genotypes genotypes * @return allele set */ - private Set allelesOfGenotypes(Collection genotypes) { - Set alleles = new HashSet(); + private final Set allelesOfGenotypes(Collection genotypes) { + final Set alleles = new HashSet(); boolean addedref = false; - for ( Genotype g : genotypes ) { - for ( Allele a : g.getAlleles() ) { + for ( final Genotype g : genotypes ) { + for ( final Allele a : g.getAlleles() ) { addedref = addedref || a.isReference(); if ( a.isCalled() ) alleles.add(a); @@ -628,11 +496,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- public boolean hasID() { - return commonInfo.hasAttribute(ID_KEY); + return getID() != VCFConstants.EMPTY_ID_FIELD; + } + + public boolean emptyID() { + return ! 
hasID(); } public String getID() { - return (String)commonInfo.getAttribute(ID_KEY); + return ID; } public boolean hasReferenceBaseForIndel() { @@ -650,12 +522,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- public String getSource() { return commonInfo.getName(); } + public Set getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); } public Set getFilters() { return commonInfo.getFilters(); } public boolean isFiltered() { return commonInfo.isFiltered(); } public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); } + public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } + public double getLog10PError() { return commonInfo.getLog10PError(); } public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } @@ -811,35 +684,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- - private void loadGenotypes() { - if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { - if ( genotypes == null ) - genotypes = NO_GENOTYPES; - return; - } - - Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - if ( parserObj == null || !(parserObj instanceof VCFParser) ) - throw new IllegalStateException("There is no VCF parser stored to unparse the genotype data"); - VCFParser parser = (VCFParser)parserObj; - - Object mapObj = 
getAttribute(UNPARSED_GENOTYPE_MAP_KEY); - if ( mapObj == null ) - throw new IllegalStateException("There is no mapping string stored to unparse the genotype data"); - - genotypes = parser.createGenotypeMap(mapObj.toString(), new ArrayList(alleles), getChr(), getStart()); - - commonInfo.removeAttribute(UNPARSED_GENOTYPE_MAP_KEY); - commonInfo.removeAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - - validateGenotypes(); - } - /** * @return the number of samples in the context */ public int getNSamples() { - loadGenotypes(); return genotypes.size(); } @@ -847,31 +695,26 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return true if the context has associated genotypes */ public boolean hasGenotypes() { - loadGenotypes(); - return genotypes.size() > 0; + return ! genotypes.isEmpty(); } public boolean hasGenotypes(Collection sampleNames) { - loadGenotypes(); - for ( String name : sampleNames ) { - if ( ! genotypes.containsKey(name) ) - return false; - } - return true; + return genotypes.containsSamples(sampleNames); } /** * @return set of all Genotypes associated with this context */ - public Map getGenotypes() { - loadGenotypes(); + public GenotypesContext getGenotypes() { return genotypes; } - public List getGenotypesSortedByName() { - loadGenotypes(); - Collection types = new TreeMap(genotypes).values(); - return new ArrayList(types); + public Iterable getGenotypesOrderedByName() { + return genotypes.iterateInSampleNameOrder(); + } + + public Iterable getGenotypesOrderedBy(Iterable sampleOrdering) { + return genotypes.iterateInSampleNameOrder(sampleOrdering); } /** @@ -882,37 +725,38 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return * @throws IllegalArgumentException if sampleName isn't bound to a genotype */ - public Map getGenotypes(String sampleName) { - return getGenotypes(Arrays.asList(sampleName)); + public GenotypesContext getGenotypes(String sampleName) { + return 
getGenotypes(Collections.singleton(sampleName)); } /** * Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map * for consistency with the multi-get function. * + * For testing convenience only + * * @param sampleNames a unique list of sample names * @return * @throws IllegalArgumentException if sampleName isn't bound to a genotype */ - public Map getGenotypes(Collection sampleNames) { - HashMap map = new HashMap(); - - for ( String name : sampleNames ) { - if ( map.containsKey(name) ) throw new IllegalArgumentException("Duplicate names detected in requested samples " + sampleNames); - final Genotype g = getGenotype(name); - if ( g != null ) { - map.put(name, g); - } - } - - return map; + protected GenotypesContext getGenotypes(Collection sampleNames) { + return getGenotypes().subsetToSamples(new HashSet(sampleNames)); } + public GenotypesContext getGenotypes(Set sampleNames) { + return getGenotypes().subsetToSamples(sampleNames); + } + + /** - * @return the set of all sample names in this context + * @return the set of all sample names in this context, not ordered */ public Set getSampleNames() { - return getGenotypes().keySet(); + return getGenotypes().getSampleNames(); + } + + public List getSampleNamesOrderedByName() { + return getGenotypes().getSampleNamesOrderedByName(); } /** @@ -925,24 +769,25 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public boolean hasGenotype(String sample) { - return getGenotypes().containsKey(sample); + return getGenotypes().containsSample(sample); } public Genotype getGenotype(int ith) { - return getGenotypesSortedByName().get(ith); + return genotypes.get(ith); } /** - * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS + * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS) * * @return chromosome count */ - public int getChromosomeCount() { + public int 
getCalledChrCount() { int n = 0; - for ( Genotype g : getGenotypes().values() ) { - n += g.isNoCall() ? 0 : g.getPloidy(); + for ( final Genotype g : getGenotypes() ) { + for ( final Allele a : g.getAlleles() ) + n += a.isNoCall() ? 0 : 1; } return n; @@ -954,10 +799,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param a allele * @return chromosome count */ - public int getChromosomeCount(Allele a) { + public int getCalledChrCount(Allele a) { int n = 0; - for ( Genotype g : getGenotypes().values() ) { + for ( final Genotype g : getGenotypes() ) { n += g.getAlleles(a).size(); } @@ -970,8 +815,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati * * @return true if it's monomorphic */ - public boolean isMonomorphic() { - return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); + public boolean isMonomorphicInSamples() { + if ( monomorphic == null ) + monomorphic = ! isVariant() || (hasGenotypes() && getCalledChrCount(getReference()) == getCalledChrCount()); + return monomorphic; } /** @@ -980,25 +827,16 @@ public class VariantContext implements Feature { // to enable tribble intergrati * * @return true if it's polymorphic */ - public boolean isPolymorphic() { - return ! isMonomorphic(); + public boolean isPolymorphicInSamples() { + return ! 
isMonomorphicInSamples(); } private void calculateGenotypeCounts() { if ( genotypeCounts == null ) { genotypeCounts = new int[Genotype.Type.values().length]; - for ( Genotype g : getGenotypes().values() ) { - if ( g.isNoCall() ) - genotypeCounts[Genotype.Type.NO_CALL.ordinal()]++; - else if ( g.isHomRef() ) - genotypeCounts[Genotype.Type.HOM_REF.ordinal()]++; - else if ( g.isHet() ) - genotypeCounts[Genotype.Type.HET.ordinal()]++; - else if ( g.isHomVar() ) - genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++; - else - genotypeCounts[Genotype.Type.MIXED.ordinal()]++; + for ( final Genotype g : getGenotypes() ) { + genotypeCounts[g.getType().ordinal()]++; } } } @@ -1094,7 +932,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public void validateRSIDs(Set rsIDs) { - if ( rsIDs != null && hasAttribute(VariantContext.ID_KEY) ) { + if ( rsIDs != null && hasID() ) { for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) { if ( id.startsWith("rs") && !rsIDs.contains(id) ) throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart())); @@ -1109,7 +947,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati List reportedAlleles = getAlleles(); Set observedAlleles = new HashSet(); observedAlleles.add(getReference()); - for ( Genotype g : getGenotypes().values() ) { + for ( final Genotype g : getGenotypes() ) { if ( g.isCalled() ) observedAlleles.addAll(g.getAlleles()); } @@ -1125,19 +963,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public void validateChromosomeCounts() { - Map observedAttrs = calculateChromosomeCounts(); - // AN if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); - int observedAN = (Integer)observedAttrs.get(VCFConstants.ALLELE_NUMBER_KEY); + int 
observedAN = getCalledChrCount(); if ( reportedAN != observedAN ) throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAN, observedAN)); } // AC if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - List observedACs = (List)observedAttrs.get(VCFConstants.ALLELE_COUNT_KEY); + ArrayList observedACs = new ArrayList(); + + // if there are alternate alleles, record the relevant tags + if ( getAlternateAlleles().size() > 0 ) { + for ( Allele allele : getAlternateAlleles() ) { + observedACs.add(getCalledChrCount(allele)); + } + } + else { // otherwise, set them to 0 + observedACs.add(0); + } + if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { Collections.sort(observedACs); List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); @@ -1158,54 +1005,20 @@ public class VariantContext implements Feature { // to enable tribble intergrati } } - private Map calculateChromosomeCounts() { - Map attributes = new HashMap(); - - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, getChromosomeCount()); - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - - // if there are alternate alleles, record the relevant tags - if ( getAlternateAlleles().size() > 0 ) { - for ( Allele allele : getAlternateAlleles() ) { - alleleCounts.add(getChromosomeCount(allele)); - alleleFreqs.add((double)getChromosomeCount(allele) / (double)getChromosomeCount()); - } - } - // otherwise, set them to 0 - else { - alleleCounts.add(0); - alleleFreqs.add(0.0); - } - - attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs); - return attributes; - } - // --------------------------------------------------------------------------------------------------------- // // validation: the normal validation routines are called automatically upon creation of the VC // 
// --------------------------------------------------------------------------------------------------------- - /** - * To be called by any modifying routines - */ - private boolean validate() { - return validate(true); - } - - private boolean validate(boolean throwException) { - try { - validateReferencePadding(); - validateAlleles(); - validateGenotypes(); - } catch ( IllegalArgumentException e ) { - if ( throwException ) - throw e; - else - return false; + private boolean validate(final EnumSet validationToPerform) { + for (final Validation val : validationToPerform ) { + switch (val) { + case ALLELES: validateAlleles(); break; + case REF_PADDING: validateReferencePadding(); break; + case GENOTYPES: validateGenotypes(); break; + default: throw new IllegalArgumentException("Unexpected validation mode " + val); + } } return true; @@ -1258,12 +1071,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati private void validateGenotypes() { if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); - for ( Map.Entry elt : this.genotypes.entrySet() ) { - String name = elt.getKey(); - Genotype g = elt.getValue(); - - if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); - + for ( final Genotype g : this.genotypes ) { if ( g.isAvailable() ) { for ( Allele gAllele : g.getAlleles() ) { if ( ! hasAllele(gAllele) && gAllele.isCalled() ) @@ -1352,7 +1160,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati public String toString() { return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s", getSource(), contig + ":" + (start - stop == 0 ? 
start : start + "-" + stop), this.getType(), - ParsingUtils.sortList(this.getAlleles()), ParsingUtils.sortedString(this.getAttributes()), this.getGenotypesSortedByName()); + ParsingUtils.sortList(this.getAlleles()), + ParsingUtils.sortedString(this.getAttributes()), + this.getGenotypes()); } // protected basic manipulation routines @@ -1386,16 +1196,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati return alleleList; } - public static Map genotypeCollectionToMap(Map dest, Collection genotypes) { - for ( Genotype g : genotypes ) { - if ( dest.containsKey(g.getSampleName() ) ) - throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g); - dest.put(g.getSampleName(), g); - } - - return dest; - } - // --------------------------------------------------------------------------------------------------------- // // tribble integration routines -- not for public consumption @@ -1413,8 +1213,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati return (int)stop; } - private boolean hasSymbolicAlleles() { - for (Allele a: getAlleles()) { + public boolean hasSymbolicAlleles() { + for (final Allele a: getAlleles()) { if (a.isSymbolic()) { return true; } @@ -1422,136 +1222,12 @@ public class VariantContext implements Feature { // to enable tribble intergrati return false; } - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { - - // see if we need to pad common reference base from all alleles - boolean padVC; - - // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. - // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). 
- long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; - if (inputVC.hasSymbolicAlleles()) - padVC = true; - else if (inputVC.getReference().length() == locLength) - padVC = false; - else if (inputVC.getReference().length() == locLength-1) - padVC = true; - else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + - " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - - // nothing to do if we don't need to pad bases - if (padVC) { - - if ( !inputVC.hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - - Byte refByte = inputVC.getReferenceBaseForIndel(); - - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - Map inputGenotypes = inputVC.getGenotypes(); - - for (Allele a : inputVC.getAlleles()) { - // get bases for current allele and create a new one with trimmed bases - if (a.isSymbolic()) { - alleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - alleles.add(Allele.create(newBases,a.isReference())); - } - } - - // now we can recreate new genotypes with trimmed alleles - for (String sample : inputVC.getSampleNames()) { - Genotype g = inputGenotypes.get(sample); - - List inAlleles = g.getAlleles(); - List newGenotypeAlleles = new ArrayList(); - for (Allele a : inAlleles) { - if (a.isCalled()) { - if (a.isSymbolic()) { - newGenotypeAlleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - 
newGenotypeAlleles.add(Allele.create(newBases,a.isReference())); - } - } - else { - // add no-call allele - newGenotypeAlleles.add(Allele.NO_CALL); - } - } - genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(), - g.getFilters(),g.getAttributes(),g.isPhased())); - - } - - // Do not change the filter state if filters were not applied to this context - Set inputVCFilters = inputVC.filtersWereAppliedToContext ? inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes(),refByte); - } - else - return inputVC; - - } - - public ArrayList getTwoAllelesWithHighestAlleleCounts() { - // first idea: get two alleles with highest AC - int maxAC1 = 0, maxAC2=0,maxAC1ind =0, maxAC2ind = 0; - int i=0; - int[] alleleCounts = new int[this.getAlleles().size()]; - ArrayList alleleArray = new ArrayList(); - for (Allele a:this.getAlleles()) { - int ac = this.getChromosomeCount(a); - if (ac >=maxAC1) { - maxAC1 = ac; - maxAC1ind = i; - } - alleleArray.add(a); - alleleCounts[i++] = ac; - } - // now get second best allele - for (i=0; i < alleleCounts.length; i++) { - if (i == maxAC1ind) - continue; - if (alleleCounts[i] >= maxAC2) { - maxAC2 = alleleCounts[i]; - maxAC2ind = i; - } - } - - Allele alleleA, alleleB; - if (alleleArray.get(maxAC1ind).isReference()) { - alleleA = alleleArray.get(maxAC1ind); - alleleB = alleleArray.get(maxAC2ind); - } - else if (alleleArray.get(maxAC2ind).isReference()) { - alleleA = alleleArray.get(maxAC2ind); - alleleB = alleleArray.get(maxAC1ind); - } else { - alleleA = alleleArray.get(maxAC1ind); - alleleB = alleleArray.get(maxAC2ind); - } - ArrayList a = new ArrayList(); - a.add(alleleA); - a.add(alleleB); - return a; - } public Allele getAltAlleleWithHighestAlleleCount() { // first idea: get two alleles with highest AC Allele best = null; int maxAC1 = 0; for (Allele 
a:this.getAlternateAlleles()) { - int ac = this.getChromosomeCount(a); + int ac = this.getCalledChrCount(a); if (ac >=maxAC1) { maxAC1 = ac; best = a; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java new file mode 100644 index 000000000..b79584df8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.*; +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + +import java.util.*; + +/** + * Builder class for VariantContext + * + * Some basic assumptions here: + * + * 1 -- data isn't protectively copied. If you provide an attribute map to + * the build, and modify it later, the builder will see this and so will any + * resulting variant contexts. It's best not to modify collections provided + * to a builder. + * + * 2 -- the system uses the standard builder model, allowing the simple construction idiom: + * + * builder.source("a").genotypes(gc).id("x").make() => VariantContext + * + * 3 -- The best way to copy a VariantContext is: + * + * new VariantContextBuilder(vc).make() => a copy of VC + * + * 4 -- validation of arguments is done at the during the final make() call, so a + * VariantContextBuilder can exist in an inconsistent state as long as those issues + * are resolved before the call to make() is issued. 
+ * + * @author depristo + */ +public class VariantContextBuilder { + // required fields + private String source = null; + private String contig = null; + private long start = -1; + private long stop = -1; + private Collection alleles = null; + + // optional -> these are set to the appropriate default value + private String ID = VCFConstants.EMPTY_ID_FIELD; + private GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES; + private double log10PError = VariantContext.NO_LOG10_PERROR; + private Set filters = null; + private Map attributes = null; + private boolean attributesCanBeModified = false; + private Byte referenceBaseForIndel = null; + + /** enum of what must be validated */ + final private EnumSet toValidate = EnumSet.noneOf(VariantContext.Validation.class); + + /** + * Create an empty VariantContextBuilder where all values adopt their default values. Note that + * source, chr, start, stop, and alleles must eventually be filled in, or the resulting VariantContext + * will throw an error. + */ + public VariantContextBuilder() {} + + /** + * Create an empty VariantContextBuilder where all values adopt their default values, but the bare min. + * of info (source, chr, start, stop, and alleles) have been provided to start. + */ + @Requires({"source != null", "contig != null", "start >= 0", "stop >= 0", + "alleles != null && !alleles.isEmpty()"}) + public VariantContextBuilder(String source, String contig, long start, long stop, Collection alleles) { + this.source = source; + this.contig = contig; + this.start = start; + this.stop = stop; + this.alleles = alleles; + toValidate.add(VariantContext.Validation.ALLELES); + } + + /** + * Returns a new builder based on parent -- the new VC will have all fields initialized + * to their corresponding values in parent. 
This is the best way to create a derived VariantContext + * + * @param parent + */ + public VariantContextBuilder(VariantContext parent) { + this.alleles = parent.alleles; + this.attributes = parent.getAttributes(); + this.attributesCanBeModified = false; + this.contig = parent.contig; + this.filters = parent.getFiltersMaybeNull(); + this.genotypes = parent.genotypes; + this.ID = parent.getID(); + this.log10PError = parent.getLog10PError(); + this.referenceBaseForIndel = parent.getReferenceBaseForIndel(); + this.source = parent.getSource(); + this.start = parent.getStart(); + this.stop = parent.getEnd(); + } + + /** + * Tells this builder to use this collection of alleles for the resulting VariantContext + * + * @param alleles + * @return this builder + */ + @Requires({"alleles != null", "!alleles.isEmpty()"}) + public VariantContextBuilder alleles(final Collection alleles) { + this.alleles = alleles; + toValidate.add(VariantContext.Validation.ALLELES); + return this; + } + + /** + * Tells this builder to use this map of attributes alleles for the resulting VariantContext + * + * Attributes can be null -> meaning there are no attributes. 
After + * calling this routine the builder assumes it can modify the attributes + * object here, if subsequent calls are made to set attribute values + * @param attributes + */ + public VariantContextBuilder attributes(final Map attributes) { + this.attributes = attributes; + this.attributesCanBeModified = true; + return this; + } + + /** + * Puts the key -> value mapping into this builder's attributes + * + * @param key + * @param value + * @return + */ + @Requires({"key != null"}) + @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()+1)"}) + public VariantContextBuilder attribute(final String key, final Object value) { + makeAttributesModifiable(); + attributes.put(key, value); + return this; + } + + /** + * Removes key if present in the attributes + * + * @param key + * @return + */ + @Requires({"key != null"}) + @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()-1)"}) + public VariantContextBuilder rmAttribute(final String key) { + makeAttributesModifiable(); + attributes.remove(key); + return this; + } + + /** + * Makes the attributes field modifiable. In many cases attributes is just a pointer to an immutable + * collection, so methods that want to add / remove records require the attributes to be copied to a + */ + private void makeAttributesModifiable() { + if ( ! attributesCanBeModified ) { + this.attributesCanBeModified = true; + this.attributes = new HashMap(attributes); + } + } + + /** + * This builder's filters are set to this value + * + * filters can be null -> meaning there are no filters + * @param filters + */ + public VariantContextBuilder filters(final Set filters) { + this.filters = filters; + return this; + } + + /** + * {@link #filters} + * + * @param filters + * @return + */ + public VariantContextBuilder filters(final String ... 
filters) { + filters(new HashSet(Arrays.asList(filters))); + return this; + } + + /** + * Tells this builder that the resulting VariantContext should have PASS filters + * + * @return + */ + public VariantContextBuilder passFilters() { + return filters(VariantContext.PASSES_FILTERS); + } + + /** + * Tells this builder that the resulting VariantContext be unfiltered + * + * @return + */ + public VariantContextBuilder unfiltered() { + this.filters = null; + return this; + } + + /** + * Tells this builder that the resulting VariantContext should use this genotypes GenotypeContext + * + * Note that genotypes can be null -> meaning there are no genotypes + * + * @param genotypes + */ + public VariantContextBuilder genotypes(final GenotypesContext genotypes) { + this.genotypes = genotypes; + if ( genotypes != null ) + toValidate.add(VariantContext.Validation.GENOTYPES); + return this; + } + + public VariantContextBuilder genotypesNoValidation(final GenotypesContext genotypes) { + this.genotypes = genotypes; + return this; + } + + /** + * Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes + * + * Note that genotypes can be null -> meaning there are no genotypes + * + * @param genotypes + */ + public VariantContextBuilder genotypes(final Collection genotypes) { + return genotypes(GenotypesContext.copy(genotypes)); + } + + /** + * Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes + * @param genotypes + */ + public VariantContextBuilder genotypes(final Genotype ... 
genotypes) { + return genotypes(GenotypesContext.copy(Arrays.asList(genotypes))); + } + + /** + * Tells this builder that the resulting VariantContext should not contain any GenotypeContext + */ + public VariantContextBuilder noGenotypes() { + this.genotypes = null; + return this; + } + + /** + * Tells us that the resulting VariantContext should have ID + * @param ID + * @return + */ + @Requires("ID != null") + public VariantContextBuilder id(final String ID) { + this.ID = ID; + return this; + } + + /** + * Tells us that the resulting VariantContext should not have an ID + * @return + */ + public VariantContextBuilder noID() { + return id(VCFConstants.EMPTY_ID_FIELD); + } + + /** + * Tells us that the resulting VariantContext should have log10PError + * @param log10PError + * @return + */ + @Requires("log10PError <= 0 || log10PError == VariantContext.NO_LOG10_PERROR") + public VariantContextBuilder log10PError(final double log10PError) { + this.log10PError = log10PError; + return this; + } + + /** + * Tells us that the resulting VariantContext should use this byte for the reference base + * Null means no refBase is available + * @param referenceBaseForIndel + */ + public VariantContextBuilder referenceBaseForIndel(final Byte referenceBaseForIndel) { + this.referenceBaseForIndel = referenceBaseForIndel; + toValidate.add(VariantContext.Validation.REF_PADDING); + return this; + } + + /** + * Tells us that the resulting VariantContext should have source field set to source + * @param source + * @return + */ + @Requires("source != null") + public VariantContextBuilder source(final String source) { + this.source = source; + return this; + } + + /** + * Tells us that the resulting VariantContext should have the specified location + * @param contig + * @param start + * @param stop + * @return + */ + @Requires({"contig != null", "start >= 0", "stop >= 0"}) + public VariantContextBuilder loc(final String contig, final long start, final long stop) { + this.contig = contig; + 
this.start = start; + this.stop = stop; + toValidate.add(VariantContext.Validation.ALLELES); + toValidate.add(VariantContext.Validation.REF_PADDING); + return this; + } + + /** + * Tells us that the resulting VariantContext should have the specified contig chr + * @param contig + * @return + */ + @Requires({"contig != null"}) + public VariantContextBuilder chr(final String contig) { + this.contig = contig; + return this; + } + + /** + * Tells us that the resulting VariantContext should have the specified contig start + * @param start + * @return + */ + @Requires({"start >= 0"}) + public VariantContextBuilder start(final long start) { + this.start = start; + toValidate.add(VariantContext.Validation.ALLELES); + toValidate.add(VariantContext.Validation.REF_PADDING); + return this; + } + + /** + * Tells us that the resulting VariantContext should have the specified contig stop + * @param stop + * @return + */ + @Requires({"stop >= 0"}) + public VariantContextBuilder stop(final long stop) { + this.stop = stop; + return this; + } + + /** + * Takes all of the builder data provided up to this point, and instantiates + * a freshly allocated VariantContext with all of the builder data. This + * VariantContext is validated as appropriate and if not failing QC (and + * throwing an exception) is returned. + * + * Note that this function can be called multiple times to create multiple + * VariantContexts from the same builder. 
+ */ + public VariantContext make() { + return new VariantContext(source, ID, contig, start, stop, alleles, + genotypes, log10PError, filters, attributes, + referenceBaseForIndel, toValidate); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 43f91041f..91a018c4e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -25,13 +25,10 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -58,84 +55,38 @@ public class VariantContextUtils { } /** - * Create a new VariantContext - * - * @param name name - * @param loc location - * @param alleles alleles - * @param genotypes genotypes set - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes != null ? 
VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles) { - return new VariantContext (name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, VariantContext.NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @param genotypes genotypes - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes) { - return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Copy constructor - * - * @param other the VariantContext to copy - * @return VariantContext object - */ - public static VariantContext toVC(VariantContext other) { - return new VariantContext(other.getSource(), other.getChr(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.getFilters(), other.getAttributes()); - } - - /** - * Update the attributes of the attributes map given the VariantContext to reflect the proper chromosome-based VCF tags + * Update the attributes of the attributes map given the VariantContext to reflect the + * proper chromosome-based VCF tags * * @param vc the VariantContext * @param attributes the attributes map to populate; must not be null; may contain old values * @param removeStaleValues should we remove stale values from the mapping? 
+ * @return the attributes map provided as input, returned for programming convenience */ - public static void calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { + public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { // if everyone is a no-call, remove the old attributes if requested - if ( vc.getChromosomeCount() == 0 && removeStaleValues ) { + if ( vc.getCalledChrCount() == 0 && removeStaleValues ) { if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) attributes.remove(VCFConstants.ALLELE_COUNT_KEY); if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); - return; + return attributes; } if ( vc.hasGenotypes() ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getChromosomeCount()); + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount()); // if there are alternate alleles, record the relevant tags if ( vc.getAlternateAlleles().size() > 0 ) { ArrayList alleleFreqs = new ArrayList(); ArrayList alleleCounts = new ArrayList(); - double totalChromosomes = (double)vc.getChromosomeCount(); + double totalChromosomes = (double)vc.getCalledChrCount(); for ( Allele allele : vc.getAlternateAlleles() ) { - int altChromosomes = vc.getChromosomeCount(allele); + int altChromosomes = vc.getCalledChrCount(allele); alleleCounts.add(altChromosomes); + // todo -- this is a performance problem String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); alleleFreqs.add(freq); } @@ -148,6 +99,54 @@ public class VariantContextUtils { attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); } } + + return attributes; + } + + /** + * Update the attributes of the attributes map in the VariantContextBuilder to reflect 
the proper + * chromosome-based VCF tags based on the current VC produced by builder.make() + * + * @param builder the VariantContextBuilder we are updating + * @param removeStaleValues should we remove stale values from the mapping? + */ + public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) { + final VariantContext vc = builder.make(); + + // if everyone is a no-call, remove the old attributes if requested + if ( vc.getCalledChrCount() == 0 && removeStaleValues ) { + if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) + builder.rmAttribute(VCFConstants.ALLELE_COUNT_KEY); + if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) + builder.rmAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) + builder.rmAttribute(VCFConstants.ALLELE_NUMBER_KEY); + return; + } + + if ( vc.hasGenotypes() ) { + builder.attribute(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount()); + + // if there are alternate alleles, record the relevant tags + if ( vc.getAlternateAlleles().size() > 0 ) { + ArrayList alleleFreqs = new ArrayList(); + ArrayList alleleCounts = new ArrayList(); + double totalChromosomes = (double)vc.getCalledChrCount(); + for ( Allele allele : vc.getAlternateAlleles() ) { + int altChromosomes = vc.getCalledChrCount(allele); + alleleCounts.add(altChromosomes); + String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); + alleleFreqs.add(freq); + } + + builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? 
alleleFreqs.get(0) : alleleFreqs); + } + else { + builder.attribute(VCFConstants.ALLELE_COUNT_KEY, 0); + builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); + } + } } private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { @@ -165,7 +164,81 @@ public class VariantContextUtils { Map attrs = new HashMap(g.getAttributes()); attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); + } + + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { + // see if we need to pad common reference base from all alleles + boolean padVC; + + // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. + // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). + long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; + if (inputVC.hasSymbolicAlleles()) + padVC = true; + else if (inputVC.getReference().length() == locLength) + padVC = false; + else if (inputVC.getReference().length() == locLength-1) + padVC = true; + else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); + + // nothing to do if we don't need to pad bases + if (padVC) { + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); + + Byte refByte = inputVC.getReferenceBaseForIndel(); + + List alleles = new ArrayList(); + + for (Allele a : inputVC.getAlleles()) { + // get bases for current allele and create a new one with trimmed bases + if (a.isSymbolic()) { + alleles.add(a); + } else { + String newBases; + if ( refBaseShouldBeAppliedToEndOfAlleles ) + newBases = a.getBaseString() + new String(new byte[]{refByte}); + else + newBases = new String(new byte[]{refByte}) + a.getBaseString(); + alleles.add(Allele.create(newBases,a.isReference())); + } + } + + // now we can recreate new genotypes with trimmed alleles + GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); + for (final Genotype g : inputVC.getGenotypes() ) { + List inAlleles = g.getAlleles(); + List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); + for (Allele a : inAlleles) { + if (a.isCalled()) { + if (a.isSymbolic()) { + newGenotypeAlleles.add(a); + } else { + String newBases; + if ( refBaseShouldBeAppliedToEndOfAlleles ) + newBases = a.getBaseString() + new String(new byte[]{refByte}); + else + newBases = new String(new byte[]{refByte}) + a.getBaseString(); + newGenotypeAlleles.add(Allele.create(newBases,a.isReference())); + } + } + else { + // add no-call allele + newGenotypeAlleles.add(Allele.NO_CALL); + } + } + genotypes.add(new Genotype(g.getSampleName(), newGenotypeAlleles, g.getLog10PError(), + g.getFilters(), g.getAttributes(), g.isPhased())); + + } + + return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); + } + else + return inputVC; + } /** @@ -296,7 +369,7 @@ public class VariantContextUtils { } public 
static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getChromosomeCount() == 0 ) + if ( vc.getCalledChrCount() == 0 ) return 0.0; return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); } @@ -309,7 +382,7 @@ public class VariantContextUtils { @Requires("vc != null") @Ensures("result != null") public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return VariantContext.modifyGenotypes(vc, null); + return new VariantContextBuilder(vc).noGenotypes().make(); } /** @@ -326,39 +399,42 @@ public class VariantContextUtils { return r; } - public static VariantContext pruneVariantContext(VariantContext vc) { - return pruneVariantContext(vc, null); + private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { + Map attributes = new HashMap(keysToPreserve.size()); + for ( final String key : keysToPreserve ) { + if ( igc.hasAttribute(key) ) + attributes.put(key, igc.getAttribute(key)); + } + return attributes; } - public static VariantContext pruneVariantContext(final VariantContext vc, final Collection keysToPreserve ) { - final MutableVariantContext mvc = new MutableVariantContext(vc); + /** + * @deprecated use variant context builder version instead + * @param vc + * @param keysToPreserve + * @return + */ + @Deprecated + public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { + return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); + } - if ( keysToPreserve == null || keysToPreserve.size() == 0 ) - mvc.clearAttributes(); - else { - final Map d = mvc.getAttributes(); - mvc.clearAttributes(); - for ( String key : keysToPreserve ) - if ( d.containsKey(key) ) - mvc.putAttribute(key, d.get(key)); + public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { + final VariantContext vc = builder.make(); + if ( 
keysToPreserve == null ) keysToPreserve = Collections.emptyList(); + + // VC info + final Map attributes = subsetAttributes(vc.commonInfo, keysToPreserve); + + // Genotypes + final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + Map genotypeAttributes = subsetAttributes(g.commonInfo, keysToPreserve); + genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.getFilters(), + genotypeAttributes, g.isPhased())); } - // this must be done as the ID is stored in the attributes field - if ( vc.hasID() ) mvc.setID(vc.getID()); - - Collection gs = mvc.getGenotypes().values(); - mvc.clearGenotypes(); - for ( Genotype g : gs ) { - MutableGenotype mg = new MutableGenotype(g); - mg.clearAttributes(); - if ( keysToPreserve != null ) - for ( String key : keysToPreserve ) - if ( g.hasAttribute(key) ) - mg.putAttribute(key, g.getAttribute(key)); - mvc.addGenotype(mg); - } - - return mvc; + return builder.genotypes(genotypes).attributes(attributes); } public enum GenotypeMergeType { @@ -391,75 +467,6 @@ public class VariantContextUtils { KEEP_IF_ALL_UNFILTERED } - /** - * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many - * VCs containing partial, extra genotype information which should be added to the master. For example, - * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing - * information per genotype. The master merge will add the PQ information from each genotype record, where - * appropriate, to the master VC. 
- * - * @param unsortedVCs collection of VCs - * @param masterName name of master VC - * @return master-merged VC - */ - public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { - VariantContext master = findMaster(unsortedVCs, masterName); - Map genotypes = master.getGenotypes(); - for (Genotype g : genotypes.values()) { - genotypes.put(g.getSampleName(), new MutableGenotype(g)); - } - - Map masterAttributes = new HashMap(master.getAttributes()); - - for (VariantContext vc : unsortedVCs) { - if (!vc.getSource().equals(masterName)) { - for (Genotype g : vc.getGenotypes().values()) { - MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); - for (Map.Entry attr : g.getAttributes().entrySet()) { - if (!masterG.hasAttribute(attr.getKey())) { - //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); - masterG.putAttribute(attr.getKey(), attr.getValue()); - } - } - - if (masterG.isPhased() != g.isPhased()) { - if (masterG.sameGenotype(g)) { - // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); - masterG.setAlleles(g.getAlleles()); - masterG.setPhase(g.isPhased()); - } - //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); - } - -// if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { -// System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); -// masterG.setNegLog10PError(g.getNegLog10PError()); -// } - - } - - for (Map.Entry attr : vc.getAttributes().entrySet()) { - if (!masterAttributes.containsKey(attr.getKey())) { - //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); - masterAttributes.put(attr.getKey(), attr.getValue()); - } - } - } - } - - return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), 
master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); - } - - private static VariantContext findMaster(Collection unsortedVCs, String masterName) { - for (VariantContext vc : unsortedVCs) { - if (vc.getSource().equals(masterName)) { - return vc; - } - } - - throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); - } - /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with @@ -503,7 +510,7 @@ public class VariantContextUtils { for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); + VCs.add(createVariantContextWithPaddedAlleles(vc, false)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -524,9 +531,9 @@ public class VariantContextUtils { int depth = 0; int maxAC = -1; final Map attributesWithMaxAC = new TreeMap(); - double negLog10PError = -1; + double log10PError = 1; VariantContext vcWithMaxAC = null; - Map genotypes = new TreeMap(); + GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; @@ -552,7 +559,7 @@ public class VariantContextUtils { mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); + log10PError = Math.min(log10PError, vc.isVariant() ? 
vc.getLog10PError() : 1); filters.addAll(vc.getFilters()); @@ -563,7 +570,7 @@ public class VariantContextUtils { // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() && ! vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); + if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator @@ -656,14 +663,17 @@ public class VariantContextUtils { if ( depth > 0 ) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - if ( ! rsIDs.isEmpty() ) { - attributes.put(VariantContext.ID_KEY, Utils.join(",", rsIDs)); - } + final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(loc.getContig(), loc.getStart(), loc.getStop()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); - VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? 
attributesWithMaxAC : attributes) ); // Trim the padded bases of all alleles if necessary - merged = createVariantContextWithTrimmedAlleles(merged); - + VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } @@ -698,6 +708,7 @@ public class VariantContextUtils { return true; } + public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -716,7 +727,7 @@ public class VariantContextUtils { // nothing to do if we don't need to trim bases if (trimVC) { List alleles = new ArrayList(); - Map genotypes = new TreeMap(); + GenotypesContext genotypes = GenotypesContext.create(); // set the reference base for indels in the attributes Map attributes = new TreeMap(inputVC.getAttributes()); @@ -750,9 +761,9 @@ public class VariantContextUtils { if (!hasNullAlleles) return inputVC; // now we can recreate new genotypes with trimmed alleles - for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { + for ( final Genotype genotype : inputVC.getGenotypes() ) { - List originalAlleles = sample.getValue().getAlleles(); + List originalAlleles = genotype.getAlleles(); List trimmedAlleles = new ArrayList(); for ( Allele a : originalAlleles ) { if ( a.isCalled() ) @@ -760,21 +771,22 @@ public class VariantContextUtils { else trimmedAlleles.add(Allele.NO_CALL); } - genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? 
inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); + final VariantContextBuilder builder = new VariantContextBuilder(inputVC); + return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } - public static Map stripPLs(Map genotypes) { - Map newGs = new HashMap(genotypes.size()); + public static GenotypesContext stripPLs(GenotypesContext genotypes) { + GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - for ( Map.Entry g : genotypes.entrySet() ) { - newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue()); + for ( final Genotype g : genotypes ) { + newGs.add(g.hasLikelihoods() ? removePLs(g) : g); } return newGs; @@ -951,20 +963,19 @@ public class VariantContextUtils { } } - private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { - for ( Genotype g : oneVC.getGenotypes().values() ) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + for ( Genotype g : oneVC.getGenotypes() ) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); - if ( ! mergedGenotypes.containsKey(name) ) { + if ( ! mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; if ( uniqifySamples || alleleMapping.needsRemapping() ) { - MutableGenotype mutG = new MutableGenotype(name, g); - if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); - newG = mutG; + final List alleles = alleleMapping.needsRemapping() ? 
alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new Genotype(name, alleles, g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased()); } - mergedGenotypes.put(name, newG); + mergedGenotypes.add(newG); } } } @@ -992,37 +1003,36 @@ public class VariantContextUtils { } // create new Genotype objects - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getValue().getAlleles() ) { + for ( Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes()); - + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); } public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { if ( allowedAttributes == null ) return vc; - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getValue().getAttributes().entrySet() ) { + for ( Map.Entry attr : genotype.getAttributes().entrySet() ) { if ( allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs)); + newGenotypes.add(Genotype.modifyAttributes(genotype, attrs)); } - return VariantContext.modifyGenotypes(vc, newGenotypes); + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { @@ -1055,355 +1065,10 @@ public class VariantContextUtils { return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } - public abstract static class AlleleMergeRule { - // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): - abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); - - public String toString() { - return "all samples are mergeable"; - } + public static final Set genotypeNames(final Collection genotypes) { + final Set names = new HashSet(genotypes.size()); + for ( final Genotype g : genotypes ) + names.add(g.getSampleName()); + return names; } - - // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record - 
- public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { - if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) - return null; - - // Check that it's logically possible to merge the VCs: - if (!allSamplesAreMergeable(vc1, vc2)) - return null; - - // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) - if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) - return null; - - return reallyMergeIntoMNP(vc1, vc2, referenceFile); - } - - private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { - int startInter = vc1.getEnd() + 1; - int endInter = vc2.getStart() - 1; - byte[] intermediateBases = null; - if (startInter <= endInter) { - intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); - StringUtil.toUpperCase(intermediateBases); - } - MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added - - Map mergedGenotypes = new HashMap(); - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - List mergedAllelesForSample = new LinkedList(); - - /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, - we preserve phase information (if any) relative to whatever precedes vc1: - */ - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); - mergedAllelesForSample.add(mergedAllele); - } - - double 
mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError()); - Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered - - Map mergedGtAttribs = new HashMap(); - PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); - if (phaseQual.PQ != null) - mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); - - Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); - mergedGenotypes.put(sample, mergedGt); - } - - String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource()); - double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError()); - Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered - Map mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2); - - VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); - - mergedAttribs = new HashMap(mergedVc.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true); - mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs); - - return mergedVc; - } - - private static class AlleleOneAndTwo { - private Allele all1; - private Allele all2; - - public AlleleOneAndTwo(Allele all1, Allele all2) { - this.all1 = all1; - this.all2 = all2; - } - - public int hashCode() { - return all1.hashCode() + all2.hashCode(); - } - - public boolean equals(Object other) { - if (!(other instanceof AlleleOneAndTwo)) - return false; - - AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; - return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); - } - } - - private static class MergedAllelesData { - private Map mergedAlleles; - private byte[] 
intermediateBases; - private int intermediateLength; - - public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { - this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo - this.intermediateBases = intermediateBases; - this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; - - this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); - } - - public Allele ensureMergedAllele(Allele all1, Allele all2) { - return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor - } - - private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { - AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); - Allele mergedAllele = mergedAlleles.get(all12); - - if (mergedAllele == null) { - byte[] bases1 = all1.getBases(); - byte[] bases2 = all2.getBases(); - - byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; - System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); - if (intermediateBases != null) - System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); - System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); - - mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); - mergedAlleles.put(all12, mergedAllele); - } - - return mergedAllele; - } - - public Set getAllMergedAlleles() { - return new HashSet(mergedAlleles.values()); - } - } - - private static String mergeVariantContextNames(String name1, String name2) { - return name1 + "_" + name2; - } - - private static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { - Map mergedAttribs = new HashMap(); - - List vcList = new LinkedList(); - vcList.add(vc1); - vcList.add(vc2); - - String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; - for (String 
orAttrib : MERGE_OR_ATTRIBS) { - boolean attribVal = false; - for (VariantContext vc : vcList) { - attribVal = vc.getAttributeAsBoolean(orAttrib, false); - if (attribVal) // already true, so no reason to continue: - break; - } - mergedAttribs.put(orAttrib, attribVal); - } - - // Merge ID fields: - String iDVal = null; - for (VariantContext vc : vcList) { - String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); - if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { - if (iDVal == null) - iDVal = val; - else - iDVal += VCFConstants.ID_FIELD_SEPARATOR + val; - } - } - if (iDVal != null) - mergedAttribs.put(VariantContext.ID_KEY, iDVal); - - return mergedAttribs; - } - - private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { - GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); - GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); - - if (!loc1.onSameContig(loc2)) - throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); - - if (!loc1.isBefore(loc2)) - throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); - - if (vc1.isFiltered() || vc2.isFiltered()) - return false; - - if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets - return false; - - if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) - return false; - - return true; - } - - private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { - for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { - Genotype gt = gtEntry.getValue(); - if (gt.isNoCall() || gt.isFiltered()) - return false; - } - - return true; - } - - // Assumes that vc1 and vc2 were already checked to have the same sample names: - - private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { - // Check that each sample's genotype in vc2 is 
uniquely appendable onto its genotype in vc1: - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom - return false; - } - - return true; - } - - public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { - if (gt1.getPloidy() != gt2.getPloidy()) - return false; - - /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. - - HOWEVER, EVEN if this is not the case, but gt1.isHom(), - it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. - */ - return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); - } - - private static class PhaseAndQuality { - public boolean isPhased; - public Double PQ = null; - - public PhaseAndQuality(Genotype gt) { - this.isPhased = gt.isPhased(); - if (this.isPhased) { - this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); - if ( this.PQ == -1 ) this.PQ = null; - } - } - } - - // Assumes that alleleSegregationIsKnown(gt1, gt2): - - private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { - if (gt2.isPhased() || gt2.isHom()) - return new PhaseAndQuality(gt1); // maintain the phase of gt1 - - if (!gt1.isHom()) - throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); - - /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype - - For example, if we're merging the third Genotype with the second one: - 0/1 - 1|1 - 0/1 - - Then, we want to output: - 0/1 - 1/2 - */ - return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] - } - - /* Checks if any sample has a MNP of ALT alleles 
(segregating together): - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate - return true; - } - } - - return false; - } - - /* Checks if all samples are consistent in their haplotypes: - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { - // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): - Map allele1ToAllele2 = new HashMap(); - Map allele2ToAllele1 = new HashMap(); - - // Note the segregation of the alleles for the reference genome: - allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); - allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); - - // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). 
- for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); - - Allele all1To2 = allele1ToAllele2.get(all1); - if (all1To2 == null) - allele1ToAllele2.put(all1, all2); - else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 - return false; - - Allele all2To1 = allele2ToAllele1.get(all2); - if (all2To1 == null) - allele2ToAllele1.put(all2, all1); - else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 - return false; - } - } - - return true; - } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java index a59ed7abe..ccce21f52 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java @@ -64,7 +64,7 @@ class VariantJEXLContext implements JexlContext { x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }}); x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }}); x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }}); - x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }}); + x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return -10 * vc.getLog10PError(); }}); x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return 
vc.getAlleles(); }}); x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }}); x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }}); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 8d7dd82ac..17a7d1974 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -49,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new 
LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java index dc3a6cafe..62c93bddd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -42,7 +43,7 @@ import java.util.Collections; public class MockLocusShard extends LocusShard { public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { super( genomeLocParser, - new SAMDataSource(Collections.emptyList(),genomeLocParser), + new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), intervals, null); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java deleted file mode 100755 index e41a6b3b7..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java +++ /dev/null @@ -1,223 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import static org.testng.Assert.fail; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import 
org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * - * User: aaron - * Date: Apr 8, 2009 - * Time: 8:14:23 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 - *

- * Class SAMBAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMBAMDataSourceUnitTest extends BaseTest { - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - StingSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testMergingTwoBAMFiles() { - logger.warn("Executing testMergingTwoBAMFiles"); - - // setup the test files - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - - ArrayList readcountPerShard = new ArrayList(); - ArrayList readcountPerShard2 = new ArrayList(); - - // count up the first hundred shards - int shardsToCount = 100; - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - - } - readcountPerShard.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - - // setup the data and the counter before our second run - readers.clear(); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - count = 0; - // the sharding strat. 
- data = new SAMDataSource(readers,genomeLocParser); - strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser); - - logger.debug("Pile two:"); - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - // can we leave? - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - } - - readcountPerShard2.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - /*int pos = 0; - for (; pos < 100; pos++) { - if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { - fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos)); - } - } */ - - } - - - - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java new file mode 100755 index 000000000..ba2d68ec9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to 
the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import static org.testng.Assert.fail; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * @author aaron + * @version 1.0 + * @date Apr 8, 2009 + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + false); + + Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + StingSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
+ fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 1e39fd26f..91c18078e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -36,9 +36,11 @@ import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.*; import java.util.*; @@ -65,9 +67,9 @@ public class RefMetaDataTrackerUnitTest { C = Allele.create("C"); G = Allele.create("G"); T = Allele.create("T"); - AC_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)); - AG_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)); - AT_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)); + AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make(); + AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make(); + AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make(); span10_10 = makeSpan(10, 10); span1_20 = makeSpan(1, 20); span10_20 = makeSpan(10, 20); diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index 02e1ba99a..f7be1d845 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -44,12 +44,5 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); - - GATKReportTable simpleMetricsByAC = report.getTable("SimpleMetricsByAC.metrics"); - Assert.assertEquals(simpleMetricsByAC.getVersion(), GATKReportVersion.V0_1); - Object simpleMetricsByACPK = simpleMetricsByAC.getPrimaryKey("none.eval.none.novel.ac2"); - Assert.assertEquals(simpleMetricsByAC.get(simpleMetricsByACPK, "AC"), "2"); - - Assert.assertFalse(simpleMetricsByAC.containsPrimaryKey("none.eval.none.novel.ac2.bad")); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 7f4d96add..9226f97e2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -5,14 +5,13 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import 
org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest { private List bamList; private Walker countReadWalker; private File output; - private long readSize = 100000; private TraverseReads traversalEngine = null; private IndexedFastaSequenceFile ref = null; @@ -117,18 +115,14 @@ public class TraverseReadsUnitTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testUnmappedReadCount() { - SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - ref.getSequenceDictionary(), - readSize, - genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); - while (shardStrategy.hasNext()) { + for(Shard shard: shardStrategy) { traversalEngine.startTimersIfNecessary(); - Shard shard = shardStrategy.next(); if (shard == null) { fail("Shard == null"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java index 462abeba1..5c8fa32a8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -33,7 +33,7 @@ public class SnpEffUnitTest { @Test public void testParseWellFormedEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); @@ -42,7 +42,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectNameEffect() { String effectName = "MADE_UP_EFFECT"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -51,7 +51,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectImpactEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -60,27 +60,27 @@ public class 
SnpEffUnitTest { @Test public void testParseWrongNumberOfMetadataFieldsEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); } @Test - public void testParseSnpEffWarningEffect() { + public void testParseSnpEffOneWarningOrErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); } @Test - public void testParseSnpEffErrorEffect() { + public void testParseSnpEffBothWarningAndErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! 
effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 189f643d4..1824789a9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("8e7de435105499cd71ffc099e268a83e")); + Arrays.asList("fbb656369eaa48153d127bd12db59d8f")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64b6804cb1e27826e3a47089349be581")); + Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -54,6 +54,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsNotAsking2() { + // 
this genotype annotations in this file are actually out of order. If you don't parse the genotypes + // they don't get reordered. It's a good test of the genotype ordering system. WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("f2ddfa8105c290b1f34b7a261a02a1ac")); @@ -64,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); + Arrays.asList("42dd979a0a931c18dc9be40308bac321")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -72,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); + Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -80,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - 
Arrays.asList("b49fe03aa4b675db80a9db38a3552c95")); + Arrays.asList("477eac07989593b58bb361f3429c085a")); executeTest("test exclude annotations", spec); } @@ -88,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); + Arrays.asList("062155edec46a8c52243475fbf3a2943")); executeTest("test overwriting header", spec); } @@ -96,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("16e3a1403fc376320d7c69492cad9345")); + Arrays.asList("06635f2dd91b539bfbce9bf7914d8e43")); executeTest("not passing it any reads", spec); } @@ -104,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); + Arrays.asList("820eeba1f6e3a0758a69d937c524a38e")); executeTest("getting DB tag with dbSNP", spec); } @@ -112,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - 
Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); + Arrays.asList("31cc2ce157dd20771418c08d6b3be1fa")); executeTest("getting DB tag with HM3", spec); } @@ -120,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31")); + Arrays.asList("074865f8f8c0ca7bfd58681f396c49e9")); executeTest("using expression", spec); } @@ -128,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("4a6f0675242f685e9072c1da5ad9e715")); + Arrays.asList("97b26db8135d083566fb585a677fbe8a")); executeTest("using expression with ID", spec); } @@ -148,9 +150,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("122321a85e448f21679f6ca15c5e22ad") + Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java index 1f3f8ebe6..3783525d1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java @@ -52,7 +52,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { @Test public void testCallableLociWalker2() { - String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100;1:10,000,110-10,000,120 -summary %s"; + String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e", "d287510eac04acf5a56f5cde2cba0e4a")); executeTest("formatBed by interval", spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index 9af39e92c..1c5db4262 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -12,25 +12,25 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest { String md5_1 = "328d2d52cedfdc52da7d1abff487633d"; WalkerTestSpec spec1a = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", 1, Arrays.asList(md5_1)); executeTest("testFastaReference", spec1a); WalkerTestSpec spec1b = new WalkerTestSpec( - "-T 
FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", + "-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", 1, Arrays.asList(md5_1)); executeTest("testFastaReference", spec1b); WalkerTestSpec spec2 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s", 1, Arrays.asList("0567b32ebdc26604ddf2a390de4579ac")); executeTest("testFastaAlternateReferenceIndels", spec2); WalkerTestSpec spec3 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500;1:10,029,200-10,029,500 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s", 1, Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641")); executeTest("testFastaAlternateReferenceSnps", spec3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 1cb43ceb1..2c04cebd4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -29,17 +29,23 @@ public class 
VariantFiltrationIntegrationTest extends WalkerTest { } @Test - public void testMasks() { + public void testMask1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF3 " + validationDataLocation + "vcfexample2.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("578f9e774784c25871678e6464fd212b")); executeTest("test mask all", spec1); + } + @Test + public void testMask2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("bfa86a674aefca1b13d341cb14ab3c4f")); executeTest("test mask some", spec2); + } + @Test + public void testMask3() { WalkerTestSpec spec3 = new WalkerTestSpec( baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("5939f80d14b32d88587373532d7b90e5")); @@ -71,12 +77,15 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { } @Test - public void testGenotypeFilters() { + public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("96b61e4543a73fe725e433f007260039")); executeTest("test genotype filter #1", spec1); + } + @Test + public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("6c8112ab17ce39c8022c891ae73bf38e")); diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java index 425b969e2..a87f121f6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java @@ -7,7 +7,7 @@ import org.testng.annotations.Test; import static java.lang.Math.log10; -public class GenotypeLikelihoodsUnitTest extends BaseTest { +public class GenotypePriorsUnitTest extends BaseTest { private final static double DELTA = 1e-8; @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b80f214b1..34e1ad30e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -30,20 +29,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - 
Arrays.asList("b27939251539439a382538e507e03507")); + Arrays.asList("286f0de92e4ce57986ba861390c6019d")); executeTest("test MultiSample Pilot1", spec); } @Test - public void testWithAllelesPassedIn() { + public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("8de2602679ffc92388da0b6cb4325ef6")); + Arrays.asList("ea5b5dcea3a6eef7ec60070b551c994e")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); + } + @Test + public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e")); + Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -51,7 +53,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("6762b72ae60155ad71738d7c76b80e4b")); + Arrays.asList("3ccce5d909f8f128e496f6841836e5f7")); executeTest("test SingleSample Pilot2", spec); } @@ -61,7 +63,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String 
COMPRESSED_OUTPUT_MD5 = "bc71dba7bbdb23e7d5cc60461fdd897b"; + private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864"; @Test public void testCompressedOutput() { @@ -82,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "b9504e446b9313559c3ed97add7e8dc1"; + String md5 = "95614280c565ad90f8c000376fef822c"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -113,8 +115,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "bb3f294eab3e2cf52c70e63b23aac5ee" ); - e.put( "--computeSLOD", "eb34979efaadba1e34bd82bcacf5c722" ); + e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); + e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -127,9 +129,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "d40114aa201aa33ff5f174f15b6b73af" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "3c681b053fd2280f3c42041d24243752" ); - e.put( "--output_mode EMIT_ALL_SITES", "eafa6d71c5ecd64dfee5d7a3f60e392e" ); + e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" ); + e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -143,12 +145,15 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("c71ca370947739cb7d87b59452be7a07")); + Arrays.asList("902327e8a45fe585c8dfd1a7c4fcf60f")); executeTest("test confidence 1", spec1); + } + @Test + public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("1c0a599d475cc7d5e745df6e9b6c0d29")); + Arrays.asList("2343ac8113791f4e79643b333b34afc8")); executeTest("test confidence 2", spec2); } @@ -160,8 +165,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "f84da90c310367bd51f2ab6e346fa3d8" ); - e.put( 1.0 / 1850, "5791e7fef40d4412b6d8f84e0a809c6c" ); + e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" ); + e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -185,7 +190,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("9cc9538ac83770e12bd0830d285bfbd0")); + Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); executeTest(String.format("test multiple technologies"), spec); } @@ -204,7 +209,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("eaf8043edb46dfbe9f97ae03baa797ed")); + Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); executeTest(String.format("test calling with BAQ"), spec); } @@ -223,7 +228,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); + Arrays.asList("a64d2e65b5927260e4ce0d948760cc5c")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -238,7 +243,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); + Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -251,7 +256,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); + Arrays.asList("69107157632714150fc068d412e31939")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -261,7 +266,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); + Arrays.asList("4ffda07590e06d58ed867ae326d74b2d")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -271,7 +276,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); + Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd")); executeTest("test MultiSample Pilot2 indels with 
alleles passed in and emitting all sites", spec2); } @@ -281,7 +286,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); + Arrays.asList("f93f8a35b47bcf96594ada55e2312c73")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -290,7 +295,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("ad884e511a751b05e64db5314314365a")); + Arrays.asList("9be28cb208d8b0314d2bc2696e2fd8d4")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java deleted file mode 100644 index 2e4556af0..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class MergeMNPsIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String VCF, int maxDistMNP) { - return "-T 
MergeMNPs" + - " -R " + reference + - " --variant:vcf " + validationDataLocation + VCF + - " --maxGenomicDistanceForMNP " + maxDistMNP + - " -o %s" + - " -NO_HEADER"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) - + " -L chr20:556259-756570", - 1, - Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4")); - executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) - + " -L chr20:556259-756570", - 1, - Arrays.asList("53dd312468296826bdd3c22387390c88")); - executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec); - } - - @Test - public void test3() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) - + " -L chr20:556259-756570", - 1, - Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee")); - executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec); - } - - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java deleted file mode 100644 index db1e4a82f..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String VCF, int maxDist) { - return "-T 
MergeSegregatingAlternateAlleles" + - " -R " + reference + - " --variant:vcf " + validationDataLocation + VCF + - " --maxGenomicDistance " + maxDist + - " -o %s" + - " -NO_HEADER"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) - + " -L chr20:556259-756570", - 1, - Arrays.asList("af5e1370822551c0c6f50f23447dc627")); - executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) - + " -L chr20:556259-756570", - 1, - Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb")); - executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec); - } - - @Test - public void test3() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) - + " -L chr20:556259-756570", - 1, - Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5")); - executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec); - } - - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java deleted file mode 100755 index 7607049db..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to 
whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; - -/** - * Test routine for new VariantContext object - */ -@Reference(window=@Window(start=-20,stop=1)) -public class TestVariantContextWalker extends RodWalker { - @Output - PrintStream out; - - 
@ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Argument(fullName="takeFirstOnly", doc="Only take the first second at a locus, as opposed to all", required=false) - boolean takeFirstOnly = false; - - @Argument(fullName="onlyContextsOfType", doc="Only take variant contexts of this type", required=false) - VariantContext.Type onlyOfThisType = null; - - @Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false) - boolean onlyContextsStartinAtCurrentPosition = false; - - @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) - boolean printContexts = false; - - @Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false) - VCFWriter writer = null; - - private boolean wroteHeader = false; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref == null ) - return 0; - else { - EnumSet allowedTypes = onlyOfThisType == null ? null : EnumSet.of(onlyOfThisType); - - int n = 0; - List contexts; - if ( onlyContextsStartinAtCurrentPosition ) - contexts = tracker.getValues(variantCollection.variants, context.getLocation()); - else // ! onlyContextsStartinAtCurrentPosition - contexts = tracker.getValues(variantCollection.variants); - - for ( VariantContext vc : contexts ) { - if ( allowedTypes == null || allowedTypes.contains(vc.getType()) ) { - // we need to trigger decoding of the genotype string to pass integration tests - vc.getGenotypes(); - - if ( writer != null && n == 0 ) { - if ( ! 
wroteHeader ) { - writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc)); - wroteHeader = true; - } - - writer.add(vc); - } - - n++; - if ( printContexts ) out.printf(" %s%n", vc); - if ( takeFirstOnly ) break; - } - } - - if ( n > 0 && printContexts ) { - out.printf("%s => had %d variant context objects%n", context.getLocation(), n); - out.printf("---------------------------------------------%n"); - } - - return n; - } - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer point, Integer sum) { - return point + sum; - } - - @Override - public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? - out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3dceb9bd2..403ecce78 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, - Arrays.asList("d9dcb352c53106f54fcc981f15d35a90") + Arrays.asList("abe943d1aac120d7e75b9b9e5dac2399") ); 
executeTest("testFunctionClassWithSnpeff", spec); } @@ -50,7 +50,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + Arrays.asList("5fd9624c7a35ffb79d0feb1e233fc757") ); executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); } @@ -70,7 +70,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("fb926edfd3d811e18b33798a43ef4379") + Arrays.asList("4a8765cd02d36e63f6d0f0c10a6c674b") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -91,7 +91,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("26b7d57e3a204ac80a28cb29485b59b7") + Arrays.asList("4106ab8f742ad1c3138c29220151503c") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -113,7 +113,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("1df8184062f330bea9da8bacacc5a09d") + Arrays.asList("6cee3a8d68307a118944f2df5401ac89") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -134,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("927f26414509db9e7c0a2c067d57c949") + Arrays.asList("af5dd27354d5dfd0d2fe03149af09b55") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -155,7 +155,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e6fddefd95122cabc5a0f0b95bce6d34") + Arrays.asList("062a231e203671e19aa9c6507710d762") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -176,7 +176,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("df10486dae73a9cf8c647964f51ba3e0") + Arrays.asList("75abdd2b17c0a5e04814b6969a3d4d7e") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", 
spec); } @@ -197,7 +197,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("524adb0b7ff70e227b8803a88f36713e") + Arrays.asList("bdbb5f8230a4a193058750c5e506c733") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -220,7 +220,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("ef6449789dfc032602458b7c5538a1bc") + Arrays.asList("f076120da22930294840fcc396f5f141") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -245,7 +245,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("13b90e94fa82d72bb04a0a5addb27c3f") + Arrays.asList("69201f4a2a7a44b38805a4aeeb8830b6") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -264,7 +264,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("8458b9d7803d75aae551fac7dbd152d6") + Arrays.asList("c3bd3cb6cfb21a8c2b4d5f69104bf6c2") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -277,7 +277,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("b954dee127ec4205ed7d33c91aa3e045")); + 1, Arrays.asList("861f94e3237d62bd5bc00757319241f7")); executeTestParallel("testSelect1", spec); } @@ -294,7 +294,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf 
--comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ae0027197547731a9a5c1eec5fbe0221")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("955c33365e017679047fabec0f14d5e0")); executeTestParallel("testCompVsEvalAC",spec); } @@ -312,7 +312,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fb7d989e44bd74c5376cb5732f9f3f64")); executeTestParallel("testCompOverlap",spec); } @@ -324,7 +324,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("835b44fc3004cc975c968c9f92ed25d6")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("da5bcb305c5ef207ce175821efdbdefd")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -336,7 +336,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new 
WalkerTestSpec(extraArgs,1,Arrays.asList("f0e003f1293343c3210ae95e8936b19a")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fde839ece1442388f21a2f0b936756a8")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -353,13 +353,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("1efae6b3b88c752b771e0c8fae24464e")); executeTestParallel("testMultipleCompTracks",spec); } @Test - public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "7425ca5c439afd7bb33ed5cfea02c2b3"; + public void testPerSampleAndSubsettedSampleHaveSameResults1() { + String md5 = "bc9bcabc3105e2515d9a2d41506d2de1"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( @@ -414,7 +414,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("924b6123edb9da540d0abc66f6f33e16") + Arrays.asList("e53546243250634fc03e83b4e61ec55f") ); executeTest("testAlleleCountStrat", spec); } @@ -435,7 +435,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("9794e2dba205c6929dc89899fdf0bf6b") + Arrays.asList("c8086f0525bc13e666afeb670c2e13ae") ); executeTest("testIntervalStrat", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 00044f859..3a25bc5c1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " 
-EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("d46a735ffa898f4aa6b3758c5b03f06d") + Arrays.asList("1f7ed8c0f671dd227ab764624ef0d64c") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index f1f849bf5..e9f138a0e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigIndexValid() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } @@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigInfoKnownContig() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference } @@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop 
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } + + private static class FlankingGenomeLocTestData extends TestDataProvider { + final GenomeLocParser parser; + final int basePairs; + final GenomeLoc original, flankStart, flankStop; + + private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) { + super(FlankingGenomeLocTestData.class, name); + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.flankStart = flankStart == null ? null : parse(parser, flankStart); + this.flankStop = flankStop == null ? null : parse(parser, flankStop); + } + + private static GenomeLoc parse(GenomeLocParser parser, String str) { + return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); + } + } + + @DataProvider(name = "flankingGenomeLocs") + public Object[][] getFlankingGenomeLocs() { + int contigLength = 10000; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength); + GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary()); + + new FlankingGenomeLocTestData("atStartBase1", parser, 1, + "chr1:1", null, "chr1:2"); + + new FlankingGenomeLocTestData("atStartBase50", parser, 50, + "chr1:1", null, "chr1:2-51"); + + new FlankingGenomeLocTestData("atStartRange50", parser, 50, + "chr1:1-10", null, "chr1:11-60"); + + new FlankingGenomeLocTestData("atEndBase1", parser, 1, + "chr1:" + contigLength, "chr1:" + (contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndBase50", parser, 50, + "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 10, contigLength), + String.format("chr1:%d-%d", contigLength - 60, contigLength - 11), + null); + + new FlankingGenomeLocTestData("nearStartBase1", parser, 
1, + "chr1:2", "chr1:1", "chr1:3"); + + new FlankingGenomeLocTestData("nearStartRange50", parser, 50, + "chr1:21-30", "chr1:1-20", "chr1:31-80"); + + new FlankingGenomeLocTestData("nearEndBase1", parser, 1, + "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength); + + new FlankingGenomeLocTestData("nearEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 30, contigLength - 21), + String.format("chr1:%d-%d", contigLength - 80, contigLength - 31), + String.format("chr1:%d-%d", contigLength - 20, contigLength)); + + new FlankingGenomeLocTestData("beyondStartBase1", parser, 1, + "chr1:3", "chr1:2", "chr1:4"); + + new FlankingGenomeLocTestData("beyondStartRange50", parser, 50, + "chr1:101-200", "chr1:51-100", "chr1:201-250"); + + new FlankingGenomeLocTestData("beyondEndBase1", parser, 1, + "chr1:" + (contigLength - 3), + "chr1:" + (contigLength - 4), + "chr1:" + (contigLength - 2)); + + new FlankingGenomeLocTestData("beyondEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 200, contigLength - 101), + String.format("chr1:%d-%d", contigLength - 250, contigLength - 201), + String.format("chr1:%d-%d", contigLength - 100, contigLength - 51)); + + new FlankingGenomeLocTestData("unmapped", parser, 50, + "unmapped", null, null); + + new FlankingGenomeLocTestData("fullContig", parser, 50, + "chr1", null, null); + + return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStart); + assertEquals(actual, data.flankStart, description); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void 
testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStop); + assertEquals(actual, data.flankStop, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 46134cd24..630beaece 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -16,7 +16,8 @@ public class ReadUtilsUnitTest extends BaseTest { GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets @BeforeTest public void init() { @@ -29,7 +30,7 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); } private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 3f5d05e66..7a2696b7b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest { double t6 = t.getElapsedTime(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - - t.stop().start(); - Assert.assertTrue(t.isRunning(), "second started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset"); - Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time } private final static void idleLoop() { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index f625af23c..ecb5a6d33 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -30,8 +30,10 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import org.testng.annotations.*; + +import java.util.LinkedList; +import java.util.List; /** * Created by IntelliJ IDEA. 
@@ -44,180 +46,214 @@ public class ReadClipperUnitTest extends BaseTest { // TODO: Add error messages on failed tests + //int debug = 0; + GATKSAMRecord read, expected; ReadClipper readClipper; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 - @BeforeClass + + public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) { + Assert.assertEquals(read.getReadBases(), readBases); + Assert.assertEquals(read.getBaseQualityString(), baseQuals); + Assert.assertEquals(read.getCigarString(), cigar); + } + + public class testParameter { + int inputStart; + int inputStop; + int substringStart; + int substringStop; + String cigar; + + public testParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) { + inputStart = InputStart; + inputStop = InputStop; + substringStart = SubstringStart; + substringStop = SubstringStop; + cigar = Cigar; + } + } + + // What the test read looks like + // Ref: 1 2 3 4 5 6 7 8 + // Read: 0 1 2 3 - - - - + // ----------------------------- + // Bases: A C T G - - - - + // Quals: ! + 5 ? 
- - - - + + @BeforeMethod public void init() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); read.setBaseQualityString(new String(QUALS)); readClipper = new ReadClipper(read); + //logger.warn(read.getCigarString()); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipBothEndsByReferenceCoordinates() { - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + //int debug = 1; //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader())); + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader())); + //clip 1 base - expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4); Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); Assert.assertEquals(expected.getCigarString(), "1H2M1H"); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReadCoordinates(0,0); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(0,0,1,4,"1H3M"));//clip 1 base 
at start + testList.add(new testParameter(3,3,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(0,1,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(2,3,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReadCoordinates(3,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReadCoordinates(0,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReadCoordinates(2,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinates() { logger.warn("Executing testHardClipByReferenceCoordinates"); - + //logger.warn(debug); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinates(-1,1); - Assert.assertEquals(expected.getReadBases(), 
BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(-1,1,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(4,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(-1,2,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(3,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinates(3,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinates(-1,2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinates(2,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinates(p.inputStart,p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesLeftTail() { + init(); logger.warn("Executing 
testHardClipByReferenceCoordinatesLeftTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start + testList.add(new testParameter(2, -1, 2, 4, "2H2M"));//clip 2 bases at start - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesRightTail() { + init(); logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader())); - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), 
"3M1H"); + List testList = new LinkedList(); + testList.add(new testParameter(-1, 4, 0, 3, "3M1H"));//clip 1 base at end + testList.add(new testParameter(-1, 3, 0, 2, "2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) // TODO This function is returning null reads public void testHardClipLowQualEnds() { - logger.warn("Executing testHardClipByReferenceCoordinates"); + logger.warn("Executing testHardClipByReferenceCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 2 bases at start - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + List testList = new LinkedList(); + testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start + testList.add(new 
testParameter(11,-1,2,4,"2H2M"));//clip 2 bases at start + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + /* todo find a better way to test lowqual tail clipping on both sides // Reverse Quals sequence readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - //clip 1 base at end - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); + testList = new LinkedList(); + testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - // revert Qual sequence - readClipper.getRead().setBaseQualityString(QUALS); + for ( testParameter p : testList ) { + init(); + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + */ } -} + + public class CigarReadMaker { + + } + + @Test ( enabled = false ) + public void testHardClipSoftClippedBases() { + + // 
Generate a list of cigars to test + // We will use testParameter in the following way + // Right tail, left tail, + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 35c6a4993..96a33b738 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -2,9 +2,7 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broad.tribble.Tribble; import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; @@ -120,24 +118,23 @@ public class VCFWriterUnitTest extends BaseTest { GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1",1); List alleles = new ArrayList(); Set filters = null; - Map attributes = new HashMap(); - Map genotypes = new HashMap(); + Map attributes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size()); alleles.add(Allele.create("-",true)); alleles.add(Allele.create("CC",false)); attributes.put("DP","50"); for (String name : header.getGenotypeSamples()) { - Map gtattributes = new HashMap(); + Map gtattributes = new HashMap(); gtattributes.put("BB","1"); Genotype gt = new Genotype(name,alleles.subList(1,2),0,null,gtattributes,true); - genotypes.put(name,gt); + genotypes.add(gt); } - return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes, (byte)'A'); - - 
+ return new VariantContextBuilder("RANDOM", loc.getContig(), loc.getStart(), loc.getStop(), alleles) + .genotypes(genotypes).attributes(attributes).referenceBaseForIndel((byte)'A').make(); } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 9c3b905c2..a9035ffd9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,9 +1,12 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.testng.Assert; @@ -762,4 +765,235 @@ public class IntervalUtilsUnitTest extends BaseTest { List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. 
+ */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. 
*/ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + 
String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, 
hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. */ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: 
data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) + public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testExceptionUponLegacyIntervalSyntax() throws 
Exception { + GenomeAnalysisEngine toolkit = new GenomeAnalysisEngine(); + toolkit.setGenomeLocParser(new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)))); + + // Attempting to use the legacy -L "interval1;interval2" syntax should produce an exception: + IntervalBinding binding = new IntervalBinding("1;2"); + List intervals = binding.getIntervals(toolkit); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index f3d0dedcd..a66c78f3c 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -96,26 +96,38 @@ public class GenotypeLikelihoodsUnitTest { } @Test - public void testGetNegLog10GQ(){ + public void testGetLog10GQ(){ GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); //GQ for the best guess genotype - Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),3.9); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),-3.9); double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); //GQ for the other genotypes - Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); - Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); //Test missing likelihoods gl = new GenotypeLikelihoods("."); - Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); - 
Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); - Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); } + @Test + public void testgetQualFromLikelihoods(){ + double[] likelihoods = new double[]{-1, 0, -2}; + // qual values we expect for each possible "best" genotype + double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294}; + + for ( int i = 0; i < likelihoods.length; i++ ) { + Assert.assertEquals(GenotypeLikelihoods.getQualFromLikelihoods(i, likelihoods), expectedQuals[i], 1e-6, + "GQ value for genotype " + i + " was not calculated correctly"); + } + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java index c4f1efd04..e0a037105 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java @@ -71,8 +71,8 @@ public class GenotypeUnitTest extends BaseTest { // public boolean sameGenotype(Genotype other) // public boolean sameGenotype(Genotype other, boolean ignorePhase) // public String getSampleName() -// public boolean hasNegLog10PError() -// public double getNegLog10PError() +// public boolean hasLog10PError() +// public double getLog10PError() // public double getPhredScaledQual() // public boolean hasAttribute(String key) // public Object getAttribute(String key) diff --git 
a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java new file mode 100644 index 000000000..ee0a5dfe0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.variantcontext; + + +// the imports for unit testing. 
+ + +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class GenotypesContextUnitTest extends BaseTest { + Allele Aref, C, T; + Genotype AA, AT, TT, AC, CT, CC, MISSING; + List allGenotypes; + + @BeforeSuite + public void before() { + C = Allele.create("C"); + Aref = Allele.create("A", true); + T = Allele.create("T"); + AA = new Genotype("AA", Arrays.asList(Aref, Aref)); + AT = new Genotype("AT", Arrays.asList(Aref, T)); + TT = new Genotype("TT", Arrays.asList(T, T)); + AC = new Genotype("AC", Arrays.asList(Aref, C)); + CT = new Genotype("CT", Arrays.asList(C, T)); + CC = new Genotype("CC", Arrays.asList(C, C)); + MISSING = new Genotype("MISSING", Arrays.asList(C, C)); + + allGenotypes = Arrays.asList(AA, AT, TT, AC, CT, CC); + } + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private interface ContextMaker { + public GenotypesContext make(List initialSamples); + } + + private ContextMaker baseMaker = new ContextMaker() { + @Override + public GenotypesContext make(final List initialSamples) { + return GenotypesContext.copy(initialSamples); + } + + @Override + public String toString() { + return "GenotypesContext"; + } + }; + + private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker { + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + GenotypesContext gc = GenotypesContext.copy((List)data); + gc.ensureSampleNameMap(); + gc.ensureSampleOrdering(); + return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, 
gc.sampleNamesInOrder, gc.sampleNameToOffset); + } + + @Override + public GenotypesContext make(final List initialSamples) { + return new LazyGenotypesContext(this, initialSamples, initialSamples.size()); + } + + @Override + public String toString() { + return "LazyGenotypesContext"; + } + } + + private Collection allMakers = Arrays.asList(baseMaker, new lazyMaker()); + + private class GenotypesContextProvider extends TestDataProvider { + ContextMaker maker; + final List initialSamples; + + private GenotypesContextProvider(ContextMaker maker, List initialSamples) { + super(GenotypesContextProvider.class, String.format("%s with %d samples", maker.toString(), initialSamples.size())); + this.maker = maker; + this.initialSamples = initialSamples; + } + + public GenotypesContext makeContext() { + return maker.make(initialSamples); + } + } + + @DataProvider(name = "GenotypesContextProvider") + public Object[][] MakeSampleNamesTest() { + for ( ContextMaker maker : allMakers ) { + for ( int i = 0; i < allGenotypes.size(); i++ ) { + List samples = allGenotypes.subList(0, i); + // sorted + new GenotypesContextProvider(maker, samples); + // unsorted + new GenotypesContextProvider(maker, Utils.reverse(samples)); + } + } + + return GenotypesContextProvider.getTests(GenotypesContextProvider.class); + } + + private final static void testIterable(Iterable genotypeIterable, Set expectedNames) { + int count = 0; + for ( final Genotype g : genotypeIterable ) { + Assert.assertTrue(expectedNames.contains(g.getSampleName())); + count++; + } + Assert.assertEquals(count, expectedNames.size(), "Iterable returned unexpected number of genotypes"); + } + + @Test(dataProvider = "GenotypesContextProvider") + public void testInitialSamplesAreAsExpected(GenotypesContextProvider cfg) { + testGenotypesContextContainsExpectedSamples(cfg.makeContext(), cfg.initialSamples); + } + + private final void testGenotypesContextContainsExpectedSamples(GenotypesContext gc, List expectedSamples) { + 
Assert.assertEquals(gc.isEmpty(), expectedSamples.isEmpty()); + Assert.assertEquals(gc.size(), expectedSamples.size()); + + // get(index) is doing the right thing + for ( int i = 0; i < expectedSamples.size(); i++ ) { + Assert.assertEquals(gc.get(i), expectedSamples.get(i)); + } + Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); + + // we can fetch samples by name + final Set genotypeNames = VariantContextUtils.genotypeNames(expectedSamples); + for ( final String name : genotypeNames ) { + Assert.assertTrue(gc.containsSample(name)); + } + Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); + + // all of the iterators are working + testIterable(gc.iterateInSampleNameOrder(), genotypeNames); + testIterable(gc, genotypeNames); + testIterable(gc.iterateInSampleNameOrder(genotypeNames), genotypeNames); + if ( ! genotypeNames.isEmpty() ) { + Set first = Collections.singleton(genotypeNames.iterator().next()); + testIterable(gc.iterateInSampleNameOrder(first), first); + } + + // misc. 
utils are working as expected + Assert.assertEquals(gc.getSampleNames(), genotypeNames); + Assert.assertTrue(ParsingUtils.isSorted(gc.getSampleNamesOrderedByName())); + Assert.assertTrue(ParsingUtils.isSorted(gc.iterateInSampleNameOrder())); + Assert.assertTrue(gc.containsSamples(genotypeNames)); + + final Set withMissing = new HashSet(Arrays.asList(MISSING.getSampleName())); + withMissing.addAll(genotypeNames); + Assert.assertFalse(gc.containsSamples(withMissing)); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testImmutable(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + Assert.assertEquals(gc.isMutable(), true); + gc.immutable(); + Assert.assertEquals(gc.isMutable(), false); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class ) + public void testImmutableCall1(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + gc.immutable(); + gc.add(MISSING); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testClear(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + gc.clear(); + testGenotypesContextContainsExpectedSamples(gc, Collections.emptyList()); + } + + private static final List with(List genotypes, Genotype ... add) { + List l = new ArrayList(genotypes); + l.addAll(Arrays.asList(add)); + return l; + } + + private static final List without(List genotypes, Genotype ... 
remove) { + List l = new ArrayList(genotypes); + l.removeAll(Arrays.asList(remove)); + return l; + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testAdds(GenotypesContextProvider cfg) { + Genotype add1 = new Genotype("add1", Arrays.asList(Aref, Aref)); + Genotype add2 = new Genotype("add2", Arrays.asList(Aref, Aref)); + + GenotypesContext gc = cfg.makeContext(); + gc.add(add1); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1)); + + gc = cfg.makeContext(); + gc.add(add1); + gc.add(add2); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); + + gc = cfg.makeContext(); + gc.addAll(Arrays.asList(add1, add2)); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testRemoves(GenotypesContextProvider cfg) { + Genotype rm1 = AA; + Genotype rm2 = AC; + + GenotypesContext gc = cfg.makeContext(); + if (gc.size() > 1) { + Genotype rm = gc.get(0); + gc.remove(rm); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm)); + } + + gc = cfg.makeContext(); + gc.remove(rm1); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1)); + + gc = cfg.makeContext(); + gc.remove(rm1); + gc.remove(rm2); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); + + gc = cfg.makeContext(); + gc.removeAll(Arrays.asList(rm1, rm2)); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); + + gc = cfg.makeContext(); + HashSet expected = new HashSet(); + if ( gc.contains(rm1) ) expected.add(rm1); + if ( gc.contains(rm2) ) expected.add(rm2); + gc.retainAll(Arrays.asList(rm1, rm2)); + + // ensure that the two lists are the same + Assert.assertEquals(new HashSet(gc.getGenotypes()), expected); + // because the list order can change, we use the gc's list itself 
+ testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes()); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testSet(GenotypesContextProvider cfg) { + Genotype set = new Genotype("replace", Arrays.asList(Aref, Aref)); + int n = cfg.makeContext().size(); + for ( int i = 0; i < n; i++ ) { + GenotypesContext gc = cfg.makeContext(); + Genotype setted = gc.set(i, set); + Assert.assertNotNull(setted); + ArrayList l = new ArrayList(cfg.initialSamples); + l.set(i, set); + testGenotypesContextContainsExpectedSamples(gc, l); + } + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testReplace(GenotypesContextProvider cfg) { + int n = cfg.makeContext().size(); + for ( int i = 0; i < n; i++ ) { + GenotypesContext gc = cfg.makeContext(); + Genotype toReplace = gc.get(i); + Genotype replacement = new Genotype(toReplace.getSampleName(), Arrays.asList(Aref, Aref)); + gc.replace(replacement); + ArrayList l = new ArrayList(cfg.initialSamples); + l.set(i, replacement); + Assert.assertEquals(replacement, gc.get(i)); + testGenotypesContextContainsExpectedSamples(gc, l); + } + } + + // subset to samples tested in VariantContextUnitTest +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java new file mode 100644 index 000000000..a71949369 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell 
+ * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import com.google.caliper.runner.CaliperMain; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.*; +import java.util.*; + +/** + * Caliper microbenchmark of parsing a VCF file + */ +public class VariantContextBenchmark extends SimpleBenchmark { + @Param({"/Users/depristo/Desktop/broadLocal/localData/ALL.chr20.merged_beagle_mach.20101123.snps_indels_svs.genotypes.vcf"}) + String vcfFile; + + @Param({"1000"}) + int linesToRead; // set automatically by framework + + @Param({"100"}) + int nSamplesToTake; // set automatically by framework + + @Param({"10"}) + int dupsToMerge; // set automatically by framework + + @Param + Operation operation; // set automatically by framework + + private String 
INPUT_STRING; + + public enum Operation { + READ, + SUBSET_TO_SAMPLES, + GET_TYPE, + GET_ID, + GET_GENOTYPES, + GET_ATTRIBUTE_STRING, + GET_ATTRIBUTE_INT, + GET_N_SAMPLES, + GET_GENOTYPES_FOR_SAMPLES, + GET_GENOTYPES_IN_ORDER_OF_NAME, + CALC_GENOTYPE_COUNTS, + MERGE + } + + private GenomeLocParser b37GenomeLocParser; + + @Override protected void setUp() { + try { + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference)); + b37GenomeLocParser = new GenomeLocParser(seq); + } catch ( FileNotFoundException e) { + throw new RuntimeException(e); + } + + // read it into a String so that we don't try to benchmark IO issues + try { + FileInputStream s = new FileInputStream(new File(vcfFile)); + AsciiLineReader lineReader = new AsciiLineReader(s); + int counter = 0; + StringBuffer sb = new StringBuffer(); + while (counter++ < linesToRead ) { + String line = lineReader.readLine(); + if ( line == null ) + break; + sb.append(line + "\n"); + } + s.close(); + INPUT_STRING = sb.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private interface FunctionToBenchmark { + public void run(T vc); + } + + private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { + try { + InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes()); + AsciiLineReader lineReader = new AsciiLineReader(is); + codec.readHeader(lineReader); + + int counter = 0; + while (counter++ < linesToRead ) { + String line = lineReader.readLine(); + if ( line == null ) + break; + + T vc = codec.decode(line); + func.run(vc); + } + } catch (Exception e) { + System.out.println("Benchmarking run failure because of " + e.getMessage()); + } + } + + public void timeV14(int rep) { + for ( int i = 0; i < rep; i++ ) { + FunctionToBenchmark func = getV14FunctionToBenchmark(); + FeatureCodec codec = new VCFCodec(); + runBenchmark(codec, func); + } + } + + public FunctionToBenchmark getV14FunctionToBenchmark() { + switch ( operation ) 
{ + case READ: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // empty operation + } + }; + case SUBSET_TO_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + VariantContext sub = vc.subContextFromSamples(samples); + sub.getNSamples(); + } + }; + case GET_TYPE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getType(); + } + }; + case GET_ID: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getID(); + } + }; + case GET_GENOTYPES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getGenotypes().size(); + } + }; + + case GET_GENOTYPES_FOR_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + vc.getGenotypes(samples).size(); + } + }; + + case GET_ATTRIBUTE_STRING: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttribute("AN", null); + } + }; + + case GET_ATTRIBUTE_INT: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttributeAsInt("AC", 0); + } + }; + + case GET_N_SAMPLES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getNSamples(); + } + }; + + case GET_GENOTYPES_IN_ORDER_OF_NAME: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // TODO - TEST IS BROKEN +// int n = 0; +// for ( final Genotype g: vc.getGenotypesOrderedByName() ) n++; + } + }; + + case CALC_GENOTYPE_COUNTS: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getHetCount(); + } + }; + + case MERGE: + return new 
FunctionToBenchmark() { + public void run(final VariantContext vc) { + List toMerge = new ArrayList(); + + for ( int i = 0; i < dupsToMerge; i++ ) { + GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + gc.add(new Genotype(g.getSampleName()+"_"+i, g)); + } + toMerge.add(new VariantContextBuilder(vc).genotypes(gc).make()); + } + + VariantContextUtils.simpleMerge(b37GenomeLocParser, toMerge, null, + VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + VariantContextUtils.GenotypeMergeType.UNSORTED, + true, false, "set", false, true); + } + }; + + default: throw new IllegalArgumentException("Unexpected operation " + operation); + } + } + + // -------------------------------------------------------------------------------- + // + // V13 + // + // In order to use this, you must move the v13 version from archive and uncomment + // + // git mv private/archive/java/src/org/broadinstitute/sting/utils/variantcontext/v13 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13 + // + // -------------------------------------------------------------------------------- + +// public void timeV13(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// FunctionToBenchmark func = getV13FunctionToBenchmark(); +// FeatureCodec codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec(); +// runBenchmark(codec, func); +// } +// } +// +// public FunctionToBenchmark getV13FunctionToBenchmark() { +// switch ( operation ) { +// case READ: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // empty operation +// } +// }; +// case SUBSET_TO_SAMPLES: +// return new FunctionToBenchmark() { +// List samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); 
+// org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); +// sub.getNSamples(); +// } +// }; +// +// case GET_TYPE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getType(); +// } +// }; +// case GET_ID: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getID(); +// } +// }; +// case GET_GENOTYPES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getGenotypes().size(); +// } +// }; +// +// case GET_GENOTYPES_FOR_SAMPLES: +// return new FunctionToBenchmark() { +// Set samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); +// vc.getGenotypes(samples).size(); +// } +// }; +// +// case GET_ATTRIBUTE_STRING: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttribute("AN", null); +// } +// }; +// +// case GET_ATTRIBUTE_INT: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttributeAsInt("AC", 0); +// } +// }; +// +// case GET_N_SAMPLES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getNSamples(); +// } +// }; +// +// case GET_GENOTYPES_IN_ORDER_OF_NAME: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // TODO - TEST IS BROKEN +// //vc.getGenotypesOrderedByName(); +// } 
+// }; +// +// case CALC_GENOTYPE_COUNTS: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getHetCount(); +// } +// }; +// +// case MERGE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// List toMerge = new ArrayList(); +// +// for ( int i = 0; i < dupsToMerge; i++ ) { +// Map gc = new HashMap(); +// for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { +// String name = g.getSampleName()+"_"+i; +// gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name, +// g.getAlleles(), g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); +// toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); +// } +// } +// +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, +// toMerge, null, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, +// true, false, "set", false, true); +// } +// }; +// +// default: throw new IllegalArgumentException("Unexpected operation " + operation); +// } +// } + + public static void main(String[] args) { + CaliperMain.main(VariantContextBenchmark.class, args); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java deleted file mode 100755 index 67fe7d012..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java +++ /dev/null @@ -1,65 +0,0 @@ - - -package 
org.broadinstitute.sting.utils.variantcontext; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.HashMap; -import java.util.Map; -import java.util.Arrays; - -public class VariantContextIntegrationTest extends WalkerTest { - private static String cmdRoot = "-T TestVariantContext" + - " -R " + b36KGReference; - - private static String root = cmdRoot + - " -L 1:1-1,000,000 -V " + b36dbSNP129; - - private static final class VCITTest extends TestDataProvider { - String args, md5; - - private VCITTest(final String args, final String md5) { - super(VCITTest.class); - this.args = args; - this.md5 = md5; - } - } - - @DataProvider(name = "VCITTestData") - public Object[][] createVCITTestData() { - new VCITTest("--printPerLocus", "e9d0f1fe80659bb55b40aa6c3a2e921e"); - new VCITTest("--printPerLocus --onlyContextsOfType SNP", "0e620db3e45771df42c54a9c0ae4a29f"); - new VCITTest("--printPerLocus --onlyContextsOfType INDEL", "b725c204fefe3814644d50e7c20f9dfe"); - new VCITTest("--printPerLocus --onlyContextsOfType MIXED", "3ccc33f496a1718df55722d11cc14334"); - new VCITTest("--printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); - new VCITTest("--printPerLocus --takeFirstOnly", "3a45561da042b2b44b6a679744f16103"); - new VCITTest("--printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "4746f269ecc377103f83eb61cc162c39"); - new VCITTest("--printPerLocus --onlyContextsStartinAtCurrentPosition", "2749e3fae458650a85a2317e346dc44c"); - new VCITTest("--printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "9bd48c2a40813023e29ffaa23d59d382"); - - return VCITTest.getTests(VCITTest.class); - } - - @Test(dataProvider = "VCITTestData") - public void testConversionSelection(VCITTest test) { - String extraArgs = test.args; - String md5 = test.md5; - - WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " 
-o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testSelectors", spec); - } - - @Test - public void testToVCF() { - // this really just tests that we are seeing the same number of objects over all of chr1 - - WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -NO_HEADER -V:VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", - 2, // just one output file - Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "ff91731213fd0bbdc200ab6fd1c93e63")); - executeTest("testToVCF", spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index a4d78b637..fca7440e4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -6,15 +6,16 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.testng.annotations.BeforeSuite; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.lang.reflect.Array; +import java.util.*; public class VariantContextUnitTest extends BaseTest { @@ -41,6 +42,8 @@ public class VariantContextUnitTest extends BaseTest { int mixedLocStart = 20; int mixedLocStop = 23; + VariantContextBuilder basicBuilder, snpBuilder, insBuilder; + @BeforeSuite public void before() { del = Allele.create("-"); @@ -56,6 +59,13 @@ public class VariantContextUnitTest extends BaseTest { ATCref = 
Allele.create("ATC", true); } + @BeforeMethod + public void beforeTest() { + basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); + snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); + insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC)).referenceBaseForIndel((byte)'A'); + } + @Test public void testDetermineTypes() { Allele ACref = Allele.create("AC", true); @@ -70,68 +80,68 @@ public class VariantContextUnitTest extends BaseTest { // test REF List alleles = Arrays.asList(Tref); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION); // test SNPs alleles = Arrays.asList(Tref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); alleles = Arrays.asList(Tref, A, C); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); // test MNPs alleles = Arrays.asList(ACref, TA); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); alleles = Arrays.asList(ATCref, CAT, Allele.create("GGG")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = new 
VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); // test MIXED alleles = Arrays.asList(TAref, T, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(TAref, T, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, 
InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(ACref, ATC, AT); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -139,8 +149,8 @@ public class VariantContextUnitTest extends BaseTest { public void testMultipleSNPAlleleOrdering() { final List allelesNaturalOrder = Arrays.asList(Aref, C, T); final List allelesUnnaturalOrder = Arrays.asList(Aref, T, C); - VariantContext naturalVC = new VariantContext("natural", snpLoc, snpLocStart, snpLocStop, allelesNaturalOrder); - VariantContext unnaturalVC = new VariantContext("unnatural", snpLoc, snpLocStart, snpLocStop, allelesUnnaturalOrder); + VariantContext naturalVC = snpBuilder.alleles(allelesNaturalOrder).make(); + VariantContext unnaturalVC = snpBuilder.alleles(allelesUnnaturalOrder).make(); Assert.assertEquals(new ArrayList(naturalVC.getAlleles()), allelesNaturalOrder); Assert.assertEquals(new ArrayList(unnaturalVC.getAlleles()), allelesUnnaturalOrder); } @@ -149,7 +159,7 @@ public 
class VariantContextUnitTest extends BaseTest { public void testCreatingSNPVariantContext() { List alleles = Arrays.asList(Aref, T); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + VariantContext vc = snpBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), snpLoc); Assert.assertEquals(vc.getStart(), snpLocStart); @@ -175,8 +185,8 @@ public class VariantContextUnitTest extends BaseTest { @Test public void testCreatingRefVariantContext() { - List alleles = Arrays.asList(Aref); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + List alleles = Arrays.asList(Aref); + VariantContext vc = snpBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), snpLoc); Assert.assertEquals(vc.getStart(), snpLocStart); @@ -202,7 +212,7 @@ public class VariantContextUnitTest extends BaseTest { @Test public void testCreatingDeletionVariantContext() { List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).referenceBaseForIndel((byte)'A').make(); Assert.assertEquals(vc.getChr(), delLoc); Assert.assertEquals(vc.getStart(), delLocStart); @@ -229,7 +239,7 @@ public class VariantContextUnitTest extends BaseTest { @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + VariantContext vc = insBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), insLoc); Assert.assertEquals(vc.getStart(), insLocStart); @@ -255,18 +265,18 @@ public class VariantContextUnitTest extends BaseTest { @Test public void 
testCreatingPartiallyCalledGenotype() { List alleles = Arrays.asList(Aref, C); - Genotype g = new Genotype("foo", Arrays.asList(C, Allele.NO_CALL), 10); - VariantContext vc = new VariantContext("test", snpLoc, snpLocStart, snpLocStop, alleles, Arrays.asList(g)); + Genotype g = new Genotype("foo", Arrays.asList(C, Allele.NO_CALL)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g).make(); Assert.assertTrue(vc.isSNP()); Assert.assertEquals(vc.getNAlleles(), 2); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); - Assert.assertTrue(vc.isPolymorphic()); + Assert.assertFalse(vc.isMonomorphicInSamples()); + Assert.assertTrue(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getGenotype("foo"), g); - Assert.assertEquals(vc.getChromosomeCount(), 2); // we know that there are 2 chromosomes, even though one isn't called - Assert.assertEquals(vc.getChromosomeCount(Aref), 0); - Assert.assertEquals(vc.getChromosomeCount(C), 1); + Assert.assertEquals(vc.getCalledChrCount(), 1); // we only have 1 called chromosomes, we exclude the NO_CALL one isn't called + Assert.assertEquals(vc.getCalledChrCount(Aref), 0); + Assert.assertEquals(vc.getCalledChrCount(C), 1); Assert.assertFalse(vc.getGenotype("foo").isHet()); Assert.assertFalse(vc.getGenotype("foo").isHom()); Assert.assertFalse(vc.getGenotype("foo").isNoCall()); @@ -275,55 +285,71 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertEquals(vc.getGenotype("foo").getType(), Genotype.Type.MIXED); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgs1() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test 
(expectedExceptions = Exception.class) public void testBadConstructorArgs2() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgs3() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Throwable.class) public void testBadConstructorArgs4() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Collections.emptyList()); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.emptyList()).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgsDuplicateAlleles1() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make(); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = Exception.class) public void testBadConstructorArgsDuplicateAlleles2() { - new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)); + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make(); } - @Test (expectedExceptions = IllegalStateException.class) + @Test (expectedExceptions = Throwable.class) public void testBadLoc1() { List alleles = Arrays.asList(Aref, T, del); - new VariantContext("test", delLoc, delLocStart, delLocStop, alleles); + new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, 
alleles).make(); + } + + @Test (expectedExceptions = Throwable.class) + public void testBadID1() { + new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(null).make(); + } + + @Test (expectedExceptions = Exception.class) + public void testBadID2() { + new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make(); + } + + @Test (expectedExceptions = Throwable.class) + public void testBadPError() { + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).log10PError(0.5).make(); } @Test public void testAccessingSimpleSNPGenotypes() { List alleles = Arrays.asList(Aref, T); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); - Assert.assertTrue(vc.isPolymorphic()); + Assert.assertFalse(vc.isMonomorphicInSamples()); + Assert.assertTrue(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getSampleNames().size(), 3); Assert.assertEquals(vc.getGenotypes().size(), 3); @@ -342,36 +368,37 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertFalse(vc.hasGenotype("at")); Assert.assertFalse(vc.hasGenotype("tt")); - Assert.assertEquals(vc.getChromosomeCount(), 6); - Assert.assertEquals(vc.getChromosomeCount(Aref), 3); - Assert.assertEquals(vc.getChromosomeCount(T), 3); + 
Assert.assertEquals(vc.getCalledChrCount(), 6); + Assert.assertEquals(vc.getCalledChrCount(Aref), 3); + Assert.assertEquals(vc.getCalledChrCount(T), 3); } @Test public void testAccessingCompleteGenotypes() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); - Genotype g4 = new Genotype("Td", Arrays.asList(T, del), 10); - Genotype g5 = new Genotype("dd", Arrays.asList(del, del), 10); - Genotype g6 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 10); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + Genotype g4 = new Genotype("Td", Arrays.asList(T, del)); + Genotype g5 = new Genotype("dd", Arrays.asList(del, del)); + Genotype g6 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3, g4, g5, g6)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3, g4, g5, g6).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); - Assert.assertTrue(vc.isPolymorphic()); + Assert.assertFalse(vc.isMonomorphicInSamples()); + Assert.assertTrue(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getGenotypes().size(), 6); Assert.assertEquals(3, vc.getGenotypes(Arrays.asList("AA", "Td", "dd")).size()); - Assert.assertEquals(10, vc.getChromosomeCount()); - Assert.assertEquals(3, vc.getChromosomeCount(Aref)); - Assert.assertEquals(4, vc.getChromosomeCount(T)); - Assert.assertEquals(3, vc.getChromosomeCount(del)); - Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + Assert.assertEquals(10, vc.getCalledChrCount()); + 
Assert.assertEquals(3, vc.getCalledChrCount(Aref)); + Assert.assertEquals(4, vc.getCalledChrCount(T)); + Assert.assertEquals(3, vc.getCalledChrCount(del)); + Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); } @Test @@ -380,76 +407,79 @@ public class VariantContextUnitTest extends BaseTest { List alleles2 = Arrays.asList(Aref); List alleles3 = Arrays.asList(Aref, T, del); for ( List alleles : Arrays.asList(alleles1, alleles2, alleles3)) { - Genotype g1 = new Genotype("AA1", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AA2", Arrays.asList(Aref, Aref), 10); - Genotype g3 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 10); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3)); + Genotype g1 = new Genotype("AA1", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AA2", Arrays.asList(Aref, Aref)); + Genotype g3 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertTrue(vc.isMonomorphic()); - Assert.assertFalse(vc.isPolymorphic()); + Assert.assertTrue(vc.isMonomorphicInSamples()); + Assert.assertFalse(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getGenotypes().size(), 3); - Assert.assertEquals(4, vc.getChromosomeCount()); - Assert.assertEquals(4, vc.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc.getChromosomeCount(T)); - Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + Assert.assertEquals(4, vc.getCalledChrCount()); + Assert.assertEquals(4, vc.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc.getCalledChrCount(T)); + Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); } } @Test public void testFilters() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 
10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - MutableVariantContext vc = new MutableVariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1,g2)); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1, g2).make(); Assert.assertTrue(vc.isNotFiltered()); Assert.assertFalse(vc.isFiltered()); Assert.assertEquals(0, vc.getFilters().size()); + Assert.assertFalse(vc.filtersWereApplied()); + Assert.assertNull(vc.getFiltersMaybeNull()); - vc.addFilter("BAD_SNP_BAD!"); + vc = new VariantContextBuilder(vc).filters("BAD_SNP_BAD!").make(); Assert.assertFalse(vc.isNotFiltered()); Assert.assertTrue(vc.isFiltered()); Assert.assertEquals(1, vc.getFilters().size()); + Assert.assertTrue(vc.filtersWereApplied()); + Assert.assertNotNull(vc.getFiltersMaybeNull()); - vc.addFilters(Arrays.asList("REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE")); + Set filters = new HashSet(Arrays.asList("BAD_SNP_BAD!", "REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE")); + vc = new VariantContextBuilder(vc).filters(filters).make(); Assert.assertFalse(vc.isNotFiltered()); Assert.assertTrue(vc.isFiltered()); Assert.assertEquals(3, vc.getFilters().size()); - - vc.clearFilters(); - - Assert.assertTrue(vc.isNotFiltered()); - Assert.assertFalse(vc.isFiltered()); - Assert.assertEquals(0, vc.getFilters().size()); + Assert.assertTrue(vc.filtersWereApplied()); + Assert.assertNotNull(vc.getFiltersMaybeNull()); } @Test - public void testVCromGenotypes() { + public void testVCFfromGenotypes() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); - Genotype g4 = new Genotype("..", Arrays.asList(Allele.NO_CALL, 
Allele.NO_CALL), 10); - Genotype g5 = new Genotype("--", Arrays.asList(del, del), 10); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop , alleles, Arrays.asList(g1,g2,g3,g4,g5)); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + Genotype g4 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + Genotype g5 = new Genotype("--", Arrays.asList(del, del)); + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2)); - VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1)); - VariantContext vc23 = vc.subContextFromGenotypes(Arrays.asList(g2, g3)); - VariantContext vc4 = vc.subContextFromGenotypes(Arrays.asList(g4)); - VariantContext vc14 = vc.subContextFromGenotypes(Arrays.asList(g1, g4)); - VariantContext vc5 = vc.subContextFromGenotypes(Arrays.asList(g5)); + VariantContext vc12 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g2.getSampleName()))); + VariantContext vc1 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName()))); + VariantContext vc23 = vc.subContextFromSamples(new HashSet(Arrays.asList(g2.getSampleName(), g3.getSampleName()))); + VariantContext vc4 = vc.subContextFromSamples(new HashSet(Arrays.asList(g4.getSampleName()))); + VariantContext vc14 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g4.getSampleName()))); + VariantContext vc5 = vc.subContextFromSamples(new HashSet(Arrays.asList(g5.getSampleName()))); - Assert.assertTrue(vc12.isPolymorphic()); - Assert.assertTrue(vc23.isPolymorphic()); - Assert.assertTrue(vc1.isMonomorphic()); - Assert.assertTrue(vc4.isMonomorphic()); - Assert.assertTrue(vc14.isMonomorphic()); - 
Assert.assertTrue(vc5.isPolymorphic()); + Assert.assertTrue(vc12.isPolymorphicInSamples()); + Assert.assertTrue(vc23.isPolymorphicInSamples()); + Assert.assertTrue(vc1.isMonomorphicInSamples()); + Assert.assertTrue(vc4.isMonomorphicInSamples()); + Assert.assertTrue(vc14.isMonomorphicInSamples()); + Assert.assertTrue(vc5.isPolymorphicInSamples()); Assert.assertTrue(vc12.isSNP()); Assert.assertTrue(vc12.isVariant()); @@ -476,12 +506,35 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertTrue(vc5.isVariant()); Assert.assertTrue(vc5.isBiallelic()); - Assert.assertEquals(3, vc12.getChromosomeCount(Aref)); - Assert.assertEquals(1, vc23.getChromosomeCount(Aref)); - Assert.assertEquals(2, vc1.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc4.getChromosomeCount(Aref)); - Assert.assertEquals(2, vc14.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc5.getChromosomeCount(Aref)); + Assert.assertEquals(3, vc12.getCalledChrCount(Aref)); + Assert.assertEquals(1, vc23.getCalledChrCount(Aref)); + Assert.assertEquals(2, vc1.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc4.getCalledChrCount(Aref)); + Assert.assertEquals(2, vc14.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc5.getCalledChrCount(Aref)); + } + + public void testGetGenotypeMethods() { + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + GenotypesContext gc = GenotypesContext.create(g1, g2, g3); + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + + Assert.assertEquals(vc.getGenotype("AA"), g1); + Assert.assertEquals(vc.getGenotype("AT"), g2); + Assert.assertEquals(vc.getGenotype("TT"), g3); + Assert.assertEquals(vc.getGenotype("CC"), null); + + Assert.assertEquals(vc.getGenotypes(), gc); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT")), 
Arrays.asList(g1, g2)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "TT")), Arrays.asList(g1, g3)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "TT")), Arrays.asList(g1, g2, g3)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "CC")), Arrays.asList(g1, g2)); + + Assert.assertEquals(vc.getGenotype(0), g1); + Assert.assertEquals(vc.getGenotype(1), g2); + Assert.assertEquals(vc.getGenotype(2), g3); } // -------------------------------------------------------------------------------- @@ -520,7 +573,7 @@ public class VariantContextUnitTest extends BaseTest { @Test(dataProvider = "getAlleles") public void testMergeAlleles(GetAllelesTest cfg) { final List altAlleles = cfg.alleles.subList(1, cfg.alleles.size()); - final VariantContext vc = new VariantContext("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).referenceBaseForIndel((byte)'A').make(); Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles"); Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size"); @@ -550,4 +603,267 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertFalse(vc.hasAllele(missingAllele)); Assert.assertFalse(vc.hasAllele(missingAllele, true)); } -} + + private class SitesAndGenotypesVC extends TestDataProvider { + VariantContext vc, copy; + + private SitesAndGenotypesVC(String name, VariantContext original) { + super(SitesAndGenotypesVC.class, name); + this.vc = original; + this.copy = new VariantContextBuilder(original).make(); + } + + public String toString() { + return String.format("%s input=%s", super.toString(), vc); + } + } + + @DataProvider(name = "SitesAndGenotypesVC") + public Object[][] MakeSitesAndGenotypesVCs() { + Genotype g1 = new 
Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + + VariantContext sites = new VariantContextBuilder("sites", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).make(); + VariantContext genotypes = new VariantContextBuilder(sites).source("genotypes").genotypes(g1, g2, g3).make(); + + new SitesAndGenotypesVC("sites", sites); + new SitesAndGenotypesVC("genotypes", genotypes); + + return SitesAndGenotypesVC.getTests(SitesAndGenotypesVC.class); + } + + // -------------------------------------------------------------------------------- + // + // Test modifying routines + // + // -------------------------------------------------------------------------------- + @Test(dataProvider = "SitesAndGenotypesVC") + public void runModifyVCTests(SitesAndGenotypesVC cfg) { + VariantContext modified = new VariantContextBuilder(cfg.vc).loc("chr2", 123, 123).make(); + Assert.assertEquals(modified.getChr(), "chr2"); + Assert.assertEquals(modified.getStart(), 123); + Assert.assertEquals(modified.getEnd(), 123); + + modified = new VariantContextBuilder(cfg.vc).id("newID").make(); + Assert.assertEquals(modified.getID(), "newID"); + + Set newFilters = Collections.singleton("newFilter"); + modified = new VariantContextBuilder(cfg.vc).filters(newFilters).make(); + Assert.assertEquals(modified.getFilters(), newFilters); + + modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make(); + Assert.assertEquals(modified.getAttribute("AC"), 1); + modified = new VariantContextBuilder(modified).attribute("AC", 2).make(); + Assert.assertEquals(modified.getAttribute("AC"), 2); + modified = new VariantContextBuilder(modified).attributes(null).make(); + Assert.assertTrue(modified.getAttributes().isEmpty()); + + Genotype g1 = new Genotype("AA2", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT2", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT2", 
Arrays.asList(T, T)); + GenotypesContext gc = GenotypesContext.create(g1,g2,g3); + modified = new VariantContextBuilder(cfg.vc).genotypes(gc).make(); + Assert.assertEquals(modified.getGenotypes(), gc); + modified = new VariantContextBuilder(cfg.vc).noGenotypes().make(); + Assert.assertTrue(modified.getGenotypes().isEmpty()); + + // test that original hasn't changed + Assert.assertEquals(cfg.vc.getChr(), cfg.copy.getChr()); + Assert.assertEquals(cfg.vc.getStart(), cfg.copy.getStart()); + Assert.assertEquals(cfg.vc.getEnd(), cfg.copy.getEnd()); + Assert.assertEquals(cfg.vc.getAlleles(), cfg.copy.getAlleles()); + Assert.assertEquals(cfg.vc.getAttributes(), cfg.copy.getAttributes()); + Assert.assertEquals(cfg.vc.getID(), cfg.copy.getID()); + Assert.assertEquals(cfg.vc.getGenotypes(), cfg.copy.getGenotypes()); + Assert.assertEquals(cfg.vc.getLog10PError(), cfg.copy.getLog10PError()); + Assert.assertEquals(cfg.vc.getFilters(), cfg.copy.getFilters()); + } + + // -------------------------------------------------------------------------------- + // + // Test subcontext + // + // -------------------------------------------------------------------------------- + private class SubContextTest extends TestDataProvider { + Set samples; + boolean updateAlleles; + + private SubContextTest(Collection samples, boolean updateAlleles) { + super(SubContextTest.class); + this.samples = new HashSet(samples); + this.updateAlleles = updateAlleles; + } + + public String toString() { + return String.format("%s samples=%s updateAlleles=%b", super.toString(), samples, updateAlleles); + } + } + + @DataProvider(name = "SubContextTest") + public Object[][] MakeSubContextTest() { + for ( boolean updateAlleles : Arrays.asList(true, false)) { + new SubContextTest(Collections.emptySet(), updateAlleles); + new SubContextTest(Collections.singleton("AA"), updateAlleles); + new SubContextTest(Collections.singleton("AT"), updateAlleles); + new SubContextTest(Collections.singleton("TT"), updateAlleles); + 
new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles); + new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles); + } + + return SubContextTest.getTests(SubContextTest.class); + } + + @Test(dataProvider = "SubContextTest") + public void runSubContextTest(SubContextTest cfg) { + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + + GenotypesContext gc = GenotypesContext.create(g1, g2, g3); + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + VariantContext sub = cfg.updateAlleles ? vc.subContextFromSamples(cfg.samples) : vc.subContextFromSamples(cfg.samples, vc.getAlleles()); + + // unchanged attributes should be the same + Assert.assertEquals(sub.getChr(), vc.getChr()); + Assert.assertEquals(sub.getStart(), vc.getStart()); + Assert.assertEquals(sub.getEnd(), vc.getEnd()); + Assert.assertEquals(sub.getLog10PError(), vc.getLog10PError()); + Assert.assertEquals(sub.getFilters(), vc.getFilters()); + Assert.assertEquals(sub.getID(), vc.getID()); + Assert.assertEquals(sub.getReferenceBaseForIndel(), vc.getReferenceBaseForIndel()); + Assert.assertEquals(sub.getAttributes(), vc.getAttributes()); + + Set expectedGenotypes = new HashSet(); + if ( cfg.samples.contains(g1.getSampleName()) ) expectedGenotypes.add(g1); + if ( cfg.samples.contains(g2.getSampleName()) ) expectedGenotypes.add(g2); + if ( cfg.samples.contains(g3.getSampleName()) ) expectedGenotypes.add(g3); + GenotypesContext expectedGC = GenotypesContext.copy(expectedGenotypes); + + // these values depend on the results of sub + if ( cfg.updateAlleles ) { + // do the work to see what alleles should be here, and which not + Set alleles = new HashSet(); + for ( final Genotype g : expectedGC ) alleles.addAll(g.getAlleles()); + if ( ! 
alleles.contains(Aref) ) alleles.add(Aref); // always have the reference + Assert.assertEquals(new HashSet(sub.getAlleles()), alleles); + } else { + // not updating alleles -- should be the same + Assert.assertEquals(sub.getAlleles(), vc.getAlleles()); + } + + // same sample names => success + Assert.assertEquals(sub.getGenotypes().getSampleNames(), expectedGC.getSampleNames()); + } + + // -------------------------------------------------------------------------------- + // + // Test sample name functions + // + // -------------------------------------------------------------------------------- + private class SampleNamesTest extends TestDataProvider { + List sampleNames; + List sampleNamesInOrder; + + private SampleNamesTest(List sampleNames, List sampleNamesInOrder) { + super(SampleNamesTest.class); + this.sampleNamesInOrder = sampleNamesInOrder; + this.sampleNames = sampleNames; + } + + public String toString() { + return String.format("%s samples=%s order=%s", super.toString(), sampleNames, sampleNamesInOrder); + } + } + + @DataProvider(name = "SampleNamesTest") + public Object[][] MakeSampleNamesTest() { + new SampleNamesTest(Arrays.asList("1"), Arrays.asList("1")); + new SampleNamesTest(Arrays.asList("2", "1"), Arrays.asList("1", "2")); + new SampleNamesTest(Arrays.asList("1", "2"), Arrays.asList("1", "2")); + new SampleNamesTest(Arrays.asList("1", "2", "3"), Arrays.asList("1", "2", "3")); + new SampleNamesTest(Arrays.asList("2", "1", "3"), Arrays.asList("1", "2", "3")); + new SampleNamesTest(Arrays.asList("2", "3", "1"), Arrays.asList("1", "2", "3")); + new SampleNamesTest(Arrays.asList("3", "1", "2"), Arrays.asList("1", "2", "3")); + new SampleNamesTest(Arrays.asList("3", "2", "1"), Arrays.asList("1", "2", "3")); + new SampleNamesTest(Arrays.asList("NA2", "NA1"), Arrays.asList("NA1", "NA2")); + return SampleNamesTest.getTests(SampleNamesTest.class); + } + + private final static void assertGenotypesAreInOrder(Iterable gIt, List names) { + int i = 0; + for ( 
final Genotype g : gIt ) { + Assert.assertEquals(g.getSampleName(), names.get(i), "Unexpected genotype ordering"); + i++; + } + } + + + @Test(dataProvider = "SampleNamesTest") + public void runSampleNamesTest(SampleNamesTest cfg) { + GenotypesContext gc = GenotypesContext.create(cfg.sampleNames.size()); + for ( final String name : cfg.sampleNames ) { + gc.add(new Genotype(name, Arrays.asList(Aref, T))); + } + + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + + // same sample names => success + Assert.assertEquals(vc.getSampleNames(), new HashSet(cfg.sampleNames), "vc.getSampleNames() = " + vc.getSampleNames()); + Assert.assertEquals(vc.getSampleNamesOrderedByName(), cfg.sampleNamesInOrder, "vc.getSampleNamesOrderedByName() = " + vc.getSampleNamesOrderedByName()); + + assertGenotypesAreInOrder(vc.getGenotypesOrderedByName(), cfg.sampleNamesInOrder); + assertGenotypesAreInOrder(vc.getGenotypesOrderedBy(cfg.sampleNames), cfg.sampleNames); + } + + @Test + public void testGenotypeCounting() { + Genotype noCall = new Genotype("nocall", Arrays.asList(Allele.NO_CALL)); + Genotype mixed = new Genotype("mixed", Arrays.asList(Aref, Allele.NO_CALL)); + Genotype homRef = new Genotype("homRef", Arrays.asList(Aref, Aref)); + Genotype het = new Genotype("het", Arrays.asList(Aref, T)); + Genotype homVar = new Genotype("homVar", Arrays.asList(T, T)); + + List allGenotypes = Arrays.asList(noCall, mixed, homRef, het, homVar); + final int nCycles = allGenotypes.size() * 10; + + for ( int i = 0; i < nCycles; i++ ) { + int nNoCall = 0, nNoCallAlleles = 0, nA = 0, nT = 0, nMixed = 0, nHomRef = 0, nHet = 0, nHomVar = 0; + int nSamples = 0; + GenotypesContext gc = GenotypesContext.create(); + for ( int j = 0; j < i; j++ ) { + nSamples++; + Genotype g = allGenotypes.get(j % allGenotypes.size()); + final String name = String.format("%s_%d%d", g.getSampleName(), i, j); + gc.add(new Genotype(name, 
g.getAlleles())); + switch ( g.getType() ) { + case NO_CALL: nNoCall++; nNoCallAlleles++; break; + case HOM_REF: nA += 2; nHomRef++; break; + case HET: nA++; nT++; nHet++; break; + case HOM_VAR: nT += 2; nHomVar++; break; + case MIXED: nA++; nNoCallAlleles++; nMixed++; break; + default: throw new RuntimeException("Unexpected genotype type " + g.getType()); + } + + } + + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + Assert.assertEquals(vc.getNSamples(), nSamples); + if ( nSamples > 0 ) { + Assert.assertEquals(vc.isPolymorphicInSamples(), nT > 0); + Assert.assertEquals(vc.isMonomorphicInSamples(), nT == 0); + } + Assert.assertEquals(vc.getCalledChrCount(), nA + nT); + + Assert.assertEquals(vc.getCalledChrCount(Allele.NO_CALL), nNoCallAlleles); + Assert.assertEquals(vc.getCalledChrCount(Aref), nA); + Assert.assertEquals(vc.getCalledChrCount(T), nT); + + Assert.assertEquals(vc.getNoCallCount(), nNoCall); + Assert.assertEquals(vc.getHomRefCount(), nHomRef); + Assert.assertEquals(vc.getHetCount(), nHet); + Assert.assertEquals(vc.getHomVarCount(), nHomVar); + Assert.assertEquals(vc.getMixedCount(), nMixed); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 845d9c216..ccf560f83 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -98,9 +99,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { int start = 10; int stop = start; // alleles.contains(ATC) ? start + 3 : start; - return new VariantContext(source, "1", start, stop, alleles, - genotypes == null ? null : VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes), - 1.0, filters, null, Cref.getBases()[0]); + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).referenceBaseForIndel(Cref.getBases()[0]).make(); } // -------------------------------------------------------------------------------- @@ -246,20 +245,18 @@ public class VariantContextUtilsUnitTest extends BaseTest { @Test(dataProvider = "simplemergersiddata") public void testRSIDMerge(SimpleMergeRSIDTest cfg) { - final VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); + VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); final List inputs = new ArrayList(); for ( final String id : cfg.inputs ) { - MutableVariantContext vc = new MutableVariantContext(snpVC1); - if ( ! id.equals(".") ) vc.setID(id); - inputs.add(vc); + inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); } final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser, inputs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); - Assert.assertEquals(merged.getID(), cfg.expected.equals(".") ? 
null : cfg.expected); + Assert.assertEquals(merged.getID(), cfg.expected); } // -------------------------------------------------------------------------------- @@ -412,44 +409,44 @@ public class VariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "mergeGenotypes") public Object[][] mergeGenotypesData() { new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), 
makeG("s1", Aref, T, 1), makeG("s2", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); new MergeGenotypesTest("PreserveNoCall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1), makeG("s2", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); new MergeGenotypesTest("PerserveAlleles", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, 2)), - makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1), makeG("s2", Aref, C, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), + makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1), makeG("s3", Aref, T, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", - makeVC("1", 
Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); // // merging genothpes with PLs @@ -457,41 +454,41 @@ public class VariantContextUtilsUnitTest extends BaseTest { // first, do no harm new MergeGenotypesTest("OrderedPLs", "1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)), - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles", "1", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", 
Aref, C, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", - makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, 1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)), + makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, 1), makeG("s3", Aref, T, 3))); + makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1, 1, 2, 3, 4, 5, 6)), - makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, 2, 6, 5, 4, 3, 
2, 1)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1), makeG("s2", Aref, T, 2))); + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); return MergeGenotypesTest.getTests(MergeGenotypesTest.class); } @@ -510,7 +507,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { } // necessary to not overload equals for genotypes - private void assertGenotypesAreMostlyEqual(Map actual, Map expected) { + private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { if (actual == expected) { return; } @@ -523,13 +520,11 @@ public class VariantContextUtilsUnitTest extends BaseTest { Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); } - for (Map.Entry entry : actual.entrySet()) { - String key = entry.getKey(); - Genotype value = entry.getValue(); - Genotype expectedValue = expected.get(key); + for (Genotype value : actual) { + Genotype expectedValue = expected.get(value.getSampleName()); Assert.assertEquals(value.alleles, expectedValue.alleles, "Alleles in Genotype aren't equal"); - Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError(), "GQ values aren't equal"); + Assert.assertEquals(value.getLog10PError(), expectedValue.getLog10PError(), "GQ values aren't equal"); Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); if ( value.hasLikelihoods() ) Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); @@ -538,21 +533,21 @@ public class VariantContextUtilsUnitTest extends BaseTest { @Test public void 
testMergeGenotypesUniquify() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)); + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser, Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); // test genotypes - Assert.assertEquals(merged.getGenotypes().keySet(), new HashSet(Arrays.asList("s1.1", "s1.2"))); + Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); } @Test(expectedExceptions = UserException.class) public void testMergeGenotypesRequireUnique() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)); + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser, Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java index b5f6b1b1a..6f5756bdc 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java @@ -24,6 +24,7 @@ package 
org.broadinstitute.sting.utils.variantcontext; import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; @@ -143,7 +144,7 @@ public class VariantJEXLContextUnitTest extends BaseTest { private JEXLMap getVarContext() { List alleles = Arrays.asList(Aref, T); - VariantContext vc = new VariantContext("test", snpLoc.getContig(), snpLoc.getStart(), snpLoc.getStop(), alleles); + VariantContext vc = new VariantContextBuilder("test", snpLoc.getContig(), snpLoc.getStart(), snpLoc.getStop(), alleles).make(); return new JEXLMap(Arrays.asList(exp),vc); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 768eab7e4..913bd243c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -26,7 +26,6 @@ package org.broadinstitute.sting.queue import function.QFunction import java.io.File -import java.util.Arrays import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.queue.engine.{QGraphSettings, QGraph} @@ -34,6 +33,9 @@ import collection.JavaConversions._ import org.broadinstitute.sting.utils.classloader.PluginManager import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.utils.io.IOUtils +import org.broadinstitute.sting.utils.help.ApplicationDetails +import java.util.{ResourceBundle, Arrays} +import org.broadinstitute.sting.utils.text.TextFormattingUtils /** * Entry point of Queue. Compiles and runs QScripts passed in to the command line. 
@@ -175,6 +177,42 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentTypeDescriptors = Arrays.asList(new ScalaCompoundArgumentTypeDescriptor) + override def getApplicationDetails : ApplicationDetails = { + new ApplicationDetails(createQueueHeader(), + List.empty[String], + ApplicationDetails.createDefaultRunningInstructions(getClass.asInstanceOf[Class[CommandLineProgram]]), + "") + } + + private def createQueueHeader() : List[String] = { + List(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), + "Copyright (c) 2011 The Broad Institute", + "Please view our documentation at http://www.broadinstitute.org/gsa/wiki", + "For support, please view our support site at http://getsatisfaction.com/gsa") + } + + private def getQueueVersion : String = { + var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") + + if ( stingResources.containsKey("org.broadinstitute.sting.queue.QueueVersion.version") ) { + stingResources.getString("org.broadinstitute.sting.queue.QueueVersion.version") + } + else { + "" + } + } + + private def getBuildTimestamp : String = { + var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") + + if ( stingResources.containsKey("build.timestamp") ) { + stingResources.getString("build.timestamp") + } + else { + "" + } + } + def shutdown() = { shuttingDown = true qGraph.shutdown() diff --git a/public/java/src/net/sf/samtools/GATKBinList.java b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala similarity index 57% rename from public/java/src/net/sf/samtools/GATKBinList.java rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala index b53062aaf..d90db0de4 100644 --- a/public/java/src/net/sf/samtools/GATKBinList.java +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala @@ 
-22,30 +22,27 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package net.sf.samtools; +package org.broadinstitute.sting.queue.extensions.gatk -import java.util.BitSet; +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Output, Argument, Input} +import java.io.File +import org.broadinstitute.sting.utils.interval.IntervalUtils -/** - * A temporary solution to work around Java access rights issues: - * override chunk and make it public. - * TODO: Eliminate once we determine the final fate of the BAM index reading code. - */ -public class GATKBinList extends BinList { - /** - * Create a new BinList over sequenceCount sequences, consisting of the given bins. - * @param referenceSequence Reference sequence to which these bins are relevant. - * @param bins The given bins to include. - */ - public GATKBinList(final int referenceSequence, final BitSet bins) { - super(referenceSequence,bins); - } +class WriteFlankingIntervalsFunction extends InProcessFunction { + @Input(doc="The reference sequence") + var reference : File = _ - /** - * Retrieves the bins stored in this list. - * @return A bitset where a bin is present in the list if the bit is true. 
- */ - public BitSet getBins() { - return super.getBins(); - } + @Input(doc="The interval list to flank") + var inputIntervals : File = _ + + @Output(doc="The output intervals file to write to") + var outputIntervals: File = _ + + @Argument(doc="Number of base pair to flank the input intervals") + var flankSize : Int = _ + + def run() { + IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize) + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala deleted file mode 100755 index 77eb3ccbc..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import collection.immutable.TreeSet - -// todo -- this is unsafe. 
Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { - @Input(doc="The interval list to expand") val inList : File = in - @Input(doc="The reference sequence") val refDict : File = ref - @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start - @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size - @Output(doc="The output intervals file to write to") val outList : File = out - @Argument(doc="The output format for the intervals") val outTypeStr = opType - @Argument(doc="The input format for the intervals") val inTypeStr = ipType - - var output : PrintStream = _ - var parser : GenomeLocParser = _ - var xrl : XReadLines = _ - val outType = IntervalFormatType.convert(outTypeStr) - val inType = IntervalFormatType.convert(inTypeStr) - - var offsetIn : Int = 0 - var offsetOut : Int = 0 - - var first : Boolean = true - var lastTwo : (GenomeLoc,GenomeLoc) = _ - - var intervalCache : TreeSet[GenomeLoc] = _ - val LINES_TO_CACHE : Int = 1000 - - def run = { - output = new PrintStream(outList) - intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{ - def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) } - }) - parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - xrl = new XReadLines(inList) - offsetIn = if (isBed(inType)) 1 else 0 - offsetOut = if( isBed(outType)) 1 else 0 - var line : String = xrl.next - while ( line.startsWith("@") ) { - line = xrl.next - } - var prevLoc: GenomeLoc = null - var curLoc: GenomeLoc = null - var nextLoc : GenomeLoc = parseGenomeInterval(line) - var linesProcessed : Int = 1 - while ( prevLoc != null || curLoc != null || nextLoc != null ) { - prevLoc = curLoc - curLoc = nextLoc - nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null - if ( curLoc != null ) { 
- val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc) - val right: GenomeLoc = refine(expandRight(curLoc),nextLoc) - if ( left != null ) { - intervalCache += left - } - if ( right != null ) { - intervalCache += right - } - } - linesProcessed += 1 - if ( linesProcessed % LINES_TO_CACHE == 0 ) { - val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt)) - intervalCache = intervalCache -- toPrint - toPrint.foreach(u => output.print("%s%n".format(repr(u)))) - } - //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc))) - } - - intervalCache.foreach(u => output.print("%s%n".format(repr(u)))) - - output.close() - } - - def expandLeft(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt) - } - - def expandRight(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt) - } - - def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = { - if ( borderG == null || ! 
newG.overlapsP(borderG) ) { - return newG - } else { - if ( newG.getStart < borderG.getStart ) { - if ( borderG.getStart - startInt > newG.getStart ) { - return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt) - } - } else { - if ( borderG.getStop + startInt < newG.getStop ){ - return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop) - } - } - } - - null - } - - def repr(loc : GenomeLoc) : String = { - if ( loc == null ) return "null" - if ( outType == IntervalFormatType.INTERVALS ) { - return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut) - } - } - - def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { - t == IntervalFormatType.BED - } - - def parseGenomeInterval( s : String ) : GenomeLoc = { - val sp = s.split("\\s+") - // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) - } - - object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { - type IntervalFormatType = Value - val INTERVALS,BED,TDF = Value - - def convert(s : String) : IntervalFormatType = { - if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} - } - } -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala deleted file mode 100755 index e929477a1..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ /dev/null @@ -1,70 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import 
collection.JavaConversions._ -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} -import org.broadinstitute.sting.utils.text.XReadLines -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} - -class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { - @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals - @Output(doc="Output interval file to which to write") val output : File = outFile - @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false - @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - - - var outStream : PrintStream = _ - var contigs : List[String] = Nil - var dict : SAMSequenceDictionary = _ - var parser : GenomeLocParser = _ - - def run = { - outStream = new PrintStream(output) - dict = new SAMSequenceDictionary - // note: memory hog - val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex - sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) - parser = new GenomeLocParser(dict) - sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) - outStream.close() - } - - def writeOut(g : GenomeLoc) : Unit = { - outStream.print("%s%n".format(g.toString)) - } - - def parse(s : String) : (String,Int,Int) = { - if ( s.contains(":") ) { - val split1 = s.split(":") - val split2 = split1(1).split("-") - return (split1(0),split2(0).toInt,split2(1).toInt) - 
} else { - val split = s.split("\\s+") - return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) ) - } - } - - def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { - parser.createGenomeLoc(coords._1,coords._2,coords._3) - } - - def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { - if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { - a.init :+ (a.last :+ b) - } else { - a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b) - } - } - - def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { - if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for - null - } - u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) - } - - def range(a: Int, b: Int) : Range = new Range(a,b+1,1) - -} \ No newline at end of file diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar old mode 100755 new mode 100644 similarity index 88% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar index bfd06f97f..ee5d02367 Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml similarity index 77% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml index f0568def4..5417641d3 100644 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml +++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml @@ -1,3 +1,3 @@ - +