diff --git a/java/src/net/sf/samtools/BAMFileReader2.java b/java/src/net/sf/samtools/BAMFileReader2.java index f0412892f..0509b8df6 100644 --- a/java/src/net/sf/samtools/BAMFileReader2.java +++ b/java/src/net/sf/samtools/BAMFileReader2.java @@ -45,6 +45,7 @@ class BAMFileReader2 private BinaryCodec mStream = null; // Underlying compressed data stream. private final BlockCompressedInputStream mCompressedInputStream; + private SAMFileReader mFileReader = null; private SAMFileHeader mFileHeader = null; // Populated if the file is seekable and an index exists private BAMFileIndex2 mFileIndex = null; @@ -100,6 +101,14 @@ class BAMFileReader2 mFirstRecordPointer = mCompressedInputStream.getFilePointer(); } + /** + * Sets the reader reading this file. + * @param reader The source reader. + */ + void setReader(SAMFileReader reader) { + mFileReader = reader; + } + void close() { if (mStream != null) { mStream.close(); @@ -415,11 +424,16 @@ class BAMFileReader2 void advance() { try { + long startCoordinate = mCompressedInputStream.getFilePointer(); mNextRecord = getNextRecord(); + long stopCoordinate = mCompressedInputStream.getFilePointer(); + if (mNextRecord != null) { ++this.samRecordIndex; // Because some decoding is done lazily, the record needs to remember the validation stringency. + mNextRecord.setReader(mFileReader); mNextRecord.setValidationStringency(mValidationStringency); + mNextRecord.setCoordinates(new Chunk(startCoordinate,stopCoordinate)); if (mValidationStringency != ValidationStringency.SILENT) { final List validationErrors = mNextRecord.isValid(); diff --git a/java/src/net/sf/samtools/Block.java b/java/src/net/sf/samtools/Block.java deleted file mode 100644 index d3b7c5393..000000000 --- a/java/src/net/sf/samtools/Block.java +++ /dev/null @@ -1,34 +0,0 @@ -package net.sf.samtools; - -/** - * Represents the position of a block on disk. - * - * @author mhanna - * @version 0.1 - */ -public class Block { - public final long position; - public final int compressedBlockSize; - public final long uncompressedBlockSize; - - /** - * Create a block, loading no data into memory. - * @param position Position of this block on disk.s - * @param compressedBlockSize Size of the block on disk; if compressedData is present, should match compressedData.length. - * @param uncompressedBlockSize Size of the data in the block. - */ - public Block(final long position, final int compressedBlockSize, final long uncompressedBlockSize) { - this.position = position; - this.compressedBlockSize = compressedBlockSize; - this.uncompressedBlockSize = uncompressedBlockSize; - } - - /** - * Build a string representation of the block. - * @return A string indicating position and size. - */ - @Override - public String toString() { - return String.format("Block: pos = %d, compressed size = %d, uncompressed size = %d",position,compressedBlockSize,uncompressedBlockSize); - } -} diff --git a/java/src/net/sf/samtools/BlockReader.java b/java/src/net/sf/samtools/BlockReader.java deleted file mode 100644 index 1da6b5cd3..000000000 --- a/java/src/net/sf/samtools/BlockReader.java +++ /dev/null @@ -1,104 +0,0 @@ -package net.sf.samtools; - -import net.sf.samtools.util.BlockCompressedStreamConstants; - -import java.io.IOException; -import java.io.FileInputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; - -/** - * Read an individual block from the BGZF file. - * - * @author mhanna - * @version 0.1 - */ -public class BlockReader { - /** - * File channel from which to read. - */ - private final FileChannel channel; - - /** - * Store buffered information from the file temporarily. - */ - private final ByteBuffer buffer; - - /** - * Create a new block reader. Block readers can operate independently on the same input file. - * @param inputStream InputStream from which to read. - */ - public BlockReader(final FileInputStream inputStream) { - this.channel = inputStream.getChannel(); - this.buffer = ByteBuffer.allocateDirect(BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - - /** - * Closes the block reader's channel. - * @throws IOException On failure to close channel. - */ - public void close() throws IOException { - this.channel.close(); - } - - /** - * Determine whether the given offset is at the end of the file. - * @param position Position at which to start reading. Must be at the beginning of a block. - * @return True if we're at the end of the file (or at the end of the data stream), false otherwise. - * @throws IOException If whether the position is past the end of the file can't be determined. - */ - public boolean eof(long position) throws IOException { - return (channel.size() - position) <= 0 || - (channel.size() - position) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; - } - - /** - * Load the chunk information at a given position. - * @param position Position at which the chunk begins. - * @return Chunk of the BGZF file, starting at position. - * @throws IOException if the chunk could not be read. - */ - public Block getBlockAt(long position) throws IOException { - buffer.rewind(); - buffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - int count = channel.read(buffer,position); - if (count == 0) { - throw new IOException("Tried to read chunk past end of file"); - } - if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { - throw new IOException("Premature end of file"); - } - buffer.flip(); - - // Validate GZIP header - if (buffer.get() != BlockCompressedStreamConstants.GZIP_ID1 || - buffer.get() != (byte)BlockCompressedStreamConstants.GZIP_ID2 || - buffer.get() != BlockCompressedStreamConstants.GZIP_CM_DEFLATE || - buffer.get() != BlockCompressedStreamConstants.GZIP_FLG) { - throw new SAMFormatException("Invalid GZIP header"); - } - // Skip MTIME, XFL, OS fields - buffer.position(buffer.position() + 6); - if (buffer.getShort() != BlockCompressedStreamConstants.GZIP_XLEN) { - throw new SAMFormatException("Invalid GZIP header"); - } - // Skip blocksize subfield intro - buffer.position(buffer.position() + 4); - // Read ushort - final int compressedBlockSize = (buffer.getShort() & 0xffff) + 1; - if (compressedBlockSize < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || compressedBlockSize > buffer.capacity()) { - throw new IOException("Unexpected compressed block length: " + compressedBlockSize); - } - - // Read the uncompressed block size - buffer.rewind(); - buffer.limit(4); - channel.read(buffer,position+compressedBlockSize-4); - buffer.flip(); - final int uncompressedBlockSize = buffer.getInt(); - - return new Block(position,compressedBlockSize,uncompressedBlockSize); - } -} diff --git a/java/src/net/sf/samtools/BlockSegment.java b/java/src/net/sf/samtools/BlockSegment.java deleted file mode 100644 index eb6082e44..000000000 --- a/java/src/net/sf/samtools/BlockSegment.java +++ /dev/null @@ -1,112 +0,0 @@ -package net.sf.samtools; - -import java.util.List; -import java.util.Collections; -import java.util.ArrayList; - -/** - * Represents a segment of a given block. A block segment can easily be - * converted or compared to either a block or a chunk. - */ -class BlockSegment { - /** - * Offset to the start of this block within the file. - */ - private final long position; - - /** - * Starting coordinate within this segment. - */ - private final long blockStart; - - /** - * Last coordinate within this segment. - */ - private final long blockStop; - - /** - * Create a new block segment from the given block. - * @param block the block to use as the starting point for this segment. - */ - public BlockSegment(Block block) { - this(block.position,0,block.uncompressedBlockSize-1); - } - - BlockSegment(final long position,final long blockStart,final long blockStop) { - this.position = position; - this.blockStart = blockStart; - this.blockStop = blockStop; - } - - /** - * The size of the block segment. - * @return Number of bytes represented by this block segment. - */ - public long size() { - return blockStop - blockStart + 1; - } - - /** - * Convert the given segment to a chunk. - * @return the chunk equivalent of this block. - */ - public Chunk toChunk() { - return new Chunk(position << 16 | blockStart,position << 16 | blockStop); - } - - /** - * Does this block segment overlap the given chunk? - * @param chunk The chunk to test. - * @return true if the chunk is w/i the block; false otherwise. - */ - public boolean overlaps(Chunk chunk) { - Chunk segmentAsChunk = toChunk(); - - // Chunk is completely encompasses the given block. - if(chunk.getChunkStart() < segmentAsChunk.getChunkStart() && - chunk.getChunkEnd() > segmentAsChunk.getChunkEnd()) - return true; - - // Chunk starts w/i block. - if(chunk.getChunkStart() >= segmentAsChunk.getChunkStart() && - chunk.getChunkStart() <= segmentAsChunk.getChunkEnd()) - return true; - - // Chunk ends w/i block. - if(chunk.getChunkEnd() >= segmentAsChunk.getChunkStart() && - chunk.getChunkEnd() <= segmentAsChunk.getChunkEnd()) - return true; - - return false; - } - - /** - * Subtracts the given chunk from the block segment. - * @param chunk The chunk to subtract. - * @return The block segment(s) derived from subtracting the chunk. - */ - public List minus(Chunk chunk) { - // If no overlap exists, just return a new instance of the current block segment. - if(!overlaps(chunk)) - return Collections.singletonList(this); - - List differenceSegments = new ArrayList(); - Chunk segmentAsChunk = toChunk(); - - // If the segment begins before the chunk, build out a segment from the start of the block to just before - // the start of the chunk or the end of the block segment, whichever comes first. - if(segmentAsChunk.getChunkStart() < chunk.getChunkStart()) { - long segmentStop = (position == chunk.getChunkStart()>>16) ? (chunk.getChunkStart()&0xFFFF)-1 : blockStop; - differenceSegments.add(new BlockSegment(position,blockStart,segmentStop)); - } - - // If the segment ends after the chunk, build out a segment from either after the end of the chunk or the - // start of the block segment, whichever comes last. - if(segmentAsChunk.getChunkEnd() > chunk.getChunkEnd()) { - long segmentStart = (position == chunk.getChunkEnd()>>16) ? (chunk.getChunkEnd()&0xFFFF)+1 : blockStart; - differenceSegments.add(new BlockSegment(position,segmentStart,blockStop)); - } - - return differenceSegments; - } -} diff --git a/java/src/net/sf/samtools/Chunk.java b/java/src/net/sf/samtools/Chunk.java index 949ec8879..a9750ce28 100644 --- a/java/src/net/sf/samtools/Chunk.java +++ b/java/src/net/sf/samtools/Chunk.java @@ -17,7 +17,7 @@ public class Chunk implements Cloneable,Comparable { private long mChunkStart; private long mChunkEnd; - Chunk(final long start, final long end) { + public Chunk(final long start, final long end) { mChunkStart = start; mChunkEnd = end; } diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java index 9943e6d53..9f2b2a7b8 100644 --- a/java/src/net/sf/samtools/SAMFileReader2.java +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -73,6 +73,7 @@ public class SAMFileReader2 extends SAMFileReader { try { BAMFileReader2 reader = new BAMFileReader2(file,eagerDecode,getDefaultValidationStringency()); + reader.setReader(this); BAMFileIndex2 index = new BAMFileIndex2(indexFile != null ? indexFile : findIndexFileFromParent(file)); reader.setFileIndex(index); diff --git a/java/src/net/sf/samtools/SAMRecord.java b/java/src/net/sf/samtools/SAMRecord.java new file mode 100644 index 000000000..d21a6b7f0 --- /dev/null +++ b/java/src/net/sf/samtools/SAMRecord.java @@ -0,0 +1,1558 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.StringUtil; + +import java.util.*; + +/** + * Java binding for a SAM file record. c.f. http://samtools.sourceforge.net/SAM1.pdf + * + * The presence of reference name/reference index and alignment start + * do not necessarily mean that a read is aligned. Those values may merely be set to force a SAMRecord + * to appear in a certain place in the sort order. The readUnmappedFlag must be checked to determine whether + * or not a read is mapped. Only if the readUnmappedFlag is false can the reference name/index and alignment start + * be interpretted as indicating an actual alignment position. + * + * Likewise, presence of mate reference name/index and mate alignment start do not necessarily mean that the + * mate is aligned. These may be set for an unaligned mate if the mate has been forced into a particular place + * in the sort order per the above paragraph. Only if the mateUnmappedFlag is false can the mate reference name/index + * and mate alignment start be interpretted as indicating the actual alignment position of the mate. + * + * Note also that there are a number of getters & setters that are linked, i.e. they present different representations + * of the same underlying data. In these cases there is typically a representation that is preferred because it + * ought to be faster than some other representation. The following are the preferred representations: + * + * getReadNameLength() is preferred to getReadName().length() + * get/setReadBases() is preferred to get/setReadString() + * get/setBaseQualities() is preferred to get/setBaseQualityString() + * get/setReferenceIndex() is preferred to get/setReferenceName() + * get/setMateReferenceIndex() is preferred to get/setMateReferenceName() + * getCigarLength() is preferred to getCigar().getNumElements() + * get/setCigar() is preferred to get/setCigarString() + * + * Note that setIndexingBin() need not be called when writing SAMRecords. It will be computed as necessary. It is only + * present as an optimization in the event that the value is already known and need not be computed. + * + * setHeader() need not be called when writing SAMRecords. It may be convenient to call it, however, because + * get/setReferenceIndex() and get/setMateReferenceIndex() must have access to the SAM header, either as an argument + * or previously passed to setHeader(). + * + * setHeader() is called by the SAM reading code, so the get/setReferenceIndex() and get/setMateReferenceIndex() + * methods will have access to the sequence dictionary. + * + * Some of the get() methods return values that are mutable, due to the limitations of Java. A caller should + * never change the value returned by a get() method. If you want to change the value of some attribute of a + * SAMRecord, create a new value object and call the appropriate set() method. + */ +public class SAMRecord implements Cloneable +{ + /** + * Alignment score for a good alignment, but where computing a Phred-score is not feasible. + */ + public static final int UNKNOWN_MAPPING_QUALITY = 255; + + /** + * Alignment score for an unaligned read. + */ + public static final int NO_MAPPING_QUALITY = 0; + + /** + * If a read has this reference name, it is unaligned, but not all unaligned reads have + * this reference name (see above). + */ + public static final String NO_ALIGNMENT_REFERENCE_NAME = "*"; + + /** + * If a read has this reference index, it is unaligned, but not all unaligned reads have + * this reference index (see above). + */ + public static final int NO_ALIGNMENT_REFERENCE_INDEX = -1; + + /** + * Cigar string for an unaligned read. + */ + public static final String NO_ALIGNMENT_CIGAR = "*"; + + /** + * If a read has reference name "*", it will have this value for position. + */ + public static final int NO_ALIGNMENT_START = 0; + + /** + * This should rarely be used, since a read with no sequence doesn't make much sense. + */ + public static final byte[] NULL_SEQUENCE = new byte[0]; + + public static final String NULL_SEQUENCE_STRING = "*"; + + /** + * This should rarely be used, since all reads should have quality scores. + */ + public static final byte[] NULL_QUALS = new byte[0]; + public static final String NULL_QUALS_STRING = "*"; + + /** + * abs(insertSize) must be <= this + */ + public static final int MAX_INSERT_SIZE = 1<<29; + + /** + * It is not necessary in general to use the flag constants, because there are getters + * & setters that handles these symbolically. + */ + private static final int READ_PAIRED_FLAG = 0x1; + private static final int PROPER_PAIR_FLAG = 0x2; + private static final int READ_UNMAPPED_FLAG = 0x4; + private static final int MATE_UNMAPPED_FLAG = 0x8; + private static final int READ_STRAND_FLAG = 0x10; + private static final int MATE_STRAND_FLAG = 0x20; + private static final int FIRST_OF_PAIR_FLAG = 0x40; + private static final int SECOND_OF_PAIR_FLAG = 0x80; + private static final int NOT_PRIMARY_ALIGNMENT_FLAG = 0x100; + private static final int READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200; + private static final int DUPLICATE_READ_FLAG = 0x400; + + + private String mReadName = null; + private byte[] mReadBases = NULL_SEQUENCE; + private byte[] mBaseQualities = NULL_QUALS; + private String mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; + private int mAlignmentStart = NO_ALIGNMENT_START; + private transient int mAlignmentEnd = NO_ALIGNMENT_START; + private int mMappingQuality = NO_MAPPING_QUALITY; + private String mCigarString = NO_ALIGNMENT_CIGAR; + private Cigar mCigar = null; + private List mAlignmentBlocks = null; + private int mFlags = 0; + private String mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; + private int mMateAlignmentStart = 0; + private int mInferredInsertSize = 0; + private List mAttributes = null; + private Integer mReferenceIndex = null; + private Integer mMateReferenceIndex = null; + private Integer mIndexingBin = null; + + /** + * Some attributes (e.g. CIGAR) are not decoded immediately. Use this to decide how to validate when decoded. + */ + private SAMFileReader.ValidationStringency mValidationStringency = SAMFileReader.ValidationStringency.SILENT; + + private SAMFileReader mReader = null; + private SAMFileHeader mHeader = null; + + /** + * Where is this chunk located on the file system? + */ + private Chunk coordinates; + + public SAMRecord(final SAMFileHeader header) { + mHeader = header; + } + + public String getReadName() { + return mReadName; + } + + /** + * This method is preferred over getReadName().length(), because for BAMRecord + * it may be faster. + * @return length not including a null terminator. + */ + public int getReadNameLength() { + return mReadName.length(); + } + + public void setReadName(final String value) { + mReadName = value; + } + + /** + * @return read sequence as a string of ACGTN=. + */ + public String getReadString() { + final byte[] readBases = getReadBases(); + if (readBases.length == 0) { + return NULL_SEQUENCE_STRING; + } + return StringUtil.bytesToString(readBases); + } + + public void setReadString(final String value) { + if (NULL_SEQUENCE_STRING.equals(value)) { + mReadBases = NULL_SEQUENCE; + } else { + final byte[] bases = StringUtil.stringToBytes(value); + SAMUtils.normalizeBases(bases); + setReadBases(bases); + } + } + + + /** + * Do not modify the value returned by this method. If you want to change the bases, create a new + * byte[] and call setReadBases() or call setReadString(). + * @return read sequence as ASCII bytes ACGTN=. + */ + public byte[] getReadBases() { + return mReadBases; + } + + public void setReadBases(final byte[] value) { + mReadBases = value; + } + + /** + * This method is preferred over getReadBases().length, because for BAMRecord it may be faster. + * @return number of bases in the read. + */ + public int getReadLength() { + return getReadBases().length; + } + + /** + * @return Base qualities, encoded as a FASTQ string. + */ + public String getBaseQualityString() { + if (Arrays.equals(NULL_QUALS, getBaseQualities())) { + return NULL_QUALS_STRING; + } + return SAMUtils.phredToFastq(getBaseQualities()); + } + + public void setBaseQualityString(final String value) { + if (NULL_QUALS_STRING.equals(value)) { + setBaseQualities(NULL_QUALS); + } else { + setBaseQualities(SAMUtils.fastqToPhred(value)); + } + } + + /** + * Do not modify the value returned by this method. If you want to change the qualities, create a new + * byte[] and call setBaseQualities() or call setBaseQualityString(). + * @return Base qualities, as binary phred scores (not ASCII). + */ + public byte[] getBaseQualities() { + return mBaseQualities; + } + + public void setBaseQualities(final byte[] value) { + mBaseQualities = value; + } + + /** + * If the original base quality scores have been store in the "OQ" tag will return the numeric + * score as a byte[] + */ + public byte[] getOriginalBaseQualities() { + final String oqString = (String) getAttribute("OQ"); + if (oqString != null && oqString.length() > 0) { + return SAMUtils.fastqToPhred(oqString); + } + else { + return null; + } + } + + /** + * Sets the original base quality scores into the "OQ" tag as a String. Supplied value should be + * as phred-scaled numeric qualities. + */ + public void setOriginalBaseQualities(final byte[] oq) { + setAttribute("OQ", SAMUtils.phredToFastq(oq)); + } + + private static boolean hasReferenceName(final Integer referenceIndex, final String referenceName) { + return (referenceIndex != null && referenceIndex != NO_ALIGNMENT_REFERENCE_INDEX) || + !NO_ALIGNMENT_REFERENCE_NAME.equals(referenceName); + } + + /** + * @return true if this SAMRecord has a reference, either as a String or index (or both). + */ + private boolean hasReferenceName() { + return hasReferenceName(mReferenceIndex, mReferenceName); + } + + /** + * @return true if this SAMRecord has a mate reference, either as a String or index (or both). + */ + private boolean hasMateReferenceName() { + return hasReferenceName(mMateReferenceIndex, mMateReferenceName); + } + + /** + * @return Reference name, or null if record has no reference. + */ + public String getReferenceName() { + return mReferenceName; + } + + public void setReferenceName(final String value) { + mReferenceName = value.intern(); + mReferenceIndex = null; + } + + /** + * @return index of the reference sequence for this read in the sequence dictionary, or -1 + * if read has no reference sequence set, or if a String reference name is not found in the sequence index.. + */ + public Integer getReferenceIndex() { + if (mReferenceIndex == null) { + if (mReferenceName == null) { + mReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; + } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mReferenceName)) { + mReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; + } else { + mReferenceIndex = mHeader.getSequenceIndex(mReferenceName); + } + } + return mReferenceIndex; + } + + /** + * @param referenceIndex Must either equal -1 (indicating no reference), or exist in the sequence dictionary + * in the header associated with this record. + */ + public void setReferenceIndex(final int referenceIndex) { + mReferenceIndex = referenceIndex; + if (mReferenceIndex == NO_ALIGNMENT_REFERENCE_INDEX) { + mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; + } else { + mReferenceName = mHeader.getSequence(referenceIndex).getSequenceName(); + } + } + + /** + * @return Mate reference name, or null if one is not assigned. + */ + public String getMateReferenceName() { + return mMateReferenceName; + } + + public void setMateReferenceName(final String mateReferenceName) { + this.mMateReferenceName = mateReferenceName.intern(); + mMateReferenceIndex = null; + } + + /** + * @return index of the reference sequence for this read's mate in the sequence dictionary, or -1 + * if mate has no reference sequence set. + */ + public Integer getMateReferenceIndex() { + if (mMateReferenceIndex == null) { + if (mMateReferenceName == null) { + mMateReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; + } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mMateReferenceName)){ + mMateReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; + } else { + mMateReferenceIndex = mHeader.getSequenceIndex(mMateReferenceName); + } + } + return mMateReferenceIndex; + } + + /** + * @param referenceIndex Must either equal -1 (indicating no reference), or exist in the sequence dictionary + * in the header associated with this record. + */ + public void setMateReferenceIndex(final int referenceIndex) { + mMateReferenceIndex = referenceIndex; + if (mMateReferenceIndex == NO_ALIGNMENT_REFERENCE_INDEX) { + mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; + } else { + mMateReferenceName = mHeader.getSequence(referenceIndex).getSequenceName(); + } + } + + /** + * @return 1-based inclusive leftmost position of the clippped sequence, or 0 if there is no position. + */ + public int getAlignmentStart() { + return mAlignmentStart; + } + + /** + * @param value 1-based inclusive leftmost position of the clippped sequence, or 0 if there is no position. + */ + public void setAlignmentStart(final int value) { + mAlignmentStart = value; + // Clear cached alignment end + mAlignmentEnd = NO_ALIGNMENT_START; + // Change to alignmentStart could change indexing bin + setIndexingBin(null); + } + + /** + * @return 1-based inclusive rightmost position of the clippped sequence, or 0 read if unmapped. + */ + public int getAlignmentEnd() { + if (getReadUnmappedFlag()) { + return NO_ALIGNMENT_START; + } + else if (this.mAlignmentEnd == NO_ALIGNMENT_START) { + this.mAlignmentEnd = mAlignmentStart + getCigar().getReferenceLength() - 1; + } + + return this.mAlignmentEnd; + } + + /** + * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read + * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) + * then this method will return 96. + * + * Invalid to call on an unmapped read. + */ + public int getUnclippedStart() { + int pos = getAlignmentStart(); + + for (final CigarElement cig : getCigar().getCigarElements()) { + final CigarOperator op = cig.getOperator(); + if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { + pos -= cig.getLength(); + } + else { + break; + } + } + + return pos; + } + + /** + * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read + * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) + * then this method will return 107. + * + * Invalid to call on an unmapped read. + */ + public int getUnclippedEnd() { + int pos = getAlignmentEnd(); + final List cigs = getCigar().getCigarElements(); + for (int i=cigs.size() - 1; i>=0; --i) { + final CigarElement cig = cigs.get(i); + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { + pos += cig.getLength(); + } + else { + break; + } + } + + return pos; + } + + /** + * Unsupported. This property is derived from alignment start and CIGAR. + */ + public void setAlignmentEnd(final int value) { + throw new UnsupportedOperationException("Not supported: setAlignmentEnd"); + } + + /** + * @return 1-based inclusive leftmost position of the clippped mate sequence, or 0 if there is no position. + */ + public int getMateAlignmentStart() { + return mMateAlignmentStart; + } + + public void setMateAlignmentStart(final int mateAlignmentStart) { + this.mMateAlignmentStart = mateAlignmentStart; + } + + /** + * @return insert size (difference btw 5' end of read & 5' end of mate), if possible, else 0. + * Negative if mate maps to lower position than read. + */ + public int getInferredInsertSize() { + return mInferredInsertSize; + } + + public void setInferredInsertSize(final int inferredInsertSize) { + this.mInferredInsertSize = inferredInsertSize; + } + + /** + * @return phred scaled mapping quality. 255 implies valid mapping but quality is hard to compute. + */ + public int getMappingQuality() { + return mMappingQuality; + } + + public void setMappingQuality(final int value) { + mMappingQuality = value; + } + + public String getCigarString() { + if (mCigarString == null && getCigar() != null) { + mCigarString = TextCigarCodec.getSingleton().encode(getCigar()); + } + return mCigarString; + } + + public void setCigarString(final String value) { + mCigarString = value; + mCigar = null; + mAlignmentBlocks = null; + // Clear cached alignment end + mAlignmentEnd = NO_ALIGNMENT_START; + // Change to cigar could change alignmentEnd, and thus indexing bin + setIndexingBin(null); + } + + /** + * Do not modify the value returned by this method. If you want to change the Cigar, create a new + * Cigar and call setCigar() or call setCigarString() + * @return Cigar object for the read, or null if there is none. + */ + public Cigar getCigar() { + if (mCigar == null && mCigarString != null) { + mCigar = TextCigarCodec.getSingleton().decode(mCigarString); + if (getValidationStringency() != SAMFileReader.ValidationStringency.SILENT && !this.getReadUnmappedFlag()) { + // Don't know line number, and don't want to force read name to be decoded. + SAMUtils.processValidationErrors(validateCigar(-1L), -1L, getValidationStringency()); + } + } + return mCigar; + } + + /** + * This method is preferred over getCigar().getNumElements(), because for BAMRecord it may be faster. + * @return number of cigar elements (number + operator) in the cigar string. + */ + public int getCigarLength() { + return getCigar().numCigarElements(); + } + + public void setCigar(final Cigar cigar) { + initializeCigar(cigar); + // Change to cigar could change alignmentEnd, and thus indexing bin + setIndexingBin(null); + } + + /** + * For setting the Cigar string when BAMRecord has decoded it. Use this rather than setCigar() + * so that indexing bin doesn't get clobbered. + */ + protected void initializeCigar(final Cigar cigar) { + this.mCigar = cigar; + mCigarString = null; + mAlignmentBlocks = null; + // Clear cached alignment end + mAlignmentEnd = NO_ALIGNMENT_START; + } + + /** + * Get the SAMReadGroupRecord for this SAMRecord. + * @return The SAMReadGroupRecord from the SAMFileHeader for this SAMRecord, or null if + * 1) this record has no RG tag, or 2) the header doesn't contain the read group with + * the given ID. + * @throws NullPointerException if this.getHeader() returns null. + * @throws ClassCastException if RG tag does not have a String value. + */ + public SAMReadGroupRecord getReadGroup() { + final String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); + if (rgId == null) { + return null; + } + return getHeader().getReadGroup(rgId); + } + + /** + * It is preferrable to use the get*Flag() methods that handle the flag word symbolically. + */ + public int getFlags() { + return mFlags; + } + + public void setFlags(final int value) { + mFlags = value; + // Could imply change to readUnmapped flag, which could change indexing bin + setIndexingBin(null); + } + + /** + * the read is paired in sequencing, no matter whether it is mapped in a pair. + */ + public boolean getReadPairedFlag() { + return (mFlags & READ_PAIRED_FLAG) != 0; + } + + private void requireReadPaired() { + if (!getReadPairedFlag()) { + throw new IllegalStateException("Inappropriate call if not paired read"); + } + } + + /** + * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment). + */ + public boolean getProperPairFlag() { + requireReadPaired(); + return getProperPairFlagUnchecked(); + } + + private boolean getProperPairFlagUnchecked() { + return (mFlags & PROPER_PAIR_FLAG) != 0; + } + + /** + * the query sequence itself is unmapped. + */ + public boolean getReadUnmappedFlag() { + return (mFlags & READ_UNMAPPED_FLAG) != 0; + } + + /** + * the mate is unmapped. + */ + public boolean getMateUnmappedFlag() { + requireReadPaired(); + return getMateUnmappedFlagUnchecked(); + } + + private boolean getMateUnmappedFlagUnchecked() { + return (mFlags & MATE_UNMAPPED_FLAG) != 0; + } + + /** + * strand of the query (false for forward; true for reverse strand). + */ + public boolean getReadNegativeStrandFlag() { + return (mFlags & READ_STRAND_FLAG) != 0; + } + + /** + * strand of the mate (false for forward; true for reverse strand). + */ + public boolean getMateNegativeStrandFlag() { + requireReadPaired(); + return getMateNegativeStrandFlagUnchecked(); + } + + private boolean getMateNegativeStrandFlagUnchecked() { + return (mFlags & MATE_STRAND_FLAG) != 0; + } + + /** + * the read is the first read in a pair. + */ + public boolean getFirstOfPairFlag() { + requireReadPaired(); + return getFirstOfPairFlagUnchecked(); + } + + private boolean getFirstOfPairFlagUnchecked() { + return (mFlags & FIRST_OF_PAIR_FLAG) != 0; + } + + /** + * the read is the second read in a pair. + */ + public boolean getSecondOfPairFlag() { + requireReadPaired(); + return getSecondOfPairFlagUnchecked(); + } + + private boolean getSecondOfPairFlagUnchecked() { + return (mFlags & SECOND_OF_PAIR_FLAG) != 0; + } + + /** + * the alignment is not primary (a read having split hits may have multiple primary alignment records). + */ + public boolean getNotPrimaryAlignmentFlag() { + return (mFlags & NOT_PRIMARY_ALIGNMENT_FLAG) != 0; + } + + /** + * the read fails platform/vendor quality checks. + */ + public boolean getReadFailsVendorQualityCheckFlag() { + return (mFlags & READ_FAILS_VENDOR_QUALITY_CHECK_FLAG) != 0; + } + + /** + * the read is either a PCR duplicate or an optical duplicate. + */ + public boolean getDuplicateReadFlag() { + return (mFlags & DUPLICATE_READ_FLAG) != 0; + } + + /** + * the read is paired in sequencing, no matter whether it is mapped in a pair. + */ + public void setReadPairedFlag(final boolean flag) { + setFlag(flag, READ_PAIRED_FLAG); + } + + /** + * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment). + */ + public void setProperPairFlag(final boolean flag) { + setFlag(flag, PROPER_PAIR_FLAG); + } + + /** + * the query sequence itself is unmapped. + */ + public void setReadUmappedFlag(final boolean flag) { + setFlag(flag, READ_UNMAPPED_FLAG); + // Change to readUnmapped could change indexing bin + setIndexingBin(null); + } + + /** + * the mate is unmapped. + */ + public void setMateUnmappedFlag(final boolean flag) { + setFlag(flag, MATE_UNMAPPED_FLAG); + } + + /** + * strand of the query (false for forward; true for reverse strand). + */ + public void setReadNegativeStrandFlag(final boolean flag) { + setFlag(flag, READ_STRAND_FLAG); + } + + /** + * strand of the mate (false for forward; true for reverse strand). + */ + public void setMateNegativeStrandFlag(final boolean flag) { + setFlag(flag, MATE_STRAND_FLAG); + } + + /** + * the read is the first read in a pair. + */ + public void setFirstOfPairFlag(final boolean flag) { + setFlag(flag, FIRST_OF_PAIR_FLAG); + } + + /** + * the read is the second read in a pair. + */ + public void setSecondOfPairFlag(final boolean flag) { + setFlag(flag, SECOND_OF_PAIR_FLAG); + } + + /** + * the alignment is not primary (a read having split hits may have multiple primary alignment records). + */ + public void setNotPrimaryAlignmentFlag(final boolean flag) { + setFlag(flag, NOT_PRIMARY_ALIGNMENT_FLAG); + } + + /** + * the read fails platform/vendor quality checks. + */ + public void setReadFailsVendorQualityCheckFlag(final boolean flag) { + setFlag(flag, READ_FAILS_VENDOR_QUALITY_CHECK_FLAG); + } + + /** + * the read is either a PCR duplicate or an optical duplicate. + */ + public void setDuplicateReadFlag(final boolean flag) { + setFlag(flag, DUPLICATE_READ_FLAG); + } + + private void setFlag(final boolean flag, final int bit) { + if (flag) { + mFlags |= bit; + } else { + mFlags &= ~bit; + } + } + + public SAMFileReader.ValidationStringency getValidationStringency() { + return mValidationStringency; + } + + /** + * Control validation of lazily-decoded elements. + */ + public void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + /** + * Get the value for a SAM tag. + * WARNING: Some value types (e.g. byte[]) are mutable. It is dangerous to change one of these values in + * place, because some SAMRecord implementations keep track of when attributes have been changed. If you + * want to change an attribute value, call setAttribute() to replace the value. + * + * @param tag Two-character tag name. + * @return Appropriately typed tag value, or null if the requested tag is not present. + */ + public final Object getAttribute(final String tag) { + return getAttribute(SAMTagUtil.getSingleton().makeBinaryTag(tag)); + } + + /** + * Get the tag value and attempt to coerce it into the requested type. + * @param tag The requested tag. + * @return The value of a tag, converted into an Integer if possible. + * @throws RuntimeException If the value is not an integer type, or will not fit in an Integer. + */ + public final Integer getIntegerAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof Integer) { + return (Integer)val; + } + if (!(val instanceof Number)) { + throw new RuntimeException("Value for tag " + tag + " is not Number: " + val.getClass()); + } + final long longVal = ((Number)val).longValue(); + if (longVal < Integer.MIN_VALUE || longVal > Integer.MAX_VALUE) { + throw new RuntimeException("Value for tag " + tag + " is not in Integer range: " + longVal); + } + return (int)longVal; + } + + /** + * Get the tag value and attempt to coerce it into the requested type. + * @param tag The requested tag. + * @return The value of a tag, converted into a Short if possible. + * @throws RuntimeException If the value is not an integer type, or will not fit in a Short. + */ + public final Short getShortAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof Short) { + return (Short)val; + } + if (!(val instanceof Number)) { + throw new RuntimeException("Value for tag " + tag + " is not Number: " + val.getClass()); + } + final long longVal = ((Number)val).longValue(); + if (longVal < Short.MIN_VALUE || longVal > Short.MAX_VALUE) { + throw new RuntimeException("Value for tag " + tag + " is not in Short range: " + longVal); + } + return (short)longVal; + } + + /** + * Get the tag value and attempt to coerce it into the requested type. + * @param tag The requested tag. + * @return The value of a tag, converted into a Byte if possible. + * @throws RuntimeException If the value is not an integer type, or will not fit in a Byte. + */ + public final Byte getByteAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof Byte) { + return (Byte)val; + } + if (!(val instanceof Number)) { + throw new RuntimeException("Value for tag " + tag + " is not Number: " + val.getClass()); + } + final long longVal = ((Number)val).longValue(); + if (longVal < Byte.MIN_VALUE || longVal > Byte.MAX_VALUE) { + throw new RuntimeException("Value for tag " + tag + " is not in Short range: " + longVal); + } + return (byte)longVal; + } + + public final String getStringAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof String) { + return (String)val; + } + throw new SAMException("Value for tag " + tag + " is not a String: " + val.getClass()); + } + + public final Character getCharacterAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof Character) { + return (Character)val; + } + throw new SAMException("Value for tag " + tag + " is not a Character: " + val.getClass()); + } + + public final Float getFloatAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof Float) { + return (Float)val; + } + throw new SAMException("Value for tag " + tag + " is not a Float: " + val.getClass()); + } + + public final byte[] getByteArrayAttribute(final String tag) { + final Object val = getAttribute(tag); + if (val == null) return null; + if (val instanceof byte[]) { + return (byte[])val; + } + throw new SAMException("Value for tag " + tag + " is not a byte[]: " + val.getClass()); + } + + protected Object getAttribute(final short tag) { + if (mAttributes == null) { + return null; + } + for (final SAMBinaryTagAndValue tagAndValue : mAttributes) { + if (tagAndValue.tag == tag) { + return tagAndValue.value; + } + } + return null; + } + + /** + * Set a named attribute onto the SAMRecord. Passing a null value causes the attribute to be cleared. + * @param tag two-character tag name. See http://samtools.sourceforge.net/SAM1.pdf for standard and user-defined tags. + * @param value Supported types are String, Char, Integer, Float, byte[]. + * If value == null, tag is cleared. + * + * Byte and Short are allowed but discouraged. If written to a SAM file, these will be converted to Integer, + * whereas if written to BAM, getAttribute() will return as Byte or Short, respectively. + * + * Long with value between 0 and MAX_UINT is allowed for BAM but discouraged. Attempting to write such a value + * to SAM will cause an exception to be thrown. + */ + final public void setAttribute(final String tag, final Object value) { + setAttribute(SAMTagUtil.getSingleton().makeBinaryTag(tag), value); + } + + protected void setAttribute(final short tag, final Object value) { + if (value != null && + !(value instanceof Byte || value instanceof Short || value instanceof Integer || + value instanceof String || value instanceof Character || value instanceof Float || + value instanceof byte[])) { + throw new SAMException("Attribute type " + value.getClass() + " not supported. Tag: " + + SAMTagUtil.getSingleton().makeStringTag(tag)); + } + if (mAttributes == null) { + mAttributes = new ArrayList(); + } + int i; + for (i = 0; i < mAttributes.size(); ++i) { + if (mAttributes.get(i).tag == tag) { + break; + } + } + if (i < mAttributes.size()) { + if (value != null) { + mAttributes.set(i, new SAMBinaryTagAndValue(tag, value)); + } else { + mAttributes.remove(i); + } + } else if (value != null) { + mAttributes.add(new SAMBinaryTagAndValue(tag, value)); + } + } + + /** + * Removes all attributes. + */ + public void clearAttributes() { + mAttributes.clear(); + } + + /** + * Replace any existing attributes with the given list. Does not copy the list + * but installs it directly. + */ + protected void setAttributes(final List attributes) { + mAttributes = attributes; + } + /** + * @return List of all tags on this record. Returns null if there are no tags. + */ + protected List getBinaryAttributes() { + if (mAttributes == null || mAttributes.isEmpty()) { + return Collections.emptyList(); + } + return Collections.unmodifiableList(mAttributes); + } + + /** + * Tag name and value of an attribute, for getAttributes() method. + */ + public static class SAMTagAndValue { + public final String tag; + public final Object value; + + public SAMTagAndValue(final String tag, final Object value) { + this.tag = tag; + this.value = value; + } + } + + /** + * @return list of {tag, value} tuples + */ + public final List getAttributes() { + final List binaryAttributes = getBinaryAttributes(); + final List ret = new ArrayList(binaryAttributes.size()); + for (final SAMBinaryTagAndValue tagAndValue : binaryAttributes) { + ret.add(new SAMTagAndValue(SAMTagUtil.getSingleton().makeStringTag(tagAndValue.tag), + tagAndValue.value)); + } + return ret; + } + + Integer getIndexingBin() { + return mIndexingBin; + } + + /** + * Used internally when writing BAMRecords. + * @param mIndexingBin c.f. http://samtools.sourceforge.net/SAM1.pdf + */ + void setIndexingBin(final Integer mIndexingBin) { + this.mIndexingBin = mIndexingBin; + } + + /** + * Does not change state of this. + * @return indexing bin based on alignment start & end. + */ + int computeIndexingBin() { + // reg2bin has zero-based, half-open API + final int alignmentStart = getAlignmentStart()-1; + int alignmentEnd = getAlignmentEnd(); + if (alignmentEnd <= 0) { + // If alignment end cannot be determined (e.g. because this read is not really aligned), + // then treat this as a one base alignment for indexing purposes. + alignmentEnd = alignmentStart + 1; + } + return SAMUtils.reg2bin(alignmentStart, alignmentEnd); + } + + public SAMFileHeader getHeader() { + return mHeader; + } + + /** + * Setting header into SAMRecord facilitates conversion btw reference sequence names and indices + * @param header contains sequence dictionary for this SAMRecord + */ + public void setHeader(final SAMFileHeader header) { + this.mHeader = header; + } + + /** + * If this record has a valid binary representation of the variable-length portion of a binary record stored, + * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true + * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length + * portion has been changed. + */ + public byte[] getVariableBinaryRepresentation() { + return null; + } + + /** + * Depending on the concrete implementation, the binary file size of attributes may be known without + * computing them all. + * @return binary file size of attribute, if known, else -1 + */ + public int getAttributesBinarySize() { + return -1; + } + + public String format() { + final StringBuilder buffer = new StringBuilder(); + addField(buffer, getReadName(), null, null); + addField(buffer, getFlags(), null, null); + addField(buffer, getReferenceName(), null, "*"); + addField(buffer, getAlignmentStart(), 0, "*"); + addField(buffer, getMappingQuality(), 0, "0"); + addField(buffer, getCigarString(), null, "*"); + addField(buffer, getMateReferenceName(), null, "*"); + addField(buffer, getMateAlignmentStart(), 0, "*"); + addField(buffer, getInferredInsertSize(), 0, "*"); + addField(buffer, getReadString(), null, "*"); + addField(buffer, getBaseQualityString(), null, "*"); + if (mAttributes != null) { + for (final SAMBinaryTagAndValue entry : getBinaryAttributes()) { + addField(buffer, formatTagValue(entry.tag, entry.value)); + } + } + return buffer.toString(); + } + + private void addField(final StringBuilder buffer, final Object value, final Object defaultValue, final String defaultString) { + if (safeEquals(value, defaultValue)) { + addField(buffer, defaultString); + } else if (value == null) { + addField(buffer, ""); + } else { + addField(buffer, value.toString()); + } + } + + private void addField(final StringBuilder buffer, final String field) { + if (buffer.length() > 0) { + buffer.append('\t'); + } + buffer.append(field); + } + + private String formatTagValue(final short tag, final Object value) { + final String tagString = SAMTagUtil.getSingleton().makeStringTag(tag); + if (value == null || value instanceof String) { + return tagString + ":Z:" + value; + } else if (value instanceof Integer || value instanceof Long || + value instanceof Short || value instanceof Byte) { + return tagString + ":i:" + value; + } else if (value instanceof Character) { + return tagString + ":A:" + value; + } else if (value instanceof Float) { + return tagString + ":f:" + value; + } else if (value instanceof byte[]) { + return tagString + ":H:" + StringUtil.bytesToHexString((byte[]) value); + } else { + throw new RuntimeException("Unexpected value type for tag " + tagString + + ": " + value + " of class " + value.getClass().getName()); + } + } + + private boolean safeEquals(final Object o1, final Object o2) { + if (o1 == o2) { + return true; + } else if (o1 == null || o2 == null) { + return false; + } else { + return o1.equals(o2); + } + } + + /** + * Force all lazily-initialized data members to be initialized. If a subclass overrides this method, + * typically it should also call super method. + */ + protected void eagerDecode() { + getCigar(); + getCigarString(); + } + + /** + * Returns blocks of the read sequence that have been aligned directly to the + * reference sequence. Note that clipped portions of the read and inserted and + * deleted bases (vs. the reference) are not represented in the alignment blocks. + */ + public List getAlignmentBlocks() { + if (this.mAlignmentBlocks != null) return this.mAlignmentBlocks; + + final Cigar cigar = getCigar(); + if (cigar == null) return Collections.emptyList(); + + + final List alignmentBlocks = new ArrayList(); + int readBase = 1; + int refBase = getAlignmentStart(); + + for (final CigarElement e : cigar.getCigarElements()) { + switch (e.getOperator()) { + case H : break; // ignore hard clips + case P : break; // ignore pads + case S : readBase += e.getLength(); break; // soft clip read bases + case N : refBase += e.getLength(); break; // reference skip + case D : refBase += e.getLength(); break; + case I : readBase += e.getLength(); break; + case M : + case EQ : + case X : + final int length = e.getLength(); + alignmentBlocks.add(new AlignmentBlock(readBase, refBase, length)); + readBase += length; + refBase += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + } + } + this.mAlignmentBlocks = Collections.unmodifiableList(alignmentBlocks); + + return this.mAlignmentBlocks; + } + + /** + * Run all validations of CIGAR. These include validation that the CIGAR makes sense independent of + * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. + * @param recordNumber For error reporting. -1 if not known. + * @return List of errors, or null if no errors. + */ + public List validateCigar(final long recordNumber) { + List ret = null; + + if (getValidationStringency() != SAMFileReader.ValidationStringency.SILENT && !this.getReadUnmappedFlag()) { + // Don't know line number, and don't want to force read name to be decoded. + ret = getCigar().isValid(getReadName(), recordNumber); + if (getReferenceIndex() != NO_ALIGNMENT_REFERENCE_INDEX) { + final SAMSequenceRecord sequence = getHeader().getSequence(getReferenceIndex()); + final int referenceSequenceLength = sequence.getSequenceLength(); + for (final AlignmentBlock alignmentBlock : getAlignmentBlocks()) { + if (alignmentBlock.getReferenceStart() + alignmentBlock.getLength() - 1 > referenceSequenceLength) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.CIGAR_MAPS_OFF_REFERENCE, + "CIGAR M operator maps off end of reference", getReadName(), recordNumber)); + break; + } + } + } + } + return ret; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof SAMRecord)) return false; + + final SAMRecord samRecord = (SAMRecord) o; + + // First check all the elements that do not require decoding + if (mAlignmentStart != samRecord.mAlignmentStart) return false; + if (mFlags != samRecord.mFlags) return false; + if (mInferredInsertSize != samRecord.mInferredInsertSize) return false; + if (mMappingQuality != samRecord.mMappingQuality) return false; + if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false; + if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null) + return false; + if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null) + return false; + if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null) + return false; + + eagerDecode(); + samRecord.eagerDecode(); + + if (mAttributes != null ? !mAttributes.equals(samRecord.mAttributes) : samRecord.mAttributes != null) + return false; + if (!Arrays.equals(mBaseQualities, samRecord.mBaseQualities)) return false; + if (mCigar != null ? !mCigar.equals(samRecord.mCigar) : samRecord.mCigar != null) + return false; + if (mMateReferenceName != null ? !mMateReferenceName.equals(samRecord.mMateReferenceName) : samRecord.mMateReferenceName != null) + return false; + if (!Arrays.equals(mReadBases, samRecord.mReadBases)) return false; + if (mReadName != null ? !mReadName.equals(samRecord.mReadName) : samRecord.mReadName != null) return false; + if (mReferenceName != null ? !mReferenceName.equals(samRecord.mReferenceName) : samRecord.mReferenceName != null) + return false; + + return true; + } + + @Override + public int hashCode() { + eagerDecode(); + int result = mReadName != null ? mReadName.hashCode() : 0; + result = 31 * result + (mReadBases != null ? Arrays.hashCode(mReadBases) : 0); + result = 31 * result + (mBaseQualities != null ? Arrays.hashCode(mBaseQualities) : 0); + result = 31 * result + (mReferenceName != null ? mReferenceName.hashCode() : 0); + result = 31 * result + mAlignmentStart; + result = 31 * result + mMappingQuality; + result = 31 * result + (mCigarString != null ? mCigarString.hashCode() : 0); + result = 31 * result + mFlags; + result = 31 * result + (mMateReferenceName != null ? mMateReferenceName.hashCode() : 0); + result = 31 * result + mMateAlignmentStart; + result = 31 * result + mInferredInsertSize; + result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); + result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0); + result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0); + result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0); + return result; + } + + /** + * Perform various validations of SAMRecord. + * Note that this method deliberately returns null rather than Collections.emptyList() if there + * are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable. + * @return null if valid. If invalid, returns a list of error messages. + */ + public List isValid() { + // ret is only instantiate if there are errors to report, in order to reduce GC in the typical case + // in which everything is valid. It's ugly, but more efficient. + ArrayList ret = null; + if (!getReadPairedFlag()) { + if (getProperPairFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); + } + if (getMateUnmappedFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mate unmapped flag should not be set for unpaired read.", getReadName())); + } + if (getMateNegativeStrandFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_NEG_STRAND, "Mate negative strand flag should not be set for unpaired read.", getReadName())); + } + if (getFirstOfPairFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR, "First of pair flag should not be set for unpaired read.", getReadName())); + } + if (getSecondOfPairFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR, "First of pair flag should not be set for unpaired read.", getReadName())); + } + if (getMateReferenceIndex() != NO_ALIGNMENT_REFERENCE_INDEX) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MATE_REF_INDEX, "MRNM should not be set for unpaired read.", getReadName())); + } + } else { + final List errors = isValidReferenceIndexAndPosition(mMateReferenceIndex, mMateReferenceName, + getMateAlignmentStart(), true); + if (errors != null) { + if (ret == null) ret = new ArrayList(); + ret.addAll(errors); + } + if (!hasMateReferenceName() && !getMateUnmappedFlag()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mapped mate should have mate reference name", getReadName())); + } +/* + TODO: PIC-97 This validation should be enabled, but probably at this point there are too many + BAM files that have the proper pair flag set when read or mate is unmapped. + if (getMateUnmappedFlag() && getProperPairFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); + } +*/ + } + if (getInferredInsertSize() > MAX_INSERT_SIZE || getInferredInsertSize() < -MAX_INSERT_SIZE) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INSERT_SIZE, "Insert size out of range", getReadName())); + } + if (getReadUnmappedFlag()) { + if (getNotPrimaryAlignmentFlag()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_NOT_PRIM_ALIGNMENT, "Not primary alignment flag should not be set for unmapped read.", getReadName())); + } + if (getMappingQuality() != 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ must should be 0 for unmapped read.", getReadName())); + } + if (getCigarLength() != 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR should have zero elements for unmapped read.", getReadName())); + } +/* + TODO: PIC-97 This validation should be enabled, but probably at this point there are too many + BAM files that have the proper pair flag set when read or mate is unmapped. + if (getProperPairFlagUnchecked()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unmapped read.", getReadName())); + } +*/ + } else { + if (getMappingQuality() >= 256) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be < 256.", getReadName())); + } + if (getCigarLength() == 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR should have > zero elements for mapped read.", getReadName())); + } + if (getHeader().getSequenceDictionary().size() == 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.MISSING_SEQUENCE_DICTIONARY, "Empty sequence dictionary.", getReadName())); + } + if (!hasReferenceName()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED, "Mapped read should have valid reference name", getReadName())); + } +/* + Oops! We know this is broken in older BAM files, so this having this validation will cause all sorts of + problems! + if (getIndexingBin() != null && getIndexingBin() != computeIndexingBin()) { + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, + "Indexing bin (" + getIndexingBin() + ") does not agree with computed value (" + computeIndexingBin() + ")", + getReadName())); + + } +*/ + } + // Validate the RG ID is found in header + final String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); + if (rgId != null && getHeader().getReadGroup(rgId) == null) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.READ_GROUP_NOT_FOUND, + "RG ID on SAMRecord not found in header: " + rgId, getReadName())); + } + final List errors = isValidReferenceIndexAndPosition(mReferenceIndex, mReferenceName, getAlignmentStart(), false); + if (errors != null) { + if (ret == null) ret = new ArrayList(); + ret.addAll(errors); + } + if (this.getReadLength() == 0) { + String cq = (String)getAttribute(SAMTagUtil.getSingleton().CQ); + String cs = (String)getAttribute(SAMTagUtil.getSingleton().CS); + if (cq == null || cq.length() == 0 || cs == null || cs.length() == 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, + "Zero-length read without CS or CQ tag", getReadName())); + } else if (!getReadUnmappedFlag()) { + boolean hasIndel = false; + for (CigarElement cigarElement : getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.DELETION || + cigarElement.getOperator() == CigarOperator.INSERTION) { + hasIndel = true; + break; + } + } + if (!hasIndel) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, + "Colorspace read with zero-length bases but no indel", getReadName())); + } + } + } + if (this.getReadLength() != getBaseQualities().length && !Arrays.equals(getBaseQualities(), NULL_QUALS)) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, + "Read length does not match quals length", getReadName())); + } + if (ret == null || ret.size() == 0) { + return null; + } + return ret; + } + + /** + * Gets the reader that read this SAM file. + * @return + */ + public SAMFileReader getReader() { + return mReader; + } + + /** + * Sets the reader that read this SAM file. Protected access only. + * @param reader Reader which retrieved this file. + */ + protected void setReader(SAMFileReader reader) { + mReader = reader; + } + + /** + * Gets the position of this record, stored on the filesystem. + * @return Chunk indicating the physical position. + */ + public Chunk getCoordinates() { + return coordinates; + } + + /** + * Internal function: sets the coordin + * @param coordinates Where this file is actually sitting on the disk. + */ + protected void setCoordinates(final Chunk coordinates) { + this.coordinates = coordinates; + } + + private List isValidReferenceIndexAndPosition(final Integer referenceIndex, final String referenceName, + final int alignmentStart, final boolean isMate) { + final boolean hasReference = hasReferenceName(referenceIndex, referenceName); + + // ret is only instantiate if there are errors to report, in order to reduce GC in the typical case + // in which everything is valid. It's ugly, but more efficient. + ArrayList ret = null; + if (!hasReference) { + if (alignmentStart != 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should be 0 because reference name = *.", isMate), getReadName())); + } + } else { + if (alignmentStart == 0) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should != 0 because reference name != *.", isMate), getReadName())); + } + + if (getHeader().getSequenceDictionary().size() > 0) { + final SAMSequenceRecord sequence = + (referenceIndex != null? getHeader().getSequence(referenceIndex): getHeader().getSequence(referenceName)); + if (sequence == null) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_REFERENCE_INDEX, buildMessage("Reference sequence not found in sequence dictionary.", isMate), getReadName())); + } else { + if (alignmentStart > sequence.getSequenceLength()) { + if (ret == null) ret = new ArrayList(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start (" + alignmentStart + ") must be <= reference sequence length (" + + sequence.getSequenceLength() + ") on reference " + sequence.getSequenceName(), isMate), getReadName())); + } + } + } + } + return ret; + } + + private String buildMessage(final String baseMessage, final boolean isMate) { + return isMate ? "Mate " + baseMessage : baseMessage; + } + + /** + * Note that this does a shallow copy of everything, except for the attribute list, for which a copy of the list + * is made, but the attributes themselves are copied by reference. This should be safe because callers should + * never modify a mutable value returned by any of the get() methods anyway. + */ + @Override + public Object clone() throws CloneNotSupportedException { + final SAMRecord newRecord = (SAMRecord)super.clone(); + if (mAttributes != null) { + newRecord.mAttributes = (ArrayList)((ArrayList)mAttributes).clone(); + } + return newRecord; + } + + /** Simple toString() that gives a little bit of useful info about the read. */ + @Override + public String toString() { + StringBuilder builder = new StringBuilder(64); + builder.append(getReadName()); + if (getReadPairedFlag()) { + if (getFirstOfPairFlag()) { + builder.append(" 1/2"); + } + else { + builder.append(" 2/2"); + } + } + + builder.append(" "); + builder.append(String.valueOf(getReadLength())); + builder.append("b"); + + if (getReadUnmappedFlag()) { + builder.append(" unmapped read."); + } + else { + builder.append(" aligned read."); + } + + return builder.toString(); + } +} + diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java index cf87aa46d..5e83ea330 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Map; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; /** * A common interface for shards that natively understand the BAM format. @@ -21,7 +22,7 @@ public interface BAMFormatAwareShard extends Shard { * Get the list of chunks delimiting this shard. * @return a list of chunks that contain data for this shard. */ - public Map> getChunks(); + public Map> getChunks(); /** * Returns true if this shard is meant to buffer reads, rather @@ -30,6 +31,12 @@ public interface BAMFormatAwareShard extends Shard { */ public boolean buffersReads(); + /** + * Checks to see whether the buffer is empty. + * @return True if the buffer is empty. + */ + public boolean isBufferEmpty(); + /** * Returns true if the read buffer is currently full. * @return True if this shard's buffer is full (and the shard can buffer reads). diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java index 74e066bb3..eec05c936 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; /** * Expresses a shard of read data in block format. @@ -27,7 +28,7 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware /** * The data backing the next chunks to deliver to the traversal engine. */ - private final Map> chunks; + private final Map> chunks; /** * The reads making up this shard. @@ -45,7 +46,7 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware */ private final Shard.ShardType shardType; - public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks, SamRecordFilter filter, Shard.ShardType shardType) { + public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks, SamRecordFilter filter, Shard.ShardType shardType) { this.sourceInfo = sourceInfo; this.chunks = chunks; this.filter = filter; @@ -61,6 +62,14 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware return true; } + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { + return reads.size() == 0; + } + /** * Returns true if the read buffer is currently full. * @return True if this shard's buffer is full (and the shard can buffer reads). @@ -95,7 +104,7 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware * Get the list of chunks delimiting this shard. * @return a list of chunks that contain data for this shard. */ - public Map> getChunks() { + public Map> getChunks() { return Collections.unmodifiableMap(chunks); } @@ -114,7 +123,7 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware @Override public String toString() { StringBuilder sb = new StringBuilder(); - for(Map.Entry> entry: chunks.entrySet()) { + for(Map.Entry> entry: chunks.entrySet()) { sb.append(entry.getKey()); sb.append(": "); for(Chunk chunk : entry.getValue()) { diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java index 6a5591913..1deb0dbfd 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; /** * A read shard strategy that delimits based on the number of @@ -28,18 +29,15 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { */ protected final BlockDrivenSAMDataSource dataSource; + private Shard nextShard = null; + /** our storage of the genomic locations they'd like to shard over */ private final List filePointers = new ArrayList(); /** * Position of the last shard in the file. */ - private Map position; - - /** - * True if the set of SAM readers is at the end of the stream. False otherwise. - */ - private boolean atEndOfStream = false; + private Map position; /** * Create a new read shard strategy, loading read shards from the given BAM file. @@ -53,6 +51,7 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { this.position = this.dataSource.getCurrentPosition(); if(locations != null) filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList(),this.dataSource.getNumIndexLevels()-1)); + advance(); } /** @@ -60,7 +59,7 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { * @return True if any more data is available. False otherwise. */ public boolean hasNext() { - return !atEndOfStream; + return nextShard != null; } /** @@ -70,48 +69,61 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { */ public Shard next() { if(!hasNext()) - throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard."); + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } - Map> shardPosition = null; + public void advance() { + Map> shardPosition = null; + nextShard = null; SamRecordFilter filter = null; if(!filePointers.isEmpty()) { - boolean foundData = false; for(FilePointer filePointer: filePointers) { shardPosition = dataSource.getFilePointersBounding(filePointer.bin); - for(SAMFileReader2 reader: shardPosition.keySet()) { - List chunks = shardPosition.get(reader); - Chunk filePosition = position.get(reader); - for(Chunk chunk: chunks) { - if(filePosition.getChunkStart() > chunk.getChunkEnd()) - chunks.remove(chunk); - else { - if(filePosition.getChunkStart() > chunk.getChunkStart()) - chunk.setChunkStart(filePosition.getChunkStart()); - foundData = true; - } + + Map> selectedReaders = new HashMap>(); + for(SAMReaderID id: shardPosition.keySet()) { + List chunks = shardPosition.get(id); + List selectedChunks = new ArrayList(); + Chunk filePosition = position.get(id); + for(Chunk chunk: chunks) + if(filePosition.getChunkStart() <= chunk.getChunkStart()) + selectedChunks.add(chunk); + else if(filePosition.getChunkStart() > chunk.getChunkStart() && filePosition.getChunkStart() < chunk.getChunkEnd()) { + selectedChunks.add(new Chunk(filePosition.getChunkStart(),chunk.getChunkEnd())); } + if(selectedChunks.size() > 0) + selectedReaders.put(id,selectedChunks); } - if(foundData) { - filter = new ReadOverlapFilter(filePointer.locations); - break; + if(selectedReaders.size() > 0) { + filter = new ReadOverlapFilter(filePointer.locations); + + BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),selectedReaders,filter,Shard.ShardType.READ); + dataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } } } } else { // TODO: This level of processing should not be necessary. - shardPosition = new HashMap>(); - for(Map.Entry entry: position.entrySet()) + shardPosition = new HashMap>(); + for(Map.Entry entry: position.entrySet()) shardPosition.put(entry.getKey(),Collections.singletonList(entry.getValue())); filter = null; + + BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,filter,Shard.ShardType.READ); + dataSource.fillShard(shard); + nextShard = !shard.isBufferEmpty() ? shard : null; } - BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,filter,Shard.ShardType.READ); - atEndOfStream = dataSource.fillShard(shard); - this.position = dataSource.getCurrentPosition(); - - return shard; } /** @@ -128,4 +140,5 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { public Iterator iterator() { return this; } + } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java index 389dd7001..3ed9230a7 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShard.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.shards; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import net.sf.samtools.Chunk; import net.sf.samtools.SAMFileReader2; import net.sf.samtools.SAMRecord; @@ -44,7 +45,7 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa /** * A list of the chunks associated with this shard. */ - private final Map> chunks; + private final Map> chunks; /** * An IndexDelimitedLocusShard can be used either for LOCUS or LOCUS_INTERVAL shard types. @@ -58,7 +59,7 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa * @param chunks Chunks associated with that interval. * @param shardType Type of the shard; must be either LOCUS or LOCUS_INTERVAL. */ - IndexDelimitedLocusShard(List intervals, Map> chunks, ShardType shardType) { + IndexDelimitedLocusShard(List intervals, Map> chunks, ShardType shardType) { super(intervals); this.chunks = chunks; if(shardType != ShardType.LOCUS && shardType != ShardType.LOCUS_INTERVAL) @@ -70,7 +71,7 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa * Gets the chunks associated with this locus shard. * @return A list of the chunks to use when retrieving locus data. */ - public Map> getChunks() { + public Map> getChunks() { return chunks; } @@ -81,6 +82,12 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa */ public boolean buffersReads() { return false; } + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { throw new UnsupportedOperationException("This shard does not buffer reads."); } + /** * Returns true if the read buffer is currently full. * @return True if this shard's buffer is full (and the shard can buffer reads). diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index 5d4c9537c..afec23638 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import java.util.*; @@ -110,7 +111,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { */ public IndexDelimitedLocusShard next() { FilePointer nextFilePointer = filePointerIterator.next(); - Map> chunksBounding = dataSource!=null ? dataSource.getFilePointersBounding(nextFilePointer.bin) : null; + Map> chunksBounding = dataSource!=null ? dataSource.getFilePointersBounding(nextFilePointer.bin) : null; return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index ac8281634..e81728cd0 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -43,6 +43,11 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { */ private final Map mergedReadGroupMappings = new HashMap(); + /** + * How far along is each reader? + */ + private final Map readerPositions = new HashMap(); + /** * Create a new block-aware SAM data source given the supplied read metadata. * @param reads The read metadata. @@ -55,10 +60,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); + initializeReaderPositions(readers); + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers.values(),SAMFileHeader.SortOrder.coordinate,true); mergedHeader = headerMerger.getMergedHeader(); hasReadGroupCollisions = headerMerger.hasReadGroupCollisions(); + // cache the read group id (original) -> read group id (merged) mapping. for(SAMReaderID id: readerIDs) { SAMFileReader reader = readers.getReader(id); ReadGroupMapping mapping = new ReadGroupMapping(); @@ -109,15 +117,15 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @param bin The bin for which to load data. * @return A map of the file pointers bounding the bin. */ - public Map> getFilePointersBounding(Bin bin) { + public Map> getFilePointersBounding(Bin bin) { SAMReaders readers = resourcePool.getReadersWithoutLocking(); - Map> filePointers = new HashMap>(); - for(SAMFileReader reader: readers) { - SAMFileReader2 reader2 = (SAMFileReader2)reader; + Map> filePointers = new HashMap>(); + for(SAMReaderID id: getReaderIDs()) { + SAMFileReader2 reader2 = (SAMFileReader2)readers.getReader(id); if(bin != null) - filePointers.put(reader2,reader2.getFilePointersBounding(bin)); + filePointers.put(id,reader2.getFilePointersBounding(bin)); else - filePointers.put(reader2,Collections.emptyList()); + filePointers.put(id,Collections.emptyList()); } return filePointers; } @@ -126,14 +134,8 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. */ - public Map getCurrentPosition() { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - Map currentPositions = new HashMap(); - for(SAMFileReader reader: readers) { - SAMFileReader2 reader2 = (SAMFileReader2)reader; - currentPositions.put(reader2,reader2.getCurrentPosition()); - } - return currentPositions; + public Map getCurrentPosition() { + return readerPositions; } /** @@ -200,7 +202,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @param shard Shard to fill. * @return true if at the end of the stream. False otherwise. */ - public boolean fillShard(BAMFormatAwareShard shard) { + public void fillShard(BAMFormatAwareShard shard) { if(!shard.buffersReads()) throw new StingException("Attempting to fill a non-buffering shard."); @@ -208,17 +210,40 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { // know why this is. Please add a comment here if you do. boolean enableVerification = shard instanceof ReadShard; - CloseableIterator iterator = getIterator(shard,enableVerification); - while(!shard.isBufferFull() && iterator.hasNext()) - shard.addRead(iterator.next()); + SAMReaders readers = resourcePool.getAvailableReaders(); - boolean atEndOfStream = !iterator.hasNext(); + CloseableIterator iterator = getIterator(readers,shard,enableVerification); + while(!shard.isBufferFull() && iterator.hasNext()) { + SAMRecord read = iterator.next(); + Chunk endChunk = new Chunk(read.getCoordinates().getChunkEnd(),Long.MAX_VALUE); + shard.addRead(read); + readerPositions.put(getReaderID(readers,read),endChunk); + } iterator.close(); - - return atEndOfStream; } + /** + * Gets the reader associated with the given read. + * @param readers Available readers. + * @param read + * @return + */ + private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { + for(SAMReaderID id: getReaderIDs()) { + if(readers.getReader(id) == read.getReader()) + return id; + } + throw new StingException("Unable to find id for reader associated with read " + read.getReadName()); + } + + private void initializeReaderPositions(SAMReaders readers) { + for(SAMReaderID id: getReaderIDs()) { + SAMFileReader2 reader2 = (SAMFileReader2)readers.getReader(id); + readerPositions.put(id,reader2.getCurrentPosition()); + } + } + public StingSAMIterator seek(Shard shard) { if(!(shard instanceof BAMFormatAwareShard)) throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass()); @@ -228,20 +253,19 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return bamAwareShard.iterator(); } else { - // Since the beginning of time for the GATK, enableVerification has been true only for ReadShards. I don't - // know why this is. Please add a comment here if you do. - boolean enableVerification = shard instanceof ReadShard; - return getIterator(bamAwareShard,enableVerification); + SAMReaders readers = resourcePool.getAvailableReaders(); + + // Since the beginning of time for the GATK, enableVerification has been true only for ReadShards, because + // + return getIterator(readers,bamAwareShard,shard instanceof ReadShard); } } - private StingSAMIterator getIterator(BAMFormatAwareShard shard, boolean enableVerification) { - SAMReaders readers = resourcePool.getAvailableReaders(); - + private StingSAMIterator getIterator(SAMReaders readers, BAMFormatAwareShard shard, boolean enableVerification) { Map> readerToIteratorMap = new HashMap>(); - for(SAMFileReader reader: readers) { - SAMFileReader2 reader2 = (SAMFileReader2)reader; - List chunks = shard.getChunks().get(reader2); + for(SAMReaderID id: getReaderIDs()) { + SAMFileReader2 reader2 = (SAMFileReader2)readers.getReader(id); + List chunks = shard.getChunks().get(id); readerToIteratorMap.put(reader2,reader2.iterator(chunks)); } @@ -372,6 +396,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { /** * Retrieve the reader from the data structure. * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. */ public SAMFileReader getReader(SAMReaderID id) { if(!readers.containsKey(id)) @@ -379,17 +404,6 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return readers.get(id); } - /** - * Convenience method to get the header associated with an individual ID. - * @param id ID for which to retrieve the header. - * @return Header for this SAM file. - */ - public SAMFileHeader getHeader(SAMReaderID id) { - if(!readers.containsKey(id)) - throw new NoSuchElementException("No reader is associated with id " + id); - return readers.get(id).getFileHeader(); - } - /** * Returns an iterator over all readers in this structure. * @return An iterator over readers. diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 9fb6f8c10..8c3d3066e 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -4,12 +4,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import net.sf.samtools.AlignmentBlock; -import net.sf.samtools.Cigar; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMValidationError; +import net.sf.samtools.*; /** * @author ebanks @@ -332,6 +327,8 @@ public class GATKSAMRecord extends SAMRecord { public void setValidationStringency(net.sf.samtools.SAMFileReader.ValidationStringency validationStringency) { mRecord.setValidationStringency(validationStringency); } + public SAMFileReader getReader() { return mRecord.getReader(); } + public SAMFileHeader getHeader() { return mRecord.getHeader(); } public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); } @@ -355,4 +352,6 @@ public class GATKSAMRecord extends SAMRecord { public Object clone() throws CloneNotSupportedException { return mRecord.clone(); } public String toString() { return mRecord.toString(); } + + public Chunk getCoordinates() { return mRecord.getCoordinates(); } }