gatk-3.8/java/lib/edu/mit/broad/sam/BAMFileReader.java

318 lines
12 KiB
Java

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.sam;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.BlockCompressedInputStream;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.util.StringLineReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Internal class for reading and querying BAM files.
 *
 * A reader is backed either by an arbitrary {@link InputStream}, in which case records can only be
 * iterated sequentially, or by a {@link File}, in which case iteration can be restarted from the
 * first record and, once an index has been supplied through {@link #setFileIndex}, regions can be
 * queried. Only one iterator may be outstanding at a time; it must be closed before another
 * iterator or query is requested.
 */
class BAMFileReader
    extends SAMFileReader.ReaderImplementation {

    /** True when the reader was created from a File, i.e. seeking is permitted. */
    private boolean mIsSeekable = false;
    /** Decoder for the uncompressed BAM byte stream; null once the reader has been closed. */
    private BinaryCodec mStream = null;
    /** The block-compressed stream underneath mStream; used directly for seeking and file pointers. */
    private final BlockCompressedInputStream mCompressedInputStream;
    /** Header decoded by readHeader(); null once the reader has been closed. */
    private SAMFileHeader mFileHeader = null;
    /** Index used by query(); must be supplied by the caller through setFileIndex(). */
    private BAMFileIndex mFileIndex = null;
    /** File pointer of the first alignment record; only recorded for file-based readers. */
    private long mFirstRecordPointer = 0;
    /** The iterator currently handed out, or null when no iteration is in progress. */
    private CloseableIterator<SAMRecord> mCurrentIterator = null;
    /** When true, every record is fully decoded as soon as it is read. */
    private final boolean eagerDecode;

    /**
     * Creates a reader over a stream. The resulting reader is not seekable, so it cannot be queried.
     * If reading the header fails the stream is left open, since it belongs to the caller.
     *
     * @param stream source of block-compressed BAM data
     * @param eagerDecode if true, records are fully decoded as they are read
     * @throws IOException if the header cannot be read or does not start with the BAM magic bytes
     */
    BAMFileReader(final InputStream stream, final boolean eagerDecode)
        throws IOException {
        mIsSeekable = false;
        mCompressedInputStream = new BlockCompressedInputStream(stream);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        this.eagerDecode = eagerDecode;
        readHeader(null);
    }

    /**
     * Creates a seekable reader over a file and remembers where the first record starts so that
     * iteration can be restarted later.
     *
     * @param file the BAM file to read
     * @param eagerDecode if true, records are fully decoded as they are read
     * @throws IOException if the header cannot be read or does not start with the BAM magic bytes
     */
    BAMFileReader(final File file, final boolean eagerDecode)
        throws IOException {
        mIsSeekable = true;
        mCompressedInputStream = new BlockCompressedInputStream(file);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        this.eagerDecode = eagerDecode;
        boolean initialized = false;
        try {
            readHeader(file);
            mFirstRecordPointer = mCompressedInputStream.getFilePointer();
            initialized = true;
        } finally {
            if (!initialized) {
                // The stream was opened here and the caller never receives an instance on which
                // to call close(), so it has to be released before the failure propagates.
                closeStreamAfterFailedOpen();
            }
        }
    }

    /**
     * Releases the stream when construction fails. Any secondary failure raised while closing is
     * dropped so that the original header error is the one reported to the caller.
     */
    private void closeStreamAfterFailedOpen() {
        try {
            mStream.close();
        } catch (RuntimeException ignored) {
            // Deliberately ignored: the header failure already in flight is the meaningful error.
        }
    }

    /**
     * Closes the underlying stream and drops the header and index. Calling this more than once is
     * harmless. After closing, getIterator() and query() throw IllegalStateException.
     */
    void close() {
        if (mStream != null) {
            mStream.close();
        }
        mStream = null;
        mFileHeader = null;
        mFileIndex = null;
    }

    /**
     * @return the index previously supplied through setFileIndex(), or null if there is none
     */
    BAMFileIndex getFileIndex() {
        return mFileIndex;
    }

    /**
     * Supplies the index that query() uses to locate records.
     *
     * @param fileIndex the index for this BAM file
     */
    void setFileIndex(final BAMFileIndex fileIndex) {
        mFileIndex = fileIndex;
    }

    /**
     * @return the header read when the reader was constructed, or null after close()
     */
    SAMFileHeader getFileHeader() {
        return mFileHeader;
    }

    /**
     * Currently this is ignored for BAM reading. Always do strict validation.
     *
     * @param validationStringency requested stringency; not used
     */
    void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
    }

    /**
     * Starts iterating over the records. For a file-based reader iteration always restarts at the
     * first record; for a stream-based reader it continues from the current stream position. The
     * first record is read before this method returns.
     *
     * @return an iterator that must be closed before another iterator or query is requested
     * @throws IllegalStateException if the reader is closed or another iteration is in progress
     * @throws RuntimeException wrapping an IOException if seeking or reading fails
     */
    CloseableIterator<SAMRecord> getIterator() {
        if (mStream == null) {
            throw new IllegalStateException("File reader is closed");
        }
        if (mCurrentIterator != null) {
            throw new IllegalStateException("Iteration in progress");
        }
        if (mIsSeekable) {
            try {
                mCompressedInputStream.seek(mFirstRecordPointer);
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }
        mCurrentIterator = new BAMFileIterator();
        return mCurrentIterator;
    }

    /**
     * Starts iterating over the records of one reference sequence that fall in a region.
     *
     * @param sequence name of the reference sequence; an unknown name yields an empty iterator
     * @param start first position of the region; 0 together with an open end selects everything
     * @param end last position of the region; a value of 0 or less means "to the end of the sequence"
     * @param contained if true only records lying entirely inside the region are returned,
     *                  otherwise every record overlapping the region is returned
     * @return an iterator that must be closed before another iterator or query is requested
     * @throws IllegalStateException if the reader is closed, another iteration is in progress,
     *                               or no index has been supplied
     * @throws UnsupportedOperationException if the reader was created from a stream
     */
    CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
        if (mStream == null) {
            throw new IllegalStateException("File reader is closed");
        }
        if (mCurrentIterator != null) {
            throw new IllegalStateException("Iteration in progress");
        }
        if (!mIsSeekable) {
            throw new UnsupportedOperationException("Cannot query stream-based BAM file");
        }
        if (mFileIndex == null) {
            throw new IllegalStateException("No BAM file index is available");
        }
        mCurrentIterator = new BAMFileIndexIterator(sequence, start, end, contained);
        return mCurrentIterator;
    }

    /**
     * Reads the magic bytes, the text header and the binary sequence dictionary, leaving the stream
     * positioned just after the dictionary. When the text header declares sequences they are checked
     * against the binary dictionary; otherwise the binary dictionary is installed into the header.
     *
     * @param file the file being read, used for header decoding and error messages; may be null
     * @throws IOException if the magic bytes are wrong or the stream cannot be read
     * @throws SAMFormatException if a length field is negative or the text and binary
     *                            dictionaries disagree
     */
    private void readHeader(final File file)
        throws IOException {
        final byte[] buffer = new byte[4];
        mStream.readBytes(buffer);
        if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
            throw new IOException("Invalid BAM file header");
        }

        final int headerTextLength = mStream.readInt();
        if (headerTextLength < 0) {
            // A corrupt length would otherwise surface as an unrelated low-level failure
            throw new SAMFormatException("Invalid BAM file header: negative text header length (" +
                                         headerTextLength + ") in file " + file);
        }
        final String textHeader = mStream.readString(headerTextLength);
        mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader),
                                                      file);

        final int sequenceCount = mStream.readInt();
        if (sequenceCount < 0) {
            throw new SAMFormatException("Invalid BAM file header: negative sequence count (" +
                                         sequenceCount + ") in file " + file);
        }
        if (mFileHeader.getSequences().size() > 0) {
            // It is allowed to have binary sequences but no text sequences, so only validate if both are present
            if (sequenceCount != mFileHeader.getSequences().size()) {
                throw new SAMFormatException("Number of sequences in text header (" + mFileHeader.getSequences().size() +
                                             ") != number of sequences in binary header (" + sequenceCount + ") for file " + file);
            }
            for (int i = 0; i < sequenceCount; i++) {
                final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(file);
                final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
                if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
                    throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
                                                 file);
                }
                if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
                    throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
                                                 file);
                }
            }
        } else {
            // If only binary sequences are present, copy them into mFileHeader
            final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
            for (int i = 0; i < sequenceCount; i++) {
                sequences.add(readSequenceRecord(file));
            }
            mFileHeader.setSequences(sequences);
        }
    }

    /**
     * Reads one entry of the binary sequence dictionary: a length-prefixed, null-terminated name
     * followed by the sequence length.
     *
     * @param file the file being read, used only in the error message; may be null
     * @return a record carrying the name and length that were read
     * @throws SAMFormatException if the stored name length leaves no room for a name
     */
    private SAMSequenceRecord readSequenceRecord(final File file) {
        // The stored length includes the null terminator, so anything <= 1 means an empty name
        final int nameLength = mStream.readInt();
        if (nameLength <= 1) {
            throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file);
        }
        final String sequenceName = mStream.readString(nameLength - 1);
        // Skip the null terminator
        mStream.readByte();
        final int sequenceLength = mStream.readInt();
        final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName);
        record.setSequenceLength(sequenceLength);
        return record;
    }

    /**
     * Sequential iterator over the records at the reader's current stream position. It always holds
     * the next record to be returned, so each call to next() reads one record ahead.
     */
    private class BAMFileIterator
        implements CloseableIterator<SAMRecord> {

        /** The record the next call to next() will return; null when iteration is exhausted. */
        private SAMRecord mNextRecord = null;
        private final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(getFileHeader());

        /** Creates the iterator and immediately reads the first record. */
        BAMFileIterator() {
            this(true);
        }

        /**
         * @param advance if true the first record is read immediately; subclasses pass false so
         *                that they can finish their own initialization before the first read
         */
        BAMFileIterator(final boolean advance) {
            this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
            if (advance) {
                advance();
            }
        }

        /**
         * Ends this iteration so that the reader can hand out a new iterator.
         *
         * @throws IllegalStateException if this is not the reader's current iterator
         */
        public void close() {
            if (this != mCurrentIterator) {
                throw new IllegalStateException("Attempt to close non-current iterator");
            }
            mCurrentIterator = null;
        }

        public boolean hasNext() {
            return (mNextRecord != null);
        }

        /**
         * Returns the buffered record and reads the one after it.
         *
         * @return the next record, or null (rather than an exception) when iteration is exhausted
         * @throws RuntimeException wrapping an IOException if reading ahead fails
         */
        public SAMRecord next() {
            final SAMRecord result = mNextRecord;
            advance();
            return result;
        }

        public void remove() {
            throw new UnsupportedOperationException("Not supported: remove");
        }

        /**
         * Loads the following record into mNextRecord, decoding it fully when eager decoding was
         * requested. I/O failures are rethrown unchecked because the iterator methods cannot
         * declare checked exceptions.
         */
        void advance() {
            try {
                mNextRecord = getNextRecord();
                if (eagerDecode && mNextRecord != null) {
                    mNextRecord.eagerDecode();
                }
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }

        /**
         * Reads the next record from the stream. Subclasses override this to filter records.
         *
         * @return whatever the record codec decodes next; hasNext() treats null as end of iteration
         * @throws IOException if reading fails
         */
        SAMRecord getNextRecord()
            throws IOException {
            return bamRecordCodec.decode();
        }
    }

    /**
     * Iterator over the records of one reference sequence that fall within a region. It visits the
     * file ranges reported by the index and filters the records found there.
     */
    private class BAMFileIndexIterator
        extends BAMFileIterator {

        /**
         * File ranges to scan, consumed two entries at a time as (start, end) file pointers; null when
         * there is nothing (more) to scan. NOTE(review): assumes the index always returns an even
         * number of entries - confirm against BAMFileIndex.getSearchBins.
         */
        private long[] mFilePointers = null;
        /** Position in mFilePointers of the next range to scan. */
        private int mFilePointerIndex = 0;
        /** End of the range being scanned; -1 forces the first range to be loaded. */
        private long mFilePointerLimit = -1;
        /** Index of the queried reference sequence, or -1 if the header does not contain it. */
        private int mReferenceIndex = -1;
        private int mRegionStart = 0;
        private int mRegionEnd = 0;
        /** True to return only fully contained records, false to return all overlapping ones. */
        private boolean mReturnContained = false;

        /**
         * @param sequence name of the reference sequence to query
         * @param start first position of the region
         * @param end last position of the region; a value of 0 or less is treated as unbounded
         * @param contained true to return only records fully inside the region
         */
        BAMFileIndexIterator(final String sequence, final int start, final int end, final boolean contained) {
            // The fields above are only initialized after the superclass constructor returns, so
            // the first read has to wait until they have been set up below.
            super(false); // delay advance() until after construction
            final SAMFileHeader fileHeader = getFileHeader();
            mReferenceIndex = fileHeader.getSequenceIndex(sequence);
            if (mReferenceIndex != -1) {
                final BAMFileIndex fileIndex = getFileIndex();
                mFilePointers = fileIndex.getSearchBins(mReferenceIndex, start, end);
            }
            mRegionStart = start;
            mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
            mReturnContained = contained;
            advance();
        }

        /**
         * Scans forward through the indexed file ranges and returns the next record that belongs
         * to the queried sequence and passes the region filter.
         *
         * @return the next matching record, or null when no further record can match
         * @throws IOException if seeking or reading fails
         */
        SAMRecord getNextRecord()
            throws IOException {
            while (true) {
                // Advance to next file block if necessary
                while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) {
                    if (mFilePointers == null ||
                        mFilePointerIndex >= mFilePointers.length) {
                        return null;
                    }
                    final long startOffset = mFilePointers[mFilePointerIndex++];
                    final long endOffset = mFilePointers[mFilePointerIndex++];
                    mCompressedInputStream.seek(startOffset);
                    mFilePointerLimit = endOffset;
                }
                // Pull next record from stream
                final SAMRecord record = super.getNextRecord();
                if (record == null) {
                    return null;
                }
                // If beyond the end of this reference sequence, end iteration
                final int referenceIndex = record.getReferenceIndex();
                if (referenceIndex != mReferenceIndex) {
                    if (referenceIndex < 0 ||
                        referenceIndex > mReferenceIndex) {
                        // Dropping the ranges makes every later call return null immediately
                        mFilePointers = null;
                        return null;
                    }
                    // If before this reference sequence, continue
                    continue;
                }
                if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
                    // Quick exit to avoid expensive alignment end calculation
                    return record;
                }
                final int alignmentStart = record.getAlignmentStart();
                final int alignmentEnd = record.getAlignmentEnd();
                if (alignmentStart > mRegionEnd) {
                    // If scanned beyond target region, end iteration
                    mFilePointers = null;
                    return null;
                }
                // Filter for overlap with region
                if (mReturnContained) {
                    if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
                        return record;
                    }
                } else {
                    if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
                        return record;
                    }
                }
            }
        }
    }
}