Updates to chunk iteration. Includes the return of the dreaded *2.java files;
hopefully I can find a way to kill these off before the Picard patch is ready. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2550 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fcce77c245
commit
7893aaefe9
|
|
@ -1,15 +1,9 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.StringLineReader;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.SeekableStream;
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Arrays;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.File;
|
||||
|
|
@ -22,6 +16,11 @@ import java.io.File;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class BAMBlockIterator implements CloseableIterator<Block> {
|
||||
/**
|
||||
* The input stream for back files.
|
||||
*/
|
||||
private FileInputStream inputStream;
|
||||
|
||||
/**
|
||||
* File channel from which to read chunks.
|
||||
*/
|
||||
|
|
@ -35,13 +34,23 @@ public class BAMBlockIterator implements CloseableIterator<Block> {
|
|||
/**
|
||||
* Iterate through the BAM chunks in a file.
|
||||
* @param file stream File to use when accessing the BAM.
|
||||
* @throws IOException in case of problems reading from the file.
|
||||
*/
|
||||
/**
 * Iterate through the BAM blocks in a file.
 * @param file File to use when accessing the BAM.
 * @throws IOException in case of problems opening the file.
 */
public BAMBlockIterator(File file) throws IOException {
    // Open the file exactly once and store the stream in the field so that
    // close() can release it.  The previous version opened the file twice:
    // the first FileInputStream was leaked and a shadowing local kept the
    // field null, making close() fail.
    inputStream = new FileInputStream(file);
    blockReader = new BlockReader(inputStream);
}
|
||||
|
||||
/**
|
||||
* Close the existing file.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new PicardException("Unable to close file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -59,7 +68,7 @@ public class BAMBlockIterator implements CloseableIterator<Block> {
|
|||
/**
|
||||
* Get the next chunk from the iterator.
|
||||
* @return The next chunk.
|
||||
* @throws NoSuchElementException if no next chunk is available.
|
||||
*/
|
||||
public Block next() {
|
||||
if(!hasNext())
|
||||
|
|
|
|||
|
|
@ -0,0 +1,87 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Adapts a list of BAM blocks to BAM chunks.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BAMChunkIterator implements Iterator<Chunk> {
|
||||
/**
|
||||
* The wrapped chunk iterator.
|
||||
*/
|
||||
private final BAMBlockIterator blockIterator;
|
||||
|
||||
/**
|
||||
* List of prefetched segments. If prefetchedSegments is
|
||||
* empty, this should mean that there's nothing left in the queue.
|
||||
*/
|
||||
private Queue<BlockSegment> prefetchedSegments;
|
||||
|
||||
/**
|
||||
* List of chunks to filter out of the final chunk list.
|
||||
*/
|
||||
private final Queue<Chunk> filters;
|
||||
|
||||
/**
|
||||
* Creates a new chunk iterator wrapping the given block, optionally filtering
|
||||
* out a list of chunks from those blocks.
|
||||
* TODO: This class is too smart. Wrap filtering functionality in another class.
|
||||
* @param blockIterator block iterator to wrap.
|
||||
* @param filters List of chunks to filter out of the given input stream.
|
||||
*/
|
||||
public BAMChunkIterator(BAMBlockIterator blockIterator, List<Chunk> filters) {
|
||||
this.blockIterator = blockIterator;
|
||||
this.prefetchedSegments = new LinkedList<BlockSegment>();
|
||||
this.filters = new PriorityQueue<Chunk>(filters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is a next chunk available.
|
||||
* @return true if a next chunk is available; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return !prefetchedSegments.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next chunk in the list.
|
||||
* @return Next block, adapted to a chunk.
|
||||
*/
|
||||
public Chunk next() {
|
||||
if(prefetchedSegments.isEmpty())
|
||||
throw new NoSuchElementException("BAMChunkIterator is exhausted.");
|
||||
// TODO: Maybe tie together adjacent chunks together?
|
||||
Chunk nextChunk = prefetchedSegments.remove().toChunk();
|
||||
seedNextSegments();
|
||||
return nextChunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a BAM chunk iterator");
|
||||
}
|
||||
|
||||
/**
|
||||
* Seed the next value or set of values for the iterator, if necessary.
|
||||
*/
|
||||
private void seedNextSegments() {
|
||||
while(prefetchedSegments.isEmpty() && blockIterator.hasNext()) {
|
||||
// Add the new segments to the prefetch...
|
||||
prefetchedSegments.add(new BlockSegment(blockIterator.next()));
|
||||
|
||||
// And iteratively subtract the filtering chunks.
|
||||
for(Chunk filter: filters) {
|
||||
Queue<BlockSegment> filteredBlockSegments = new LinkedList<BlockSegment>();
|
||||
for(BlockSegment blockSegment: prefetchedSegments)
|
||||
filteredBlockSegments.addAll(blockSegment.minus(filter));
|
||||
prefetchedSegments = filteredBlockSegments;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,630 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.StringLineReader;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.net.URL;
|
||||
|
||||
/**
|
||||
* Internal class for reading and querying BAM files.
|
||||
*/
|
||||
class BAMFileReader2
|
||||
extends SAMFileReader2.ReaderImplementation2 {
|
||||
// True if reading from a File rather than an InputStream
|
||||
private boolean mIsSeekable = false;
|
||||
// For converting bytes into other primitive types
|
||||
private BinaryCodec mStream = null;
|
||||
// Underlying compressed data stream.
|
||||
private final BlockCompressedInputStream mCompressedInputStream;
|
||||
private SAMFileHeader mFileHeader = null;
|
||||
// Populated if the file is seekable and an index exists
|
||||
private BAMFileIndex mFileIndex = null;
|
||||
private long mFirstRecordPointer = 0;
|
||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||
// If true, all SAMRecords are fully decoded as they are read.
|
||||
private final boolean eagerDecode;
|
||||
// For error-checking.
|
||||
private ValidationStringency mValidationStringency;
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a stream (not seekable)
|
||||
* @param stream source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||
* @param validationStringency Controls how to handle invalid reads or header lines.
|
||||
*/
|
||||
BAMFileReader2(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency)
|
||||
throws IOException {
|
||||
mIsSeekable = false;
|
||||
mCompressedInputStream = new BlockCompressedInputStream(stream);
|
||||
mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
readHeader(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a file (seekable)
|
||||
* @param file source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
|
||||
* @param validationStringency Controls how to handle invalid reads or header lines.
|
||||
*/
|
||||
BAMFileReader2(final File file, final boolean eagerDecode, final ValidationStringency validationStringency)
|
||||
throws IOException {
|
||||
this(new BlockCompressedInputStream(file), eagerDecode, file.getAbsolutePath(), validationStringency);
|
||||
}
|
||||
|
||||
|
||||
BAMFileReader2(final URL url, final boolean eagerDecode, final ValidationStringency validationStringency)
|
||||
throws IOException {
|
||||
this(new BlockCompressedInputStream(url), eagerDecode, url.toString(), validationStringency);
|
||||
}
|
||||
|
||||
private BAMFileReader2(final BlockCompressedInputStream compressedInputStream, final boolean eagerDecode,
|
||||
final String source, final ValidationStringency validationStringency)
|
||||
throws IOException {
|
||||
mIsSeekable = true;
|
||||
mCompressedInputStream = compressedInputStream;
|
||||
mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
readHeader(source);
|
||||
mFirstRecordPointer = mCompressedInputStream.getFilePointer();
|
||||
}
|
||||
|
||||
void close() {
|
||||
if (mStream != null) {
|
||||
mStream.close();
|
||||
}
|
||||
mStream = null;
|
||||
mFileHeader = null;
|
||||
mFileIndex = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the file index, if one exists, else null.
|
||||
*/
|
||||
BAMFileIndex getFileIndex() {
|
||||
return mFileIndex;
|
||||
}
|
||||
|
||||
void setFileIndex(final BAMFileIndex fileIndex) {
|
||||
mFileIndex = fileIndex;
|
||||
}
|
||||
|
||||
SAMFileHeader getFileHeader() {
|
||||
return mFileHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set error-checking level for subsequent SAMRecord reads.
|
||||
*/
|
||||
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
||||
this.mValidationStringency = validationStringency;
|
||||
}
|
||||
|
||||
SAMFileReader.ValidationStringency getValidationStringency() {
|
||||
return this.mValidationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords in file order.
|
||||
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
||||
* that iterator must be closed before getIterator() can be called again.
|
||||
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
||||
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
||||
* done in that situation.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> getIterator() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (mIsSeekable) {
|
||||
try {
|
||||
mCompressedInputStream.seek(mFirstRecordPointer);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
mCurrentIterator = new BAMFileIterator();
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
CloseableIterator<SAMRecord> getIterator(List<Chunk> chunks) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (mIsSeekable) {
|
||||
try {
|
||||
mCompressedInputStream.seek(mFirstRecordPointer);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create an iterator over the given chunk boundaries.
|
||||
mCurrentIterator = new BAMFileIndexIterator(Chunk.toCoordinateArray(chunks));
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords that match the given interval.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
||||
* A value of zero implies the start of the reference sequence.
|
||||
* @param end A value of zero implies the end of the reference sequence.
|
||||
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
||||
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
||||
* @return Iterator for the matching SAMRecords
|
||||
*/
|
||||
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
if (mFileIndex == null) {
|
||||
throw new IllegalStateException("No BAM file index is available");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords with the given alignment start.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Alignment start sought.
|
||||
* @return Iterator for the matching SAMRecords.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
if (mFileIndex == null) {
|
||||
throw new IllegalStateException("No BAM file index is available");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
if (mFileIndex == null) {
|
||||
throw new IllegalStateException("No BAM file index is available");
|
||||
}
|
||||
try {
|
||||
final long startOfLastLinearBin = mFileIndex.getStartOfLastLinearBin();
|
||||
if (startOfLastLinearBin != -1) {
|
||||
mCompressedInputStream.seek(startOfLastLinearBin);
|
||||
} else {
|
||||
// No mapped reads in file, just start at the first read in file.
|
||||
mCompressedInputStream.seek(mFirstRecordPointer);
|
||||
}
|
||||
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
||||
return mCurrentIterator;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the header from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private void readHeader(final String source)
|
||||
throws IOException {
|
||||
|
||||
final byte[] buffer = new byte[4];
|
||||
mStream.readBytes(buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||
throw new IOException("Invalid BAM file header");
|
||||
}
|
||||
|
||||
final int headerTextLength = mStream.readInt();
|
||||
final String textHeader = mStream.readString(headerTextLength);
|
||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||
headerCodec.setValidationStringency(mValidationStringency);
|
||||
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
||||
source);
|
||||
|
||||
final int sequenceCount = mStream.readInt();
|
||||
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
||||
throw new SAMFormatException("Number of sequences in text header (" +
|
||||
mFileHeader.getSequenceDictionary().size() +
|
||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
||||
}
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
||||
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||
source);
|
||||
}
|
||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||
source);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If only binary sequences are present, copy them into mFileHeader
|
||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
sequences.add(readSequenceRecord(source));
|
||||
}
|
||||
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single binary sequence record from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private SAMSequenceRecord readSequenceRecord(final String source) {
|
||||
final int nameLength = mStream.readInt();
|
||||
if (nameLength <= 1) {
|
||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
||||
}
|
||||
final String sequenceName = mStream.readString(nameLength - 1);
|
||||
// Skip the null terminator
|
||||
mStream.readByte();
|
||||
final int sequenceLength = mStream.readInt();
|
||||
return new SAMSequenceRecord(sequenceName, sequenceLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
||||
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
||||
*/
|
||||
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
||||
private SAMRecord mNextRecord = null;
|
||||
private final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(getFileHeader());
|
||||
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
||||
|
||||
BAMFileIterator() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param advance Trick to enable subclass to do more setup before advancing
|
||||
*/
|
||||
BAMFileIterator(final boolean advance) {
|
||||
this.bamRecordCodec.setInputStream(BAMFileReader2.this.mStream.getInputStream());
|
||||
|
||||
if (advance) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
final SAMRecord result = mNextRecord;
|
||||
advance();
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
void advance() {
|
||||
try {
|
||||
mNextRecord = getNextRecord();
|
||||
if (mNextRecord != null) {
|
||||
++this.samRecordIndex;
|
||||
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
||||
mNextRecord.setValidationStringency(mValidationStringency);
|
||||
|
||||
if (mValidationStringency != ValidationStringency.SILENT) {
|
||||
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
||||
SAMUtils.processValidationErrors(validationErrors,
|
||||
this.samRecordIndex, BAMFileReader2.this.getValidationStringency());
|
||||
}
|
||||
}
|
||||
if (eagerDecode && mNextRecord != null) {
|
||||
mNextRecord.eagerDecode();
|
||||
}
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next record from the input stream.
|
||||
*/
|
||||
SAMRecord getNextRecord() throws IOException {
|
||||
return bamRecordCodec.decode();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The record that will be return by the next call to next()
|
||||
*/
|
||||
protected SAMRecord peek() {
|
||||
return mNextRecord;
|
||||
}
|
||||
}
|
||||
|
||||
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
|
||||
|
||||
/**
|
||||
* Creates an iterator over indexed data in the specified range.
|
||||
* @param sequence Sequence to which to constrain the data.
|
||||
* @param start Starting position within the above sequence to which the data should be constrained.
|
||||
* @param end Ending position within the above sequence to which the data should be constrained.
|
||||
* @param queryType Type of query. Useful for establishing the boundary rules.
|
||||
* @return An iterator over the requested data.
|
||||
*/
|
||||
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
||||
final int start,
|
||||
final int end,
|
||||
final QueryType queryType) {
|
||||
long[] filePointers = null;
|
||||
|
||||
// Hit the index to determine the chunk boundaries for the required data.
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMFileIndex fileIndex = getFileIndex();
|
||||
filePointers = fileIndex.getSearchBins(referenceIndex, start, end);
|
||||
}
|
||||
|
||||
// Create an iterator over the above chunk boundaries.
|
||||
BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
||||
|
||||
// Add some preprocessing filters for edge-case reads that don't fit into this
|
||||
// query type.
|
||||
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
||||
}
|
||||
|
||||
private class BAMFileIndexIterator
|
||||
extends BAMFileIterator {
|
||||
|
||||
private long[] mFilePointers = null;
|
||||
private int mFilePointerIndex = 0;
|
||||
private long mFilePointerLimit = -1;
|
||||
|
||||
BAMFileIndexIterator(final long[] filePointers) {
|
||||
super(false); // delay advance() until after construction
|
||||
mFilePointers = filePointers;
|
||||
advance();
|
||||
}
|
||||
|
||||
SAMRecord getNextRecord()
|
||||
throws IOException {
|
||||
while (true) {
|
||||
// Advance to next file block if necessary
|
||||
while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||
if (mFilePointers == null ||
|
||||
mFilePointerIndex >= mFilePointers.length) {
|
||||
return null;
|
||||
}
|
||||
final long startOffset = mFilePointers[mFilePointerIndex++];
|
||||
final long endOffset = mFilePointers[mFilePointerIndex++];
|
||||
mCompressedInputStream.seek(startOffset);
|
||||
mFilePointerLimit = endOffset;
|
||||
}
|
||||
// Pull next record from stream
|
||||
final SAMRecord record = super.getNextRecord();
|
||||
if (record == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A decorating iterator that filters out records that are outside the bounds of the
|
||||
* given query parameters.
|
||||
*/
|
||||
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
* The wrapped iterator.
|
||||
*/
|
||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The next record to be returned. Will be null if no such record exists.
|
||||
*/
|
||||
private SAMRecord nextRead;
|
||||
|
||||
private final int mReferenceIndex;
|
||||
private final int mRegionStart;
|
||||
private final int mRegionEnd;
|
||||
private final QueryType mQueryType;
|
||||
|
||||
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
||||
this.wrappedIterator = iterator;
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
mRegionStart = start;
|
||||
if (queryType == QueryType.STARTING_AT) {
|
||||
mRegionEnd = mRegionStart;
|
||||
} else {
|
||||
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
||||
}
|
||||
mQueryType = queryType;
|
||||
advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a next element exists; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return nextRead != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the given iterator.
|
||||
* @return The next SAM record in the iterator.
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
||||
final SAMRecord currentRead = nextRead;
|
||||
advance();
|
||||
return currentRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes down the existing iterator.
|
||||
*/
|
||||
public void close() {
|
||||
wrappedIterator.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
SAMRecord advance() {
|
||||
while (true) {
|
||||
// Pull next record from stream
|
||||
if(!wrappedIterator.hasNext())
|
||||
return null;
|
||||
|
||||
final SAMRecord record = wrappedIterator.next();
|
||||
// If beyond the end of this reference sequence, end iteration
|
||||
final int referenceIndex = record.getReferenceIndex();
|
||||
if (referenceIndex != mReferenceIndex) {
|
||||
if (referenceIndex < 0 ||
|
||||
referenceIndex > mReferenceIndex) {
|
||||
return null;
|
||||
}
|
||||
// If before this reference sequence, continue
|
||||
continue;
|
||||
}
|
||||
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
||||
// Quick exit to avoid expensive alignment end calculation
|
||||
return record;
|
||||
}
|
||||
final int alignmentStart = record.getAlignmentStart();
|
||||
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
||||
// the query region, regardless of whether the mapped mate will be returned.
|
||||
final int alignmentEnd;
|
||||
if (mQueryType == QueryType.STARTING_AT) {
|
||||
alignmentEnd = -1;
|
||||
} else {
|
||||
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
||||
record.getAlignmentEnd(): alignmentStart);
|
||||
}
|
||||
|
||||
if (alignmentStart > mRegionEnd) {
|
||||
// If scanned beyond target region, end iteration
|
||||
return null;
|
||||
}
|
||||
// Filter for overlap with region
|
||||
if (mQueryType == QueryType.CONTAINED) {
|
||||
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else if (mQueryType == QueryType.OVERLAPPING) {
|
||||
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else {
|
||||
if (alignmentStart == mRegionStart) {
|
||||
return record;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
||||
private BAMFileIndexUnmappedIterator() {
|
||||
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Represents a segment of a given block. A block segment can easily be
|
||||
* converted or compared to either a block or a chunk.
|
||||
*/
|
||||
class BlockSegment {
|
||||
/**
|
||||
* Offset to the start of this block within the file.
|
||||
*/
|
||||
private final long position;
|
||||
|
||||
/**
|
||||
* Starting coordinate within this segment.
|
||||
*/
|
||||
private final long blockStart;
|
||||
|
||||
/**
|
||||
* Last coordinate within this segment.
|
||||
*/
|
||||
private final long blockStop;
|
||||
|
||||
/**
|
||||
* Create a new block segment from the given block.
|
||||
* @param block the block to use as the starting point for this segment.
|
||||
*/
|
||||
public BlockSegment(Block block) {
|
||||
this(block.position,0,block.uncompressedBlockSize-1);
|
||||
}
|
||||
|
||||
BlockSegment(final long position,final long blockStart,final long blockStop) {
|
||||
this.position = position;
|
||||
this.blockStart = blockStart;
|
||||
this.blockStop = blockStop;
|
||||
}
|
||||
|
||||
/**
|
||||
* The size of the block segment.
|
||||
* @return Number of bytes represented by this block segment.
|
||||
*/
|
||||
public long size() {
|
||||
return blockStop - blockStart + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the given segment to a chunk.
|
||||
* @return the chunk equivalent of this block.
|
||||
*/
|
||||
public Chunk toChunk() {
|
||||
return new Chunk(position << 16 & blockStart,position << 16 & blockStop);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this block segment overlap the given chunk?
|
||||
* @param chunk The chunk to test.
|
||||
* @return true if the chunk is w/i the block; false otherwise.
|
||||
*/
|
||||
public boolean overlaps(Chunk chunk) {
|
||||
Chunk segmentAsChunk = toChunk();
|
||||
|
||||
// Chunk is completely encompasses the given block.
|
||||
if(chunk.getChunkStart() < segmentAsChunk.getChunkStart() &&
|
||||
chunk.getChunkEnd() > segmentAsChunk.getChunkEnd())
|
||||
return true;
|
||||
|
||||
// Chunk starts w/i block.
|
||||
if(chunk.getChunkStart() >= segmentAsChunk.getChunkStart() &&
|
||||
chunk.getChunkStart() <= segmentAsChunk.getChunkEnd())
|
||||
return true;
|
||||
|
||||
// Chunk ends w/i block.
|
||||
if(chunk.getChunkEnd() >= segmentAsChunk.getChunkStart() &&
|
||||
chunk.getChunkEnd() <= segmentAsChunk.getChunkEnd())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subtracts the given chunk from the block segment.
|
||||
* @param chunk The chunk to subtract.
|
||||
* @return The block segment(s) derived from subtracting the chunk.
|
||||
*/
|
||||
public List<BlockSegment> minus(Chunk chunk) {
|
||||
// If no overlap exists, just return a new instance of the current block segment.
|
||||
if(!overlaps(chunk))
|
||||
return Collections.singletonList(this);
|
||||
|
||||
List<BlockSegment> differenceSegments = new ArrayList<BlockSegment>();
|
||||
Chunk segmentAsChunk = toChunk();
|
||||
|
||||
// If the segment begins before the chunk, build out a segment from the start of the block to just before
|
||||
// the start of the chunk or the end of the block segment, whichever comes first.
|
||||
if(segmentAsChunk.getChunkStart() < chunk.getChunkStart()) {
|
||||
long segmentStop = (position == chunk.getChunkStart()>>16) ? (chunk.getChunkStart()&0xFFFF)-1 : blockStop;
|
||||
differenceSegments.add(new BlockSegment(position,blockStart,segmentStop));
|
||||
}
|
||||
|
||||
// If the segment ends after the chunk, build out a segment from either after the end of the chunk or the
|
||||
// start of the block segment, whichever comes last.
|
||||
if(segmentAsChunk.getChunkEnd() > chunk.getChunkEnd()) {
|
||||
long segmentStart = (position == chunk.getChunkEnd()>>16) ? (chunk.getChunkEnd()&0xFFFF)+1 : blockStart;
|
||||
differenceSegments.add(new BlockSegment(position,segmentStart,blockStop));
|
||||
}
|
||||
|
||||
return differenceSegments;
|
||||
}
|
||||
}
|
||||
|
|
@ -2,6 +2,7 @@ package net.sf.samtools;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* Test harness for playing with sharding by BGZF block.
|
||||
|
|
@ -30,6 +31,14 @@ public class BlockTestHarness {
|
|||
Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation();
|
||||
System.out.printf("Header location = %s%n", headerLocation);
|
||||
|
||||
BAMChunkIterator chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile),
|
||||
Collections.singletonList(headerLocation));
|
||||
while(chunkIterator.hasNext()) {
|
||||
Chunk chunk = chunkIterator.next();
|
||||
}
|
||||
|
||||
// The following test code blocks a bunch of files together.
|
||||
/*
|
||||
BAMBlockIterator blockIterator = new BAMBlockIterator(bamFile);
|
||||
long blockCount = 0;
|
||||
|
||||
|
|
@ -37,10 +46,11 @@ public class BlockTestHarness {
|
|||
while(blockIterator.hasNext()) {
|
||||
Block block = blockIterator.next();
|
||||
blockCount++;
|
||||
//System.out.println(block);
|
||||
System.out.println(block);
|
||||
}
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.out.printf("Number of chunks: %d; elapsed time: %dms%n", blockCount, endTime-startTime);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Represents a chunk stolen from the BAM file. Originally a private static inner class within
|
||||
* BAMFileIndex; now breaking it out so that the sharding system can use it.
|
||||
|
|
@ -33,6 +38,43 @@ class Chunk implements Comparable<Chunk> {
|
|||
mChunkEnd = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* The list of chunks is often represented as an array of
|
||||
* longs where every even-numbered index is a start coordinate
|
||||
* and every odd-numbered index is a stop coordinate. Convert
|
||||
* from that format back to a list of chunks.
|
||||
* @param coordinateArray List of chunks to convert.
|
||||
* @return A list of chunks.
|
||||
*/
|
||||
public static List<Chunk> toChunkList(long[] coordinateArray) {
|
||||
if(coordinateArray.length % 2 != 0)
|
||||
throw new PicardException("Data supplied does not appear to be in coordinate array format.");
|
||||
|
||||
// TODO: possibly also check for monotonically increasing; this seems to be an implicit requirement of this format.
|
||||
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||
for(int i = 0; i < coordinateArray.length; i += 2)
|
||||
chunkList.add(new Chunk(coordinateArray[i],coordinateArray[i+1]));
|
||||
|
||||
return chunkList;
|
||||
}
|
||||
|
||||
/**
|
||||
* The list of chunks is often represented as an array of
|
||||
* longs where every even-numbered index is a start coordinate
|
||||
* and every odd-numbered index is a stop coordinate.
|
||||
* @param chunks List of chunks to convert.
|
||||
* @return A long array of the format described above.
|
||||
*/
|
||||
public static long[] toCoordinateArray(List<Chunk> chunks) {
|
||||
long[] coordinateArray = new long[chunks.size()*2];
|
||||
int position = 0;
|
||||
for(Chunk chunk: chunks) {
|
||||
coordinateArray[position++] = chunk.getChunkStart();
|
||||
coordinateArray[position++] = chunk.getChunkEnd();
|
||||
}
|
||||
return coordinateArray;
|
||||
}
|
||||
|
||||
public int compareTo(final Chunk chunk) {
|
||||
int result = Long.signum(mChunkStart - chunk.mChunkStart);
|
||||
if (result == 0) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,522 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.IOUtil;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.SAMFileReader.ReaderImplementation;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.List;
|
||||
import java.net.URL;
|
||||
|
||||
|
||||
/**
|
||||
* Class for reading and querying SAM/BAM files. Delegates to appropriate concrete implementation.
|
||||
*/
|
||||
public class SAMFileReader2 implements Iterable<SAMRecord> {

    // Stringency applied to every reader constructed after this point;
    // individual readers may override it via setValidationStringency().
    private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY;

    /**
     * @return The validation stringency applied to newly-constructed readers.
     */
    public static ValidationStringency getDefaultValidationStringency() {
        return defaultValidationStringency;
    }

    /**
     * Set validation stringency for all subsequently-created SAMFileReaders.  This is the only way to
     * change the validation stringency for SAM header.
     */
    public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) {
        SAMFileReader2.defaultValidationStringency = defaultValidationStringency;
    }

    // True once the input has been identified as a binary (BAM) file.
    private boolean mIsBinary = false;
    // Index over the BAM file, or null when absent or not applicable.
    private BAMFileIndex mFileIndex = null;
    // Concrete SAM or BAM reader implementation this class delegates to.
    private ReaderImplementation mReader = null;
    // Source file, or null when the reader was constructed from a stream/URL.
    private File samFile = null;

    /**
     * Internal interface for SAM/BAM file reader implementations.
     * Implemented as an abstract class to enforce better access control.
     */
    static abstract class ReaderImplementation2 extends SAMFileReader.ReaderImplementation {
        // Retrieve an iterator over just the given list of file chunks.
        abstract CloseableIterator<SAMRecord> getIterator(List<Chunk> chunks);
    }

    /**
     * Prepare to read a SAM or BAM file.  Indexed lookup not allowed because reading from InputStream.
     */
    public SAMFileReader2(final InputStream stream) {
        this(stream, false);
    }

    /**
     * Prepare to read a SAM or BAM file.  If the given file is a BAM, and has a companion BAI index file
     * that is named according to the convention, it will be found and opened, and indexed query will be allowed.
     */
    public SAMFileReader2(final File file) {
        this(file, null, false);
    }

    /**
     * Prepare to read a SAM or BAM file.  If the given file is a BAM, and an index is present, indexed query
     * will be allowed.
     *
     * @param file SAM or BAM to read.
     * @param indexFile Index file that is companion to BAM, or null if no index file, or if index file
     * should be found automatically.
     */
    public SAMFileReader2(final File file, final File indexFile) {
        this(file, indexFile, false);
    }

    /**
     * Read a SAM or BAM file.  Indexed lookup not allowed because reading from InputStream.
     *
     * @param stream input SAM or BAM.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader2(final InputStream stream, final boolean eagerDecode) {
        init(stream, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a SAM or BAM file, possibly with an index file if present.
     * If the given file is a BAM, and an index is present, indexed query will be allowed.
     *
     * @param file SAM or BAM.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader2(final File file, final boolean eagerDecode) {
        init(file, null, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a SAM or BAM file, possibly with an index file.  If the given file is a BAM, and an index is present,
     * indexed query will be allowed.
     *
     * @param file SAM or BAM.
     * @param indexFile Location of index file, or null in order to use the default index file (if present).
     * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader2(final File file, final File indexFile, final boolean eagerDecode){
        init(file, indexFile, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a BAM file over http; indexed query will be allowed if an index file is supplied.
     *
     * @param url BAM.
     * @param indexFile Location of index file, or null in order to use the default index file (if present).
     * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader2(final URL url, final File indexFile, final boolean eagerDecode) {
        init(url, indexFile, eagerDecode, defaultValidationStringency);
    }

    /**
     * Close the underlying reader and index, and null them out so the reader
     * cannot be used afterwards.  Safe to call more than once.
     */
    public void close() {
        if (mReader != null) {
            mReader.close();
        }
        if (mFileIndex != null) {
            mFileIndex.close();
        }
        mReader = null;
        mFileIndex = null;
    }

    /**
     * @return True if this is a BAM reader.
     */
    public boolean isBinary() {
        return mIsBinary;
    }

    /**
     * @return true if ths is a BAM file, and has an index
     */
    public boolean hasIndex() {
        return (mFileIndex != null);
    }

    /**
     * @return The header of the underlying SAM/BAM file, via the delegate reader.
     */
    public SAMFileHeader getFileHeader() {
        return mReader.getFileHeader();
    }

    /**
     * Control validation of SAMRecords as they are read from file.
     * In order to control validation stringency for SAM Header, call SAMFileReader.setDefaultValidationStringency
     * before constructing a SAMFileReader.
     */
    public void setValidationStringency(final ValidationStringency validationStringency) {
        mReader.setValidationStringency(validationStringency);
    }

    /**
     * Iterate through file in order.  For a SAMFileReader constructed from an InputStream, and for any SAM file,
     * a 2nd iteration starts where the 1st one left off.  For a BAM constructed from a File, each new iteration
     * starts at the first record.
     * <p/>
     * Only a single open iterator on a SAM or BAM file may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     */
    public CloseableIterator<SAMRecord> iterator() {
        return mReader.getIterator();
    }

    /**
     * Iterate through the given chunks in the file.  Only supported when the
     * delegate reader is a ReaderImplementation2 (i.e. the BAM reader variant
     * that understands chunk-based access).
     * @param chunks List of chunks for which to retrieve data.
     * @return An iterator over the given chunks.
     */
    public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
        // TODO: Add sanity checks so that we're not doing this against a BAM file.
        if(!(mReader instanceof ReaderImplementation2))
            throw new PicardException("This call requires a ReaderImplementation2-compliant interface");
        return ((ReaderImplementation2)mReader).getIterator(chunks);
    }

    /**
     * Iterate over records that match the given interval.  Only valid to call this if hasIndex() == true.
     * <p/>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.  You can use a second SAMFileReader to iterate
     * in parallel over the same underlying file.
     * <p/>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p/>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @param contained If true, each SAMRecord returned is will have its alignment completely contained in the
     * interval of interest. If false, the alignment of the returned SAMRecords need only overlap the interval of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
        return mReader.query(sequence, start, end, contained);
    }

    /**
     * Iterate over records that overlap the given interval.  Only valid to call this if hasIndex() == true.
     * <p/>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p/>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p/>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords overlapping the interval.
     */
    public CloseableIterator<SAMRecord> queryOverlapping(final String sequence, final int start, final int end) {
        return query(sequence, start, end, false);
    }

    /**
     * Iterate over records that are contained in the given interval.  Only valid to call this if hasIndex() == true.
     * <p/>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p/>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p/>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords contained in the interval.
     */
    public CloseableIterator<SAMRecord> queryContained(final String sequence, final int start, final int end) {
        return query(sequence, start, end, true);
    }

    /**
     * Iterate over the unmapped reads, via the delegate reader.
     */
    public CloseableIterator<SAMRecord> queryUnmapped() {
        return mReader.queryUnmapped();
    }

    /**
     * Iterate over records that map to the given sequence and start at the given position.  Only valid to call this if hasIndex() == true.
     * <p/>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p/>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p/>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * matches the arguments.
     *
     * @param sequence Reference sequence of interest.
     * @param start Alignment start of interest.
     * @return Iterator over the SAMRecords with the given alignment start.
     */
    public CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
        return mReader.queryAlignmentStart(sequence, start);
    }

    /**
     * Fetch the mate for the given read.  Only valid to call this if hasIndex() == true.
     * This will work whether the mate has a coordinate or not, so long as the given read has correct
     * mate information.  This method iterates over the SAM file, so there may not be an unclosed
     * iterator on the SAM file when this method is called.
     *
     * @param rec Record for which mate is sought.  Must be a paired read.
     * @return rec's mate, or null if it cannot be found.
     */
    public SAMRecord queryMate(final SAMRecord rec) {
        if (!rec.getReadPairedFlag()) {
            throw new IllegalArgumentException("queryMate called for unpaired read.");
        }
        if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
            throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
        }
        final boolean firstOfPair = rec.getFirstOfPairFlag();
        final CloseableIterator<SAMRecord> it;
        // Unmapped mates have no coordinate; scan the unmapped reads instead of
        // doing a positional query.
        if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
            it = queryUnmapped();
        } else {
            it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
        }
        try {
            SAMRecord mateRec = null;
            while (it.hasNext()) {
                final SAMRecord next = it.next();
                if (!next.getReadPairedFlag()) {
                    // An unpaired read sharing the mate's name indicates a malformed file.
                    if (rec.getReadName().equals(next.getReadName())) {
                        throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
                    }
                    continue;
                }
                // The mate must be the opposite end of the pair from rec.
                if (firstOfPair) {
                    if (next.getFirstOfPairFlag()) continue;
                } else {
                    if (next.getSecondOfPairFlag()) continue;
                }
                if (rec.getReadName().equals(next.getReadName())) {
                    // More than one candidate mate is a format violation.
                    if (mateRec != null) {
                        throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
                                " for " + (firstOfPair ? "second" : "first") + " end.");
                    }
                    mateRec = next;
                }
            }
            return mateRec;
        } finally {
            it.close();
        }
    }

    /**
     * Sniff the stream to determine whether it is BAM, gzipped SAM, or plain
     * SAM text, and instantiate the matching delegate reader.
     */
    private void init(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency) {

        try {
            final BufferedInputStream bufferedStream = IOUtil.toBufferedStream(stream);
            if (isBAMFile(bufferedStream)) {
                mIsBinary = true;
                mReader = new BAMFileReader2(bufferedStream, eagerDecode, validationStringency);
            } else if (isGzippedSAMFile(bufferedStream)) {
                mIsBinary = false;
                mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
            } else if (isSAMFile(bufferedStream)) {
                mIsBinary = false;
                mReader = new SAMTextReader(bufferedStream, validationStringency);
            } else {
                throw new SAMFormatException("Unrecognized file format");
            }
            setValidationStringency(validationStringency);
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Initialize from a URL.  Type detection is done by file extension only,
     * since probing the remote content would be too expensive.
     * @param url remote BAM to read.
     * @param indexFile companion index, or null for no indexed access.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    private void init(final URL url, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {

        try {
            // Its too expensive to examine the remote file to determine type.
            // Rely on file extension.
            if (url.toString().toLowerCase().endsWith(".bam")) {
                mIsBinary = true;
                final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
                mReader = reader;
                if (indexFile != null) {
                    mFileIndex = new BAMFileIndex(indexFile);
                    reader.setFileIndex(mFileIndex);
                }
            } else {
                throw new SAMFormatException("Unrecognized file format: " + url);
            }
            setValidationStringency(validationStringency);
        }
        catch (IOException e) {
            throw new RuntimeIOException(e);
        }

    }


    /**
     * Initialize from a local file: sniff the format, construct the matching
     * delegate reader, and (for BAM) locate and open a companion index.
     */
    private void init(final File file, File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {
        this.samFile = file;

        try {
            final BufferedInputStream bufferedStream = new BufferedInputStream(new FileInputStream(file));
            if (isBAMFile(bufferedStream)) {
                // The BAM reader reopens the file itself; the sniffing stream
                // is no longer needed.
                bufferedStream.close();
                mIsBinary = true;
                final BAMFileReader2 reader = new BAMFileReader2(file, eagerDecode, validationStringency);
                mReader = reader;
                if (indexFile == null) {
                    indexFile = findIndexFile(file);
                }
                if (indexFile != null) {
                    mFileIndex = new BAMFileIndex(indexFile);
                    reader.setFileIndex(mFileIndex);
                    // A stale index may reference the wrong file offsets.
                    if (indexFile.lastModified() < file.lastModified()) {
                        System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +
                                " is older than BAM " + file.getAbsolutePath());
                    }
                }
            } else if (isGzippedSAMFile(bufferedStream)) {
                mIsBinary = false;
                mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
            } else if (isSAMFile(bufferedStream)) {
                if (indexFile != null) {
                    bufferedStream.close();
                    throw new RuntimeException("Cannot use index file with textual SAM file");
                }
                mIsBinary = false;
                mReader = new SAMTextReader(bufferedStream, file, validationStringency);
            } else {
                bufferedStream.close();
                throw new SAMFormatException("Unrecognized file format");
            }
            setValidationStringency(validationStringency);
        }
        catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }


    /**
     * Look for BAM index file according to standard naming convention.
     *
     * @param dataFile BAM file name.
     * @return Index file name, or null if not found.
     */
    private File findIndexFile(final File dataFile) {
        // If input is foo.bam, look for foo.bai
        final String bamExtension = ".bam";
        File indexFile;
        final String fileName = dataFile.getName();
        if (fileName.endsWith(bamExtension)) {
            final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + ".bai";
            indexFile = new File(dataFile.getParent(), bai);
            if (indexFile.exists()) {
                return indexFile;
            }
        }

        // If foo.bai doesn't exist look for foo.bam.bai
        indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
        if (indexFile.exists()) {
            return indexFile;
        } else {
            return null;
        }
    }

    /**
     * @param stream stream.markSupported() must be true
     * @return true if this looks like a BAM file.
     */
    private boolean isBAMFile(final InputStream stream)
            throws IOException {
        return BlockCompressedInputStream.isValidFile(stream);
    }

    /**
     * Attempts to check whether the file is a gzipped sam file.  Returns true if it
     * is and false otherwise.  The stream is reset to its marked position either way.
     */
    private boolean isGzippedSAMFile(final BufferedInputStream stream) {
        if (!stream.markSupported()) {
            throw new IllegalArgumentException("Cannot test a stream that doesn't support marking.");
        }
        // Mark enough read-ahead to cover the gzip header probe below.
        stream.mark(8000);

        try {
            // Constructing the GZIPInputStream parses the gzip header; reading
            // one byte confirms the stream decompresses.  An IOException on
            // either step means this is not gzip data.
            final GZIPInputStream gunzip = new GZIPInputStream(stream);
            final int ch = gunzip.read();
            return true;
        }
        catch (IOException ioe) {
            return false;
        }
        finally {
            try {
                stream.reset();
            }
            catch (IOException ioe) {
                throw new IllegalStateException("Could not reset stream.");
            }
        }
    }

    private boolean isSAMFile(final InputStream stream) {
        // For now, assume every non-binary file is a SAM text file.
        return true;
    }

    @Override
    public String toString() {
        if (this.samFile == null) {
            return getClass().getSimpleName() + "{initialized with stream}";
        } else {
            return getClass().getSimpleName() + "{" + this.samFile.getAbsolutePath() + "}";
        }
    }
}
|
||||
Loading…
Reference in New Issue