Supporting infrastructure for merging SAM files. Not yet integrated into the datasource.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2810 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-08 23:59:38 +00:00
parent fc810a1800
commit 57b8c9a53c
4 changed files with 368 additions and 437 deletions

View File

@ -0,0 +1,85 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.picard.sam;
import net.sf.picard.util.PeekableIterator;
import java.util.Comparator;
import java.util.Iterator;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
/**
* Iterator for SAM records that implements comparable to enable sorting of iterators.
* The comparison is performed by comparing the next record in the iterator to the next
* record in another iterator and returning the ordering between those SAM records.
*/
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator> {
private final Comparator<SAMRecord> comparator;
/**
* Constructs a wrapping iterator around the given iterator that will be able
* to compare itself to other ComparableSamRecordIterators using the given comparator.
*
* @param iterator the wrapped iterator.
* @param comparator the Comparator to use to provide ordering fo SAMRecords
*/
public ComparableSamRecordIterator(final Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) {
super(iterator);
this.comparator = comparator;
}
/**
* Compares this iterator to another comparable iterator based on the next record
* available in each iterator. If the two comparable iterators have different
* comparator types internally an exception is thrown.
*
* @param that another iterator to compare to
* @return a negative, 0 or positive number as described in the Comparator interface
*/
public int compareTo(final ComparableSamRecordIterator that) {
if (this.comparator.getClass() != that.comparator.getClass()) {
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
"have different orderings internally");
}
final SAMRecord record = this.peek();
final SAMRecord record2 = that.peek();
return comparator.compare(record, record2);
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
return compareTo((ComparableSamRecordIterator)o) == 0;
}
@Override
public int hashCode() {
throw new UnsupportedOperationException("ComparableSamRecordIterator should not be hashed because it can change value");
}
}

View File

@ -0,0 +1,233 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.picard.sam;
import net.sf.picard.PicardException;
import java.util.*;
import java.lang.reflect.Constructor;
import net.sf.samtools.*;
/**
* Provides an iterator interface for merging multiple underlying iterators into a single
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator implements Iterator<SAMRecord> {
private final PriorityQueue<ComparableSamRecordIterator> pq;
private final SamFileHeaderMerger samHeaderMerger;
private final SAMFileHeader.SortOrder sortOrder;
/**
* Maps iterators back to the readers from which they are derived.
*/
private final Map<Iterator<SAMRecord>,SAMFileReader> iteratorToSourceMap = new HashMap<Iterator<SAMRecord>,SAMFileReader>();
/**
* Constructs a new merging iterator with the same set of readers and sort order as
* provided by the header merger parameter.
* @param headerMerger The merged header and contents of readers.
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
*/
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
this(headerMerger,createWholeFileIterators(headerMerger.getReaders()),forcePresorted);
}
/**
* Constructs a new merging iterator with a given merged header and a subset of readers.
* @param headerMerger The merged header and contents of readers.
* @param readerToIteratorMap A mapping of reader to iterator.
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
*/
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final Map<SAMFileReader,Iterator<SAMRecord>> readerToIteratorMap, final boolean forcePresorted) {
this.samHeaderMerger = headerMerger;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
final SAMRecordComparator comparator = getComparator();
final Collection<SAMFileReader> readers = headerMerger.getReaders();
this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
for(final SAMFileReader reader: readerToIteratorMap.keySet()) {
if (!forcePresorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
reader.getFileHeader().getSortOrder() != this.sortOrder){
throw new PicardException("Files are not compatible with sort order");
}
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(readerToIteratorMap.get(reader),comparator);
addIfNotEmpty(iterator);
iteratorToSourceMap.put(iterator,reader);
}
}
/**
* For each reader, derive an iterator that can walk the entire file and associate that back to
* @param readers The readers from which to derive iterators.
* @return A map of reader to its associated iterator.
*/
private static Map<SAMFileReader,Iterator<SAMRecord>> createWholeFileIterators(Collection<SAMFileReader> readers) {
Map<SAMFileReader,Iterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,Iterator<SAMRecord>>();
for(final SAMFileReader reader: readers)
readerToIteratorMap.put(reader,reader.iterator());
return readerToIteratorMap;
}
/** Returns true if any of the underlying iterators has more records, otherwise false. */
public boolean hasNext() {
return !this.pq.isEmpty();
}
/** Returns the next record from the top most iterator during merging. */
public SAMRecord next() {
final ComparableSamRecordIterator iterator = this.pq.poll();
final SAMRecord record = iterator.next();
addIfNotEmpty(iterator);
record.setHeader(this.samHeaderMerger.getMergedHeader());
// Fix the read group if needs be
if (this.samHeaderMerger.hasReadGroupCollisions()) {
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
if (oldGroupId != null ) {
final String newGroupId = this.samHeaderMerger.getReadGroupId(iteratorToSourceMap.get(iterator), oldGroupId);
record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
}
}
// Fix the program group if needs be
if (this.samHeaderMerger.hasProgramGroupCollisions()) {
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
if (oldGroupId != null ) {
final String newGroupId = this.samHeaderMerger.getProgramGroupId(iteratorToSourceMap.get(iterator), oldGroupId);
record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
}
}
// Fix up the sequence indexes if needs be
if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iteratorToSourceMap.get(iterator),record.getReferenceIndex()));
}
if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iteratorToSourceMap.get(iterator), record.getMateReferenceIndex()));
}
}
return record;
}
/**
* Adds iterator to priority queue. If the iterator has more records it is added
* otherwise it is closed and not added.
*/
private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
if (iterator.hasNext()) {
pq.offer(iterator);
}
else {
iterator.close();
}
}
/** Unsupported operation. */
public void remove() {
throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
}
/**
* Get the right comparator for a given sort order (coordinate, alphabetic). In the
* case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
* ordering.
*/
private SAMRecordComparator getComparator() {
// For unsorted build a fake comparator that compares based on object ID
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
return new SAMRecordComparator() {
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
}
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
return fileOrderCompare(lhs, rhs);
}
};
}
if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
return new MergedSequenceDictionaryCoordinateOrderComparator();
}
// Otherwise try and figure out what kind of comparator to return and build it
final Class<? extends SAMRecordComparator> type = this.sortOrder.getComparator();
try {
final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor();
return ctor.newInstance();
}
catch (Exception e) {
throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e);
}
}
/** Returns the merged header that the merging iterator is working from. */
public SAMFileHeader getMergedHeader() {
return this.samHeaderMerger.getMergedHeader();
}
/**
* Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
* sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
* more copy & paste.
*/
private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
final int referenceIndex1 = getReferenceIndex(samRecord1);
final int referenceIndex2 = getReferenceIndex(samRecord2);
if (referenceIndex1 != referenceIndex2) {
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return 1;
} else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return -1;
} else {
return referenceIndex1 - referenceIndex2;
}
}
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
// Both are unmapped.
return 0;
}
return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
}
private int getReferenceIndex(final SAMRecord samRecord) {
if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
}
if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
}
return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
}
}
}

View File

@ -34,7 +34,7 @@ import java.util.*;
/**
* Class for reading BAM file indexes.
*/
public class BAMFileIndex2
public class BAMFileIndex2 extends BAMFileIndex
{
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
private static final int BAM_LIDX_SHIFT = 14;
@ -55,6 +55,7 @@ public class BAMFileIndex2
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
protected BAMFileIndex2(final File file) {
super(file);
loadIndex(file);
}

View File

@ -23,52 +23,22 @@
*/
package net.sf.samtools;
import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.IOUtil;
import net.sf.samtools.util.RuntimeIOException;
import net.sf.samtools.SAMFileReader.ReaderImplementation;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.picard.PicardException;
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.List;
import java.net.URL;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.lang.reflect.InvocationTargetException;
import org.broadinstitute.sting.utils.JVMUtils;
import org.broadinstitute.sting.utils.StingException;
/**
* Class for reading and querying SAM/BAM files. Delegates to appropriate concrete implementation.
*/
public class SAMFileReader2 implements Iterable<SAMRecord> {
private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY;
public static ValidationStringency getDefaultValidationStringency() {
return defaultValidationStringency;
}
/**
* Set validation stringency for all subsequently-created SAMFileReaders. This is the only way to
* change the validation stringency for SAM header.
*/
public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) {
SAMFileReader2.defaultValidationStringency = defaultValidationStringency;
}
private boolean mIsBinary = false;
private BAMFileIndex2 mFileIndex = null;
private ReaderImplementation mReader = null;
private File samFile = null;
/**
* Prepare to read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream.
*/
public SAMFileReader2(final InputStream stream) {
this(stream, false);
}
public class SAMFileReader2 extends SAMFileReader {
/**
* Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file
* that is named according to the convention, it will be found and opened, and indexed query will be allowed.
@ -77,28 +47,6 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
this(file, null, false);
}
/**
* Prepare to read a SAM or BAM file. If the given file is a BAM, and an index is present, indexed query
* will be allowed.
*
* @param file SAM or BAM to read.
* @param indexFile Index file that is companion to BAM, or null if no index file, or if index file
* should be found automatically.
*/
public SAMFileReader2(final File file, final File indexFile) {
this(file, indexFile, false);
}
/**
* Read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream.
*
* @param stream input SAM or BAM.
* @param eagerDecode if true, decode SAM record entirely when reading it.
*/
public SAMFileReader2(final InputStream stream, final boolean eagerDecode) {
init(stream, eagerDecode, defaultValidationStringency);
}
/**
* Read a SAM or BAM file, possibly with an index file if present.
* If the given file is a BAM, and an index is present, indexed query will be allowed.
@ -107,7 +55,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
* @param eagerDecode if true, decode SAM record entirely when reading it.
*/
public SAMFileReader2(final File file, final boolean eagerDecode) {
init(file, null, eagerDecode, defaultValidationStringency);
this(file,null,eagerDecode);
}
/**
@ -119,41 +67,20 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
* @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
*/
public SAMFileReader2(final File file, final File indexFile, final boolean eagerDecode){
init(file, indexFile, eagerDecode, defaultValidationStringency);
}
super(file,indexFile,eagerDecode);
close();
/**
* Read a BAM file by http
* indexed query will be allowed.
*
* @param url BAM.
* @param indexFile Location of index file, or null in order to use the default index file (if present).
* @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
*/
public SAMFileReader2(final URL url, final File indexFile, final boolean eagerDecode) {
init(url, indexFile, eagerDecode, defaultValidationStringency);
}
try {
BAMFileReader2 reader = new BAMFileReader2(file,eagerDecode,getDefaultValidationStringency());
BAMFileIndex2 index = new BAMFileIndex2(indexFile != null ? indexFile : findIndexFileFromParent(file));
reader.setFileIndex(index);
public void close() {
if (mReader != null) {
mReader.close();
JVMUtils.setFieldValue(getField("mReader"),this,reader);
JVMUtils.setFieldValue(getField("mFileIndex"),this,index);
}
catch(IOException ex) {
throw new StingException("Unable to load BAM file: " + file,ex);
}
mReader = null;
mFileIndex = null;
}
/**
* @return True if this is a BAM reader.
*/
public boolean isBinary() {
return mIsBinary;
}
/**
* @return true if ths is a BAM file, and has an index
*/
public boolean hasIndex() {
return (mFileIndex != null);
}
/**
@ -161,9 +88,10 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
* @return Number of levels in this index.
*/
public int getNumIndexLevels() {
if(mFileIndex == null)
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return mFileIndex.getNumIndexLevels();
return fileIndex.getNumIndexLevels();
}
/**
@ -172,34 +100,10 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
* @return the level associated with the given bin number.
*/
public int getLevelForBin(final Bin bin) {
if(mFileIndex == null)
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
return mFileIndex.getLevelForBinNumber(bin.binNumber);
}
public SAMFileHeader getFileHeader() {
return mReader.getFileHeader();
}
/**
* Control validation of SAMRecords as they are read from file.
* In order to control validation stringency for SAM Header, call SAMFileReader.setDefaultValidationStringency
* before constructing a SAMFileReader.
*/
public void setValidationStringency(final ValidationStringency validationStringency) {
mReader.setValidationStringency(validationStringency);
}
/**
* Iterate through file in order. For a SAMFileReader constructed from an InputStream, and for any SAM file,
* a 2nd iteration starts where the 1st one left off. For a BAM constructed from a File, each new iteration
* starts at the first record.
* <p/>
* Only a single open iterator on a SAM or BAM file may be extant at any one time. If you want to start
* a second iteration, the first one must be closed first.
*/
public CloseableIterator<SAMRecord> iterator() {
return mReader.getIterator();
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return fileIndex.getLevelForBinNumber(bin.binNumber);
}
/**
@ -209,338 +113,46 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
*/
public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getIterator(chunks);
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
return reader.getIterator(chunks);
}
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getOverlappingBins(sequence,start,end);
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
return reader.getOverlappingBins(sequence,start,end);
}
public List<Chunk> getFilePointersBounding(final Bin bin) {
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getFilePointersBounding(bin);
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
return reader.getFilePointersBounding(bin);
}
/**
* Iterate over records that match the given interval. Only valid to call this if hasIndex() == true.
* <p/>
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
* a second iteration, the first one must be closed first. You can use a second SAMFileReader to iterate
* in parallel over the same underlying file.
* <p/>
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
* and then discarded because they do not match the interval of interest.
* <p/>
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
* is in the query region.
*
* @param sequence Reference sequence of interest.
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
* @param contained If true, each SAMRecord returned is will have its alignment completely contained in the
* interval of interest. If false, the alignment of the returned SAMRecords need only overlap the interval of interest.
* @return Iterator over the SAMRecords matching the interval.
*/
public CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
return mReader.query(sequence, start, end, contained);
}
/**
* Iterate over records that overlap the given interval. Only valid to call this if hasIndex() == true.
* <p/>
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
* a second iteration, the first one must be closed first.
* <p/>
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
* and then discarded because they do not match the interval of interest.
* <p/>
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
* is in the query region.
*
* @param sequence Reference sequence of interest.
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
* @return Iterator over the SAMRecords overlapping the interval.
*/
public CloseableIterator<SAMRecord> queryOverlapping(final String sequence, final int start, final int end) {
return query(sequence, start, end, false);
}
/**
* Iterate over records that are contained in the given interval. Only valid to call this if hasIndex() == true.
* <p/>
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
* a second iteration, the first one must be closed first.
* <p/>
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
* and then discarded because they do not match the interval of interest.
* <p/>
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
* is in the query region.
*
* @param sequence Reference sequence of interest.
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
* @return Iterator over the SAMRecords contained in the interval.
*/
public CloseableIterator<SAMRecord> queryContained(final String sequence, final int start, final int end) {
return query(sequence, start, end, true);
}
public CloseableIterator<SAMRecord> queryUnmapped() {
return mReader.queryUnmapped();
}
/**
* Iterate over records that map to the given sequence and start at the given position. Only valid to call this if hasIndex() == true.
* <p/>
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
* a second iteration, the first one must be closed first.
* <p/>
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
* and then discarded because they do not match the interval of interest.
* <p/>
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
* matches the arguments.
*
* @param sequence Reference sequence of interest.
* @param start Alignment start of interest.
* @return Iterator over the SAMRecords with the given alignment start.
*/
public CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
return mReader.queryAlignmentStart(sequence, start);
}
/**
* Fetch the mate for the given read. Only valid to call this if hasIndex() == true.
* This will work whether the mate has a coordinate or not, so long as the given read has correct
* mate information. This method iterates over the SAM file, so there may not be an unclosed
* iterator on the SAM file when this method is called.
*
* @param rec Record for which mate is sought. Must be a paired read.
* @return rec's mate, or null if it cannot be found.
*/
public SAMRecord queryMate(final SAMRecord rec) {
if (!rec.getReadPairedFlag()) {
throw new IllegalArgumentException("queryMate called for unpaired read.");
}
if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
}
final boolean firstOfPair = rec.getFirstOfPairFlag();
final CloseableIterator<SAMRecord> it;
if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
it = queryUnmapped();
} else {
it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
}
private Field getField(String fieldName) {
try {
SAMRecord mateRec = null;
while (it.hasNext()) {
final SAMRecord next = it.next();
if (!next.getReadPairedFlag()) {
if (rec.getReadName().equals(next.getReadName())) {
throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
}
continue;
}
if (firstOfPair) {
if (next.getFirstOfPairFlag()) continue;
} else {
if (next.getSecondOfPairFlag()) continue;
}
if (rec.getReadName().equals(next.getReadName())) {
if (mateRec != null) {
throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
" for " + (firstOfPair ? "second" : "first") + " end.");
}
mateRec = next;
}
}
return mateRec;
} finally {
it.close();
return getClass().getSuperclass().getDeclaredField(fieldName);
}
catch(NoSuchFieldException ex) {
throw new StingException("Unable to load field: " + fieldName);
}
}
private void init(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency) {
private File findIndexFileFromParent(File bamFile) {
try {
final BufferedInputStream bufferedStream = IOUtil.toBufferedStream(stream);
if (isBAMFile(bufferedStream)) {
mIsBinary = true;
mReader = new BAMFileReader2(bufferedStream, eagerDecode, validationStringency);
} else if (isGzippedSAMFile(bufferedStream)) {
mIsBinary = false;
mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
} else if (isSAMFile(bufferedStream)) {
mIsBinary = false;
mReader = new SAMTextReader(bufferedStream, validationStringency);
} else {
throw new SAMFormatException("Unrecognized file format");
}
setValidationStringency(validationStringency);
} catch (IOException e) {
throw new RuntimeIOException(e);
Method method = getClass().getSuperclass().getDeclaredMethod("findIndexFile",File.class);
method.setAccessible(true);
return (File)method.invoke(this,bamFile);
}
}
/**
* @param url
* @param indexFile
* @param eagerDecode
*/
private void init(final URL url, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {
try {
// Its too expensive to examine the remote file to determine type.
// Rely on file extension.
if (url.toString().toLowerCase().endsWith(".bam")) {
mIsBinary = true;
final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
mReader = reader;
if (indexFile != null) {
mFileIndex = new BAMFileIndex2(indexFile);
reader.setFileIndex(mFileIndex);
}
} else {
throw new SAMFormatException("Unrecognized file format: " + url);
}
setValidationStringency(validationStringency);
catch(IllegalAccessException ex) {
throw new StingException("Unable to run method findIndexFile",ex);
}
catch (IOException e) {
throw new RuntimeIOException(e);
catch(InvocationTargetException ex) {
throw new StingException("Unable to run method findIndexFile",ex);
}
catch(NoSuchMethodException ex) {
throw new StingException("Unable to run method findIndexFile",ex);
}
}
private void init(final File file, File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {
this.samFile = file;
try {
final BufferedInputStream bufferedStream = new BufferedInputStream(new FileInputStream(file));
if (isBAMFile(bufferedStream)) {
bufferedStream.close();
mIsBinary = true;
final BAMFileReader2 reader = new BAMFileReader2(file, eagerDecode, validationStringency);
mReader = reader;
if (indexFile == null) {
indexFile = findIndexFile(file);
}
if (indexFile != null) {
mFileIndex = new BAMFileIndex2(indexFile);
reader.setFileIndex(mFileIndex);
if (indexFile.lastModified() < file.lastModified()) {
System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +
" is older than BAM " + file.getAbsolutePath());
}
}
} else if (isGzippedSAMFile(bufferedStream)) {
mIsBinary = false;
mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
} else if (isSAMFile(bufferedStream)) {
if (indexFile != null) {
bufferedStream.close();
throw new RuntimeException("Cannot use index file with textual SAM file");
}
mIsBinary = false;
mReader = new SAMTextReader(bufferedStream, file, validationStringency);
} else {
bufferedStream.close();
throw new SAMFormatException("Unrecognized file format");
}
setValidationStringency(validationStringency);
}
catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* Look for BAM index file according to standard naming convention.
*
* @param dataFile BAM file name.
* @return Index file name, or null if not found.
*/
private File findIndexFile(final File dataFile) {
// If input is foo.bam, look for foo.bai
final String bamExtension = ".bam";
File indexFile;
final String fileName = dataFile.getName();
if (fileName.endsWith(bamExtension)) {
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + ".bai";
indexFile = new File(dataFile.getParent(), bai);
if (indexFile.exists()) {
return indexFile;
}
}
// If foo.bai doesn't exist look for foo.bam.bai
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
if (indexFile.exists()) {
return indexFile;
} else {
return null;
}
}
/**
* @param stream stream.markSupported() must be true
* @return true if this looks like a BAM file.
*/
private boolean isBAMFile(final InputStream stream)
throws IOException {
return BlockCompressedInputStream.isValidFile(stream);
}
/**
* Attempts to check whether the file is a gzipped sam file. Returns true if it
* is and false otherwise.
*/
private boolean isGzippedSAMFile(final BufferedInputStream stream) {
if (!stream.markSupported()) {
throw new IllegalArgumentException("Cannot test a stream that doesn't support marking.");
}
stream.mark(8000);
try {
final GZIPInputStream gunzip = new GZIPInputStream(stream);
final int ch = gunzip.read();
return true;
}
catch (IOException ioe) {
return false;
}
finally {
try {
stream.reset();
}
catch (IOException ioe) {
throw new IllegalStateException("Could not reset stream.");
}
}
}
private boolean isSAMFile(final InputStream stream) {
// For now, assume every non-binary file is a SAM text file.
return true;
}
@Override
public String toString() {
if (this.samFile == null) {
return getClass().getSimpleName() + "{initialized with stream}";
} else {
return getClass().getSimpleName() + "{" + this.samFile.getAbsolutePath() + "}";
}
}
}