Supporting infrastructure for merging SAM files. Not yet integrated into the datasource.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2810 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fc810a1800
commit
57b8c9a53c
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
|
||||
/**
|
||||
* Iterator for SAM records that implements comparable to enable sorting of iterators.
|
||||
* The comparison is performed by comparing the next record in the iterator to the next
|
||||
* record in another iterator and returning the ordering between those SAM records.
|
||||
*/
|
||||
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator> {
|
||||
private final Comparator<SAMRecord> comparator;
|
||||
|
||||
/**
|
||||
* Constructs a wrapping iterator around the given iterator that will be able
|
||||
* to compare itself to other ComparableSamRecordIterators using the given comparator.
|
||||
*
|
||||
* @param iterator the wrapped iterator.
|
||||
* @param comparator the Comparator to use to provide ordering fo SAMRecords
|
||||
*/
|
||||
public ComparableSamRecordIterator(final Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) {
|
||||
super(iterator);
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this iterator to another comparable iterator based on the next record
|
||||
* available in each iterator. If the two comparable iterators have different
|
||||
* comparator types internally an exception is thrown.
|
||||
*
|
||||
* @param that another iterator to compare to
|
||||
* @return a negative, 0 or positive number as described in the Comparator interface
|
||||
*/
|
||||
public int compareTo(final ComparableSamRecordIterator that) {
|
||||
if (this.comparator.getClass() != that.comparator.getClass()) {
|
||||
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
|
||||
"have different orderings internally");
|
||||
}
|
||||
|
||||
final SAMRecord record = this.peek();
|
||||
final SAMRecord record2 = that.peek();
|
||||
return comparator.compare(record, record2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
return compareTo((ComparableSamRecordIterator)o) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
throw new UnsupportedOperationException("ComparableSamRecordIterator should not be hashed because it can change value");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.*;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
* Provides an iterator interface for merging multiple underlying iterators into a single
|
||||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||
* the requested output format is unsorted, in which case any combination is valid.
|
||||
*/
|
||||
public class MergingSamRecordIterator implements Iterator<SAMRecord> {
|
||||
private final PriorityQueue<ComparableSamRecordIterator> pq;
|
||||
private final SamFileHeaderMerger samHeaderMerger;
|
||||
private final SAMFileHeader.SortOrder sortOrder;
|
||||
|
||||
/**
|
||||
* Maps iterators back to the readers from which they are derived.
|
||||
*/
|
||||
private final Map<Iterator<SAMRecord>,SAMFileReader> iteratorToSourceMap = new HashMap<Iterator<SAMRecord>,SAMFileReader>();
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
|
||||
this(headerMerger,createWholeFileIterators(headerMerger.getReaders()),forcePresorted);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with a given merged header and a subset of readers.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param readerToIteratorMap A mapping of reader to iterator.
|
||||
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final Map<SAMFileReader,Iterator<SAMRecord>> readerToIteratorMap, final boolean forcePresorted) {
|
||||
this.samHeaderMerger = headerMerger;
|
||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||
final SAMRecordComparator comparator = getComparator();
|
||||
|
||||
final Collection<SAMFileReader> readers = headerMerger.getReaders();
|
||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
|
||||
|
||||
for(final SAMFileReader reader: readerToIteratorMap.keySet()) {
|
||||
if (!forcePresorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
|
||||
reader.getFileHeader().getSortOrder() != this.sortOrder){
|
||||
throw new PicardException("Files are not compatible with sort order");
|
||||
}
|
||||
|
||||
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(readerToIteratorMap.get(reader),comparator);
|
||||
addIfNotEmpty(iterator);
|
||||
iteratorToSourceMap.put(iterator,reader);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For each reader, derive an iterator that can walk the entire file and associate that back to
|
||||
* @param readers The readers from which to derive iterators.
|
||||
* @return A map of reader to its associated iterator.
|
||||
*/
|
||||
private static Map<SAMFileReader,Iterator<SAMRecord>> createWholeFileIterators(Collection<SAMFileReader> readers) {
|
||||
Map<SAMFileReader,Iterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,Iterator<SAMRecord>>();
|
||||
for(final SAMFileReader reader: readers)
|
||||
readerToIteratorMap.put(reader,reader.iterator());
|
||||
return readerToIteratorMap;
|
||||
}
|
||||
|
||||
/** Returns true if any of the underlying iterators has more records, otherwise false. */
|
||||
public boolean hasNext() {
|
||||
return !this.pq.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the next record from the top most iterator during merging. */
|
||||
public SAMRecord next() {
|
||||
final ComparableSamRecordIterator iterator = this.pq.poll();
|
||||
final SAMRecord record = iterator.next();
|
||||
addIfNotEmpty(iterator);
|
||||
record.setHeader(this.samHeaderMerger.getMergedHeader());
|
||||
|
||||
// Fix the read group if needs be
|
||||
if (this.samHeaderMerger.hasReadGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getReadGroupId(iteratorToSourceMap.get(iterator), oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix the program group if needs be
|
||||
if (this.samHeaderMerger.hasProgramGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getProgramGroupId(iteratorToSourceMap.get(iterator), oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix up the sequence indexes if needs be
|
||||
if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
|
||||
if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iteratorToSourceMap.get(iterator),record.getReferenceIndex()));
|
||||
}
|
||||
|
||||
if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iteratorToSourceMap.get(iterator), record.getMateReferenceIndex()));
|
||||
}
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds iterator to priority queue. If the iterator has more records it is added
|
||||
* otherwise it is closed and not added.
|
||||
*/
|
||||
private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
|
||||
if (iterator.hasNext()) {
|
||||
pq.offer(iterator);
|
||||
}
|
||||
else {
|
||||
iterator.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Unsupported operation. */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the right comparator for a given sort order (coordinate, alphabetic). In the
|
||||
* case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
|
||||
* ordering.
|
||||
*/
|
||||
private SAMRecordComparator getComparator() {
|
||||
// For unsorted build a fake comparator that compares based on object ID
|
||||
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
|
||||
return new SAMRecordComparator() {
|
||||
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
|
||||
}
|
||||
|
||||
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return fileOrderCompare(lhs, rhs);
|
||||
}
|
||||
};
|
||||
}
|
||||
if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
|
||||
return new MergedSequenceDictionaryCoordinateOrderComparator();
|
||||
}
|
||||
|
||||
// Otherwise try and figure out what kind of comparator to return and build it
|
||||
final Class<? extends SAMRecordComparator> type = this.sortOrder.getComparator();
|
||||
|
||||
try {
|
||||
final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor();
|
||||
return ctor.newInstance();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the merged header that the merging iterator is working from. */
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return this.samHeaderMerger.getMergedHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
|
||||
* sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
|
||||
* more copy & paste.
|
||||
*/
|
||||
private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
|
||||
|
||||
public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
|
||||
final int referenceIndex1 = getReferenceIndex(samRecord1);
|
||||
final int referenceIndex2 = getReferenceIndex(samRecord2);
|
||||
if (referenceIndex1 != referenceIndex2) {
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return 1;
|
||||
} else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return -1;
|
||||
} else {
|
||||
return referenceIndex1 - referenceIndex2;
|
||||
}
|
||||
}
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
// Both are unmapped.
|
||||
return 0;
|
||||
}
|
||||
return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
|
||||
}
|
||||
|
||||
private int getReferenceIndex(final SAMRecord samRecord) {
|
||||
if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
|
||||
}
|
||||
if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
|
||||
}
|
||||
return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -34,7 +34,7 @@ import java.util.*;
|
|||
/**
|
||||
* Class for reading BAM file indexes.
|
||||
*/
|
||||
public class BAMFileIndex2
|
||||
public class BAMFileIndex2 extends BAMFileIndex
|
||||
{
|
||||
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||
private static final int BAM_LIDX_SHIFT = 14;
|
||||
|
|
@ -55,6 +55,7 @@ public class BAMFileIndex2
|
|||
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
|
||||
|
||||
protected BAMFileIndex2(final File file) {
|
||||
super(file);
|
||||
loadIndex(file);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,52 +23,22 @@
|
|||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.IOUtil;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.SAMFileReader.ReaderImplementation;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.List;
|
||||
import java.net.URL;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.broadinstitute.sting.utils.JVMUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Class for reading and querying SAM/BAM files. Delegates to appropriate concrete implementation.
|
||||
*/
|
||||
public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||
|
||||
private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY;
|
||||
|
||||
public static ValidationStringency getDefaultValidationStringency() {
|
||||
return defaultValidationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set validation stringency for all subsequently-created SAMFileReaders. This is the only way to
|
||||
* change the validation stringency for SAM header.
|
||||
*/
|
||||
public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) {
|
||||
SAMFileReader2.defaultValidationStringency = defaultValidationStringency;
|
||||
}
|
||||
|
||||
private boolean mIsBinary = false;
|
||||
private BAMFileIndex2 mFileIndex = null;
|
||||
private ReaderImplementation mReader = null;
|
||||
private File samFile = null;
|
||||
|
||||
/**
|
||||
* Prepare to read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream.
|
||||
*/
|
||||
public SAMFileReader2(final InputStream stream) {
|
||||
this(stream, false);
|
||||
}
|
||||
|
||||
public class SAMFileReader2 extends SAMFileReader {
|
||||
/**
|
||||
* Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file
|
||||
* that is named according to the convention, it will be found and opened, and indexed query will be allowed.
|
||||
|
|
@ -77,28 +47,6 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
this(file, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to read a SAM or BAM file. If the given file is a BAM, and an index is present, indexed query
|
||||
* will be allowed.
|
||||
*
|
||||
* @param file SAM or BAM to read.
|
||||
* @param indexFile Index file that is companion to BAM, or null if no index file, or if index file
|
||||
* should be found automatically.
|
||||
*/
|
||||
public SAMFileReader2(final File file, final File indexFile) {
|
||||
this(file, indexFile, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a SAM or BAM file. Indexed lookup not allowed because reading from InputStream.
|
||||
*
|
||||
* @param stream input SAM or BAM.
|
||||
* @param eagerDecode if true, decode SAM record entirely when reading it.
|
||||
*/
|
||||
public SAMFileReader2(final InputStream stream, final boolean eagerDecode) {
|
||||
init(stream, eagerDecode, defaultValidationStringency);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a SAM or BAM file, possibly with an index file if present.
|
||||
* If the given file is a BAM, and an index is present, indexed query will be allowed.
|
||||
|
|
@ -107,7 +55,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
* @param eagerDecode if true, decode SAM record entirely when reading it.
|
||||
*/
|
||||
public SAMFileReader2(final File file, final boolean eagerDecode) {
|
||||
init(file, null, eagerDecode, defaultValidationStringency);
|
||||
this(file,null,eagerDecode);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -119,41 +67,20 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
* @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
|
||||
*/
|
||||
public SAMFileReader2(final File file, final File indexFile, final boolean eagerDecode){
|
||||
init(file, indexFile, eagerDecode, defaultValidationStringency);
|
||||
}
|
||||
super(file,indexFile,eagerDecode);
|
||||
close();
|
||||
|
||||
/**
|
||||
* Read a BAM file by http
|
||||
* indexed query will be allowed.
|
||||
*
|
||||
* @param url BAM.
|
||||
* @param indexFile Location of index file, or null in order to use the default index file (if present).
|
||||
* @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
|
||||
*/
|
||||
public SAMFileReader2(final URL url, final File indexFile, final boolean eagerDecode) {
|
||||
init(url, indexFile, eagerDecode, defaultValidationStringency);
|
||||
}
|
||||
try {
|
||||
BAMFileReader2 reader = new BAMFileReader2(file,eagerDecode,getDefaultValidationStringency());
|
||||
BAMFileIndex2 index = new BAMFileIndex2(indexFile != null ? indexFile : findIndexFileFromParent(file));
|
||||
reader.setFileIndex(index);
|
||||
|
||||
public void close() {
|
||||
if (mReader != null) {
|
||||
mReader.close();
|
||||
JVMUtils.setFieldValue(getField("mReader"),this,reader);
|
||||
JVMUtils.setFieldValue(getField("mFileIndex"),this,index);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to load BAM file: " + file,ex);
|
||||
}
|
||||
mReader = null;
|
||||
mFileIndex = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True if this is a BAM reader.
|
||||
*/
|
||||
public boolean isBinary() {
|
||||
return mIsBinary;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if ths is a BAM file, and has an index
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
return (mFileIndex != null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -161,9 +88,10 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
* @return Number of levels in this index.
|
||||
*/
|
||||
public int getNumIndexLevels() {
|
||||
if(mFileIndex == null)
|
||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return mFileIndex.getNumIndexLevels();
|
||||
return fileIndex.getNumIndexLevels();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -172,34 +100,10 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
* @return the level associated with the given bin number.
|
||||
*/
|
||||
public int getLevelForBin(final Bin bin) {
|
||||
if(mFileIndex == null)
|
||||
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
|
||||
return mFileIndex.getLevelForBinNumber(bin.binNumber);
|
||||
}
|
||||
|
||||
public SAMFileHeader getFileHeader() {
|
||||
return mReader.getFileHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Control validation of SAMRecords as they are read from file.
|
||||
* In order to control validation stringency for SAM Header, call SAMFileReader.setDefaultValidationStringency
|
||||
* before constructing a SAMFileReader.
|
||||
*/
|
||||
public void setValidationStringency(final ValidationStringency validationStringency) {
|
||||
mReader.setValidationStringency(validationStringency);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate through file in order. For a SAMFileReader constructed from an InputStream, and for any SAM file,
|
||||
* a 2nd iteration starts where the 1st one left off. For a BAM constructed from a File, each new iteration
|
||||
* starts at the first record.
|
||||
* <p/>
|
||||
* Only a single open iterator on a SAM or BAM file may be extant at any one time. If you want to start
|
||||
* a second iteration, the first one must be closed first.
|
||||
*/
|
||||
public CloseableIterator<SAMRecord> iterator() {
|
||||
return mReader.getIterator();
|
||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return fileIndex.getLevelForBinNumber(bin.binNumber);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -209,338 +113,46 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
|||
*/
|
||||
public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
|
||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||
if(!(mReader instanceof BAMFileReader2))
|
||||
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||
return ((BAMFileReader2)mReader).getIterator(chunks);
|
||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||
return reader.getIterator(chunks);
|
||||
}
|
||||
|
||||
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
|
||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||
if(!(mReader instanceof BAMFileReader2))
|
||||
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||
return ((BAMFileReader2)mReader).getOverlappingBins(sequence,start,end);
|
||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||
return reader.getOverlappingBins(sequence,start,end);
|
||||
}
|
||||
|
||||
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||
if(!(mReader instanceof BAMFileReader2))
|
||||
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||
return ((BAMFileReader2)mReader).getFilePointersBounding(bin);
|
||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||
return reader.getFilePointersBounding(bin);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Iterate over records that match the given interval. Only valid to call this if hasIndex() == true.
|
||||
* <p/>
|
||||
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
|
||||
* a second iteration, the first one must be closed first. You can use a second SAMFileReader to iterate
|
||||
* in parallel over the same underlying file.
|
||||
* <p/>
|
||||
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
|
||||
* and then discarded because they do not match the interval of interest.
|
||||
* <p/>
|
||||
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
|
||||
* is in the query region.
|
||||
*
|
||||
* @param sequence Reference sequence of interest.
|
||||
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
|
||||
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
|
||||
* @param contained If true, each SAMRecord returned is will have its alignment completely contained in the
|
||||
* interval of interest. If false, the alignment of the returned SAMRecords need only overlap the interval of interest.
|
||||
* @return Iterator over the SAMRecords matching the interval.
|
||||
*/
|
||||
public CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||
return mReader.query(sequence, start, end, contained);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over records that overlap the given interval. Only valid to call this if hasIndex() == true.
|
||||
* <p/>
|
||||
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
|
||||
* a second iteration, the first one must be closed first.
|
||||
* <p/>
|
||||
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
|
||||
* and then discarded because they do not match the interval of interest.
|
||||
* <p/>
|
||||
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
|
||||
* is in the query region.
|
||||
*
|
||||
* @param sequence Reference sequence of interest.
|
||||
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
|
||||
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
|
||||
* @return Iterator over the SAMRecords overlapping the interval.
|
||||
*/
|
||||
public CloseableIterator<SAMRecord> queryOverlapping(final String sequence, final int start, final int end) {
|
||||
return query(sequence, start, end, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over records that are contained in the given interval. Only valid to call this if hasIndex() == true.
|
||||
* <p/>
|
||||
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
|
||||
* a second iteration, the first one must be closed first.
|
||||
* <p/>
|
||||
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
|
||||
* and then discarded because they do not match the interval of interest.
|
||||
* <p/>
|
||||
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
|
||||
* is in the query region.
|
||||
*
|
||||
* @param sequence Reference sequence of interest.
|
||||
* @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
|
||||
* @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
|
||||
* @return Iterator over the SAMRecords contained in the interval.
|
||||
*/
|
||||
public CloseableIterator<SAMRecord> queryContained(final String sequence, final int start, final int end) {
|
||||
return query(sequence, start, end, true);
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||
return mReader.queryUnmapped();
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over records that map to the given sequence and start at the given position. Only valid to call this if hasIndex() == true.
|
||||
* <p/>
|
||||
* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start
|
||||
* a second iteration, the first one must be closed first.
|
||||
* <p/>
|
||||
* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read
|
||||
* and then discarded because they do not match the interval of interest.
|
||||
* <p/>
|
||||
* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
|
||||
* matches the arguments.
|
||||
*
|
||||
* @param sequence Reference sequence of interest.
|
||||
* @param start Alignment start of interest.
|
||||
* @return Iterator over the SAMRecords with the given alignment start.
|
||||
*/
|
||||
public CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||
return mReader.queryAlignmentStart(sequence, start);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch the mate for the given read. Only valid to call this if hasIndex() == true.
|
||||
* This will work whether the mate has a coordinate or not, so long as the given read has correct
|
||||
* mate information. This method iterates over the SAM file, so there may not be an unclosed
|
||||
* iterator on the SAM file when this method is called.
|
||||
*
|
||||
* @param rec Record for which mate is sought. Must be a paired read.
|
||||
* @return rec's mate, or null if it cannot be found.
|
||||
*/
|
||||
public SAMRecord queryMate(final SAMRecord rec) {
|
||||
if (!rec.getReadPairedFlag()) {
|
||||
throw new IllegalArgumentException("queryMate called for unpaired read.");
|
||||
}
|
||||
if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
|
||||
throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
|
||||
}
|
||||
final boolean firstOfPair = rec.getFirstOfPairFlag();
|
||||
final CloseableIterator<SAMRecord> it;
|
||||
if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
it = queryUnmapped();
|
||||
} else {
|
||||
it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
|
||||
}
|
||||
private Field getField(String fieldName) {
|
||||
try {
|
||||
SAMRecord mateRec = null;
|
||||
while (it.hasNext()) {
|
||||
final SAMRecord next = it.next();
|
||||
if (!next.getReadPairedFlag()) {
|
||||
if (rec.getReadName().equals(next.getReadName())) {
|
||||
throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (firstOfPair) {
|
||||
if (next.getFirstOfPairFlag()) continue;
|
||||
} else {
|
||||
if (next.getSecondOfPairFlag()) continue;
|
||||
}
|
||||
if (rec.getReadName().equals(next.getReadName())) {
|
||||
if (mateRec != null) {
|
||||
throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
|
||||
" for " + (firstOfPair ? "second" : "first") + " end.");
|
||||
}
|
||||
mateRec = next;
|
||||
}
|
||||
}
|
||||
return mateRec;
|
||||
} finally {
|
||||
it.close();
|
||||
return getClass().getSuperclass().getDeclaredField(fieldName);
|
||||
}
|
||||
catch(NoSuchFieldException ex) {
|
||||
throw new StingException("Unable to load field: " + fieldName);
|
||||
}
|
||||
}
|
||||
|
||||
private void init(final InputStream stream, final boolean eagerDecode, final ValidationStringency validationStringency) {
|
||||
|
||||
private File findIndexFileFromParent(File bamFile) {
|
||||
try {
|
||||
final BufferedInputStream bufferedStream = IOUtil.toBufferedStream(stream);
|
||||
if (isBAMFile(bufferedStream)) {
|
||||
mIsBinary = true;
|
||||
mReader = new BAMFileReader2(bufferedStream, eagerDecode, validationStringency);
|
||||
} else if (isGzippedSAMFile(bufferedStream)) {
|
||||
mIsBinary = false;
|
||||
mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
|
||||
} else if (isSAMFile(bufferedStream)) {
|
||||
mIsBinary = false;
|
||||
mReader = new SAMTextReader(bufferedStream, validationStringency);
|
||||
} else {
|
||||
throw new SAMFormatException("Unrecognized file format");
|
||||
}
|
||||
setValidationStringency(validationStringency);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
Method method = getClass().getSuperclass().getDeclaredMethod("findIndexFile",File.class);
|
||||
method.setAccessible(true);
|
||||
return (File)method.invoke(this,bamFile);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param url
|
||||
* @param indexFile
|
||||
* @param eagerDecode
|
||||
*/
|
||||
private void init(final URL url, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {
|
||||
|
||||
try {
|
||||
// Its too expensive to examine the remote file to determine type.
|
||||
// Rely on file extension.
|
||||
if (url.toString().toLowerCase().endsWith(".bam")) {
|
||||
mIsBinary = true;
|
||||
final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
|
||||
mReader = reader;
|
||||
if (indexFile != null) {
|
||||
mFileIndex = new BAMFileIndex2(indexFile);
|
||||
reader.setFileIndex(mFileIndex);
|
||||
}
|
||||
} else {
|
||||
throw new SAMFormatException("Unrecognized file format: " + url);
|
||||
}
|
||||
setValidationStringency(validationStringency);
|
||||
catch(IllegalAccessException ex) {
|
||||
throw new StingException("Unable to run method findIndexFile",ex);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
catch(InvocationTargetException ex) {
|
||||
throw new StingException("Unable to run method findIndexFile",ex);
|
||||
}
|
||||
catch(NoSuchMethodException ex) {
|
||||
throw new StingException("Unable to run method findIndexFile",ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void init(final File file, File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) {
|
||||
this.samFile = file;
|
||||
|
||||
try {
|
||||
final BufferedInputStream bufferedStream = new BufferedInputStream(new FileInputStream(file));
|
||||
if (isBAMFile(bufferedStream)) {
|
||||
bufferedStream.close();
|
||||
mIsBinary = true;
|
||||
final BAMFileReader2 reader = new BAMFileReader2(file, eagerDecode, validationStringency);
|
||||
mReader = reader;
|
||||
if (indexFile == null) {
|
||||
indexFile = findIndexFile(file);
|
||||
}
|
||||
if (indexFile != null) {
|
||||
mFileIndex = new BAMFileIndex2(indexFile);
|
||||
reader.setFileIndex(mFileIndex);
|
||||
if (indexFile.lastModified() < file.lastModified()) {
|
||||
System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +
|
||||
" is older than BAM " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
} else if (isGzippedSAMFile(bufferedStream)) {
|
||||
mIsBinary = false;
|
||||
mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency);
|
||||
} else if (isSAMFile(bufferedStream)) {
|
||||
if (indexFile != null) {
|
||||
bufferedStream.close();
|
||||
throw new RuntimeException("Cannot use index file with textual SAM file");
|
||||
}
|
||||
mIsBinary = false;
|
||||
mReader = new SAMTextReader(bufferedStream, file, validationStringency);
|
||||
} else {
|
||||
bufferedStream.close();
|
||||
throw new SAMFormatException("Unrecognized file format");
|
||||
}
|
||||
setValidationStringency(validationStringency);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Look for BAM index file according to standard naming convention.
|
||||
*
|
||||
* @param dataFile BAM file name.
|
||||
* @return Index file name, or null if not found.
|
||||
*/
|
||||
private File findIndexFile(final File dataFile) {
|
||||
// If input is foo.bam, look for foo.bai
|
||||
final String bamExtension = ".bam";
|
||||
File indexFile;
|
||||
final String fileName = dataFile.getName();
|
||||
if (fileName.endsWith(bamExtension)) {
|
||||
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + ".bai";
|
||||
indexFile = new File(dataFile.getParent(), bai);
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
}
|
||||
}
|
||||
|
||||
// If foo.bai doesn't exist look for foo.bam.bai
|
||||
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stream stream.markSupported() must be true
|
||||
* @return true if this looks like a BAM file.
|
||||
*/
|
||||
private boolean isBAMFile(final InputStream stream)
|
||||
throws IOException {
|
||||
return BlockCompressedInputStream.isValidFile(stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts to check whether the file is a gzipped sam file. Returns true if it
|
||||
* is and false otherwise.
|
||||
*/
|
||||
private boolean isGzippedSAMFile(final BufferedInputStream stream) {
|
||||
if (!stream.markSupported()) {
|
||||
throw new IllegalArgumentException("Cannot test a stream that doesn't support marking.");
|
||||
}
|
||||
stream.mark(8000);
|
||||
|
||||
try {
|
||||
final GZIPInputStream gunzip = new GZIPInputStream(stream);
|
||||
final int ch = gunzip.read();
|
||||
return true;
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
return false;
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
stream.reset();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new IllegalStateException("Could not reset stream.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSAMFile(final InputStream stream) {
|
||||
// For now, assume every non-binary file is a SAM text file.
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (this.samFile == null) {
|
||||
return getClass().getSimpleName() + "{initialized with stream}";
|
||||
} else {
|
||||
return getClass().getSimpleName() + "{" + this.samFile.getAbsolutePath() + "}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue