Added a bunch of fixes: MergingSamRecordIterator2 (MSRI) wasn't working, sharding had broken edge cases, and SAMBAMDataSource (SAMBAM DS) needed to close its file handles.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8efedacabf
commit
12752cf893
|
|
@ -115,19 +115,19 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
||||||
public Shard next() {
|
public Shard next() {
|
||||||
// let's get some background info on the problem
|
// let's get some background info on the problem
|
||||||
long length = dic.getSequence(seqLoc).getSequenceLength();
|
long length = dic.getSequence(seqLoc).getSequenceLength();
|
||||||
long proposedSize = nextShardSize() - 1;
|
long proposedSize = nextShardSize();
|
||||||
long nextStart = mLoc.getStop() + 1;
|
long nextStart = mLoc.getStop() + 1;
|
||||||
// can we fit it into the current seq size?
|
// can we fit it into the current seq size?
|
||||||
if (nextStart + proposedSize < length) {
|
if (nextStart + proposedSize - 1 < length) {
|
||||||
lastGenomeLocSize = proposedSize;
|
lastGenomeLocSize = proposedSize;
|
||||||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
|
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1);
|
||||||
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize));
|
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1));
|
||||||
}
|
}
|
||||||
// else we can't make it in the current location, we have to stitch one together
|
// else we can't make it in the current location, we have to stitch one together
|
||||||
else {
|
else {
|
||||||
long overflow = nextStart + proposedSize - length;
|
long overflow = nextStart + proposedSize -1 - length;
|
||||||
logger.debug("Overflow = " + overflow + " length: " + length);
|
logger.debug("Overflow = " + overflow + " length: " + length);
|
||||||
lastGenomeLocSize = lastGenomeLocSize - overflow;
|
lastGenomeLocSize = proposedSize - overflow;
|
||||||
// move to the next contig
|
// move to the next contig
|
||||||
// the next sequence should start at the beginning of the next contig
|
// the next sequence should start at the beginning of the next contig
|
||||||
Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize));
|
Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize));
|
||||||
|
|
|
||||||
|
|
@ -37,35 +37,31 @@ public class SAMBAMDataSource implements SimpleDataSource {
|
||||||
// do we care that the SAM files respect the sort order.
|
// do we care that the SAM files respect the sort order.
|
||||||
private boolean matchedSortOrders = true;
|
private boolean matchedSortOrders = true;
|
||||||
|
|
||||||
// our merged sam iterator for splitting up the files
|
|
||||||
MergingSamRecordIterator2 mergeIterator;
|
|
||||||
|
|
||||||
// are we set to locus mode or read mode for dividing
|
// are we set to locus mode or read mode for dividing
|
||||||
private boolean locusMode = true;
|
private boolean locusMode = true;
|
||||||
|
|
||||||
// How strict should we be with SAM/BAM parsing?
|
// How strict should we be with SAM/BAM parsing?
|
||||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
||||||
|
|
||||||
|
// our list of readers
|
||||||
|
private final List<File> samFileList = new ArrayList<File>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* constructor, given a list of sam files
|
* constructor, given a list of sam files
|
||||||
*
|
*
|
||||||
* @param samFiles the list of sam files
|
* @param samFiles the list of sam files
|
||||||
*/
|
*/
|
||||||
public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
|
public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
|
||||||
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
|
||||||
for (String fileName : samFiles) {
|
for (String fileName : samFiles) {
|
||||||
File smFile = new File(fileName);
|
File smFile = new File(fileName);
|
||||||
if (!smFile.canRead()) {
|
if (!smFile.canRead()) {
|
||||||
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
|
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
|
||||||
}
|
}
|
||||||
SAMFileReader reader = initializeSAMFile(smFile);
|
samFileList.add(smFile);
|
||||||
if (reader != null) {
|
|
||||||
readers.add(reader);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
|
//SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER);
|
||||||
this.mergeIterator = new MergingSamRecordIterator2(headerMerger);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -73,6 +69,7 @@ public class SAMBAMDataSource implements SimpleDataSource {
|
||||||
if (samFile.toString().endsWith(".list")) {
|
if (samFile.toString().endsWith(".list")) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
|
System.err.println("initializeSAMFile");
|
||||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||||
samReader.setValidationStringency(strictness);
|
samReader.setValidationStringency(strictness);
|
||||||
|
|
||||||
|
|
@ -96,19 +93,43 @@ public class SAMBAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* getQueryRegionIterator
|
* seek
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param location the genome location to extract data for
|
* @param location the genome location to extract data for
|
||||||
* @return an iterator for that region
|
* @return an iterator for that region
|
||||||
*/
|
*/
|
||||||
public MergingSamRecordIterator2 seek(GenomeLoc location) {
|
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||||
MergingSamRecordIterator2 iter = null; // new MergingSamRecordIterator2(this.mergeIterator.getMergedHeader().);
|
|
||||||
|
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||||
|
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
||||||
|
for (File f : this.samFileList) {
|
||||||
|
SAMFileReader reader = initializeSAMFile(f);
|
||||||
|
if (reader == null) {
|
||||||
|
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + f);
|
||||||
|
}
|
||||||
|
lst.add(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
// now merge the headers
|
||||||
|
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
||||||
|
|
||||||
|
// make a merging iterator for this record
|
||||||
|
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||||
|
|
||||||
|
|
||||||
|
System.err.println("About to query");
|
||||||
|
// we do different things for locus and read modes
|
||||||
if (locusMode) {
|
if (locusMode) {
|
||||||
iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true);
|
iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true);
|
||||||
} else {
|
} else {
|
||||||
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
|
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
|
||||||
}
|
}
|
||||||
return iter; //To change body of implemented methods use File | Settings | File Templates.
|
|
||||||
|
// return the iterator
|
||||||
|
return iter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,6 @@ public interface SimpleDataSource extends Serializable {
|
||||||
* @param location the genome location to extract data for
|
* @param location the genome location to extract data for
|
||||||
* @return an iterator of the appropriate type, that is limited by the region
|
* @return an iterator of the appropriate type, that is limited by the region
|
||||||
*/
|
*/
|
||||||
public Iterator seek(GenomeLoc location);
|
public Iterator seek(GenomeLoc location) throws SimpleDataSourceLoadException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,35 +10,30 @@
|
||||||
*/
|
*/
|
||||||
package org.broadinstitute.sting.gatk.iterators;
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
|
||||||
import edu.mit.broad.picard.sam.ReservedTagConstants;
|
|
||||||
import edu.mit.broad.picard.PicardException;
|
import edu.mit.broad.picard.PicardException;
|
||||||
|
import edu.mit.broad.picard.sam.ReservedTagConstants;
|
||||||
|
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||||
import edu.mit.broad.picard.util.PeekableIterator;
|
import edu.mit.broad.picard.util.PeekableIterator;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
|
|
||||||
/*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
|
||||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
|
|
||||||
* functionality.
|
|
||||||
*/
|
|
||||||
import java.util.*;
|
|
||||||
import java.lang.reflect.Constructor;
|
import java.lang.reflect.Constructor;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.PriorityQueue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides an iterator interface for merging multiple underlying iterators into a single
|
* Provides an iterator interface for merging multiple underlying iterators into a single
|
||||||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||||
* the requested output format is unsorted, in which case any combination is valid.
|
* the requested output format is unsorted, in which case any combination is valid.
|
||||||
*/
|
*/
|
||||||
public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
|
||||||
protected PriorityQueue<ComparableSamRecordIterator> pq;
|
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
|
||||||
protected final SamFileHeaderMerger samHeaderMerger;
|
protected final SamFileHeaderMerger samHeaderMerger;
|
||||||
protected final SAMFileHeader.SortOrder sortOrder;
|
protected final SAMFileHeader.SortOrder sortOrder;
|
||||||
|
|
||||||
|
protected boolean initialized = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||||
* provided by the header merger parameter.
|
* provided by the header merger parameter.
|
||||||
|
|
@ -46,8 +41,18 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
|
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
|
||||||
this.samHeaderMerger = headerMerger;
|
this.samHeaderMerger = headerMerger;
|
||||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||||
initializePQ();
|
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this class MUST only be initialized once; calling this after initialization throws an UnsupportedOperationException
|
||||||
|
*/
|
||||||
|
private void lazyInitialization() {
|
||||||
|
if (initialized) {
|
||||||
|
throw new UnsupportedOperationException("You cannot double initialize a MergingSamRecordIterator2");
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
final SAMRecordComparator comparator = getComparator();
|
final SAMRecordComparator comparator = getComparator();
|
||||||
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||||
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
|
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
|
||||||
|
|
@ -59,17 +64,15 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean supportsSeeking() {
|
||||||
protected void initializePQ() {
|
|
||||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean supportsSeeking() {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void queryOverlapping(final String contig, final int start, final int stop) {
|
public void queryOverlapping(final String contig, final int start, final int stop) {
|
||||||
initializePQ(); // reinitialize the system
|
if (initialized) {
|
||||||
|
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
final SAMRecordComparator comparator = getComparator();
|
final SAMRecordComparator comparator = getComparator();
|
||||||
|
|
||||||
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||||
|
|
@ -80,9 +83,13 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void query(final String contig, final int start, final int stop, final boolean contained) {
|
public void query(final String contig, final int start, final int stop, final boolean contained) {
|
||||||
initializePQ(); // reinitialize the system
|
if (initialized) {
|
||||||
|
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
final SAMRecordComparator comparator = getComparator();
|
final SAMRecordComparator comparator = getComparator();
|
||||||
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||||
|
//reader.close();
|
||||||
Iterator<SAMRecord> recordIter = reader.query(contig, start, stop, contained);
|
Iterator<SAMRecord> recordIter = reader.query(contig, start, stop, contained);
|
||||||
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
|
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
|
||||||
addIfNotEmpty(iterator);
|
addIfNotEmpty(iterator);
|
||||||
|
|
@ -90,7 +97,10 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void queryContained(final String contig, final int start, final int stop) {
|
public void queryContained(final String contig, final int start, final int stop) {
|
||||||
initializePQ(); // reinitialize the system
|
if (initialized) {
|
||||||
|
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
final SAMRecordComparator comparator = getComparator();
|
final SAMRecordComparator comparator = getComparator();
|
||||||
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||||
Iterator<SAMRecord> recordIter = reader.queryContained(contig, start, stop);
|
Iterator<SAMRecord> recordIter = reader.queryContained(contig, start, stop);
|
||||||
|
|
@ -106,6 +116,9 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
|
|
||||||
/** Returns the next record from the top most iterator during merging. */
|
/** Returns the next record from the top most iterator during merging. */
|
||||||
public synchronized SAMRecord next() {
|
public synchronized SAMRecord next() {
|
||||||
|
if (!initialized) {
|
||||||
|
lazyInitialization();
|
||||||
|
}
|
||||||
final ComparableSamRecordIterator iterator = this.pq.poll();
|
final ComparableSamRecordIterator iterator = this.pq.poll();
|
||||||
final SAMRecord record = iterator.next();
|
final SAMRecord record = iterator.next();
|
||||||
addIfNotEmpty(iterator);
|
addIfNotEmpty(iterator);
|
||||||
|
|
@ -186,6 +199,26 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
|
||||||
public SAMFileHeader getMergedHeader() {
|
public SAMFileHeader getMergedHeader() {
|
||||||
return this.samHeaderMerger.getMergedHeader();
|
return this.samHeaderMerger.getMergedHeader();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* closes all the file handles for the readers. DO THIS or you will run out of handles
|
||||||
|
* with sharding.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
for (SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* allows us to be used in the new style for loops
|
||||||
|
* @return this iterator, so it can be used directly in a for-each loop
|
||||||
|
*/
|
||||||
|
public Iterator<SAMRecord> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should replace picard class with the same name
|
// Should replace picard class with the same name
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue