Added a bunch of fixes: MSRI wasn't working, sharding had broken edge cases, and SAMBAM DS needed to close the file handles.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-09 00:20:15 +00:00
parent 8efedacabf
commit 12752cf893
4 changed files with 100 additions and 46 deletions

View File

@ -115,19 +115,19 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
public Shard next() {
// let's get some background info on the problem
long length = dic.getSequence(seqLoc).getSequenceLength();
long proposedSize = nextShardSize() - 1;
long proposedSize = nextShardSize();
long nextStart = mLoc.getStop() + 1;
// can we fit it into the current seq size?
if (nextStart + proposedSize < length) {
if (nextStart + proposedSize - 1 < length) {
lastGenomeLocSize = proposedSize;
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize));
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1);
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1));
}
// else we can't make it in the current location, we have to stitch one together
else {
long overflow = nextStart + proposedSize - length;
long overflow = nextStart + proposedSize -1 - length;
logger.debug("Overflow = " + overflow + " length: " + length);
lastGenomeLocSize = lastGenomeLocSize - overflow;
lastGenomeLocSize = proposedSize - overflow;
// move to the next contig
// the next sequence should start at the beginning of the next contig
Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize));

View File

@ -37,35 +37,31 @@ public class SAMBAMDataSource implements SimpleDataSource {
// do we care whether the SAM files respect the sort order?
private boolean matchedSortOrders = true;
// our merged sam iterator for splitting up the files
MergingSamRecordIterator2 mergeIterator;
// are we set to locus mode or read mode for dividing
private boolean locusMode = true;
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
// our list of SAM/BAM files; readers are created from these on demand
private final List<File> samFileList = new ArrayList<File>();
/**
* constructor, given a list of sam file names
*
* @param samFiles the list of sam files
*/
public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
for (String fileName : samFiles) {
File smFile = new File(fileName);
if (!smFile.canRead()) {
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
}
SAMFileReader reader = initializeSAMFile(smFile);
if (reader != null) {
readers.add(reader);
}
samFileList.add(smFile);
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
this.mergeIterator = new MergingSamRecordIterator2(headerMerger);
//SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER);
}
@ -73,6 +69,7 @@ public class SAMBAMDataSource implements SimpleDataSource {
if (samFile.toString().endsWith(".list")) {
return null;
} else {
System.err.println("initializeSAMFile");
SAMFileReader samReader = new SAMFileReader(samFile, true);
samReader.setValidationStringency(strictness);
@ -96,19 +93,43 @@ public class SAMBAMDataSource implements SimpleDataSource {
/**
* <p>
* getQueryRegionIterator
* seek
* </p>
*
* @param location the genome location to extract data for
* @return an iterator for that region
*/
public MergingSamRecordIterator2 seek(GenomeLoc location) {
MergingSamRecordIterator2 iter = null; // new MergingSamRecordIterator2(this.mergeIterator.getMergedHeader().);
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
// right now this is pretty heavy: it opens a fresh reader for every file in the list on each call
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
for (File f : this.samFileList) {
SAMFileReader reader = initializeSAMFile(f);
if (reader == null) {
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + f);
}
lst.add(reader);
}
// now merge the headers
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
System.err.println("About to query");
// we do different things for locus and read modes
if (locusMode) {
iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true);
} else {
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
}
return iter; //To change body of implemented methods use File | Settings | File Templates.
// return the iterator
return iter;
}
}

View File

@ -32,6 +32,6 @@ public interface SimpleDataSource extends Serializable {
* @param location the genome location to extract data for
* @return an iterator of the appropriate type, that is limited by the region
*/
public Iterator seek(GenomeLoc location);
public Iterator seek(GenomeLoc location) throws SimpleDataSourceLoadException;
}

View File

@ -10,35 +10,30 @@
*/
package org.broadinstitute.sting.gatk.iterators;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.sam.ReservedTagConstants;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.sam.ReservedTagConstants;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.util.PeekableIterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
* functionality.
*/
import java.util.*;
import java.lang.reflect.Constructor;
import java.util.Comparator;
import java.util.Iterator;
import java.util.PriorityQueue;
/**
* Provides an iterator interface for merging multiple underlying iterators into a single
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
protected PriorityQueue<ComparableSamRecordIterator> pq;
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder;
protected boolean initialized = false;
/**
* Constructs a new merging iterator with the same set of readers and sort order as
* provided by the header merger parameter.
@ -46,8 +41,18 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
this.samHeaderMerger = headerMerger;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
initializePQ();
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
}
/**
* this class MUST only be initialized once; a second initialization attempt throws an UnsupportedOperationException
*/
private void lazyInitialization() {
if (initialized) {
throw new UnsupportedOperationException("You cannot double initialize a MergingSamRecordIterator2");
}
initialized = true;
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
@ -59,17 +64,15 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
}
}
protected void initializePQ() {
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
}
public boolean supportsSeeking() {
public boolean supportsSeeking() {
return true;
}
public void queryOverlapping(final String contig, final int start, final int stop) {
initializePQ(); // reinitialize the system
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
initialized = true;
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
@ -80,9 +83,13 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
}
public void query(final String contig, final int start, final int stop, final boolean contained) {
initializePQ(); // reinitialize the system
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
initialized = true;
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
//reader.close();
Iterator<SAMRecord> recordIter = reader.query(contig, start, stop, contained);
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
@ -90,7 +97,10 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
}
public void queryContained(final String contig, final int start, final int stop) {
initializePQ(); // reinitialize the system
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
initialized = true;
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.queryContained(contig, start, stop);
@ -106,6 +116,9 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
/** Returns the next record from the top most iterator during merging. */
public synchronized SAMRecord next() {
if (!initialized) {
lazyInitialization();
}
final ComparableSamRecordIterator iterator = this.pq.poll();
final SAMRecord record = iterator.next();
addIfNotEmpty(iterator);
@ -186,6 +199,26 @@ public class MergingSamRecordIterator2 implements Iterator<SAMRecord> {
public SAMFileHeader getMergedHeader() {
return this.samHeaderMerger.getMergedHeader();
}
/**
 * Closes the file handle held by each reader behind this iterator.
 * DO THIS once you are finished with the iterator, or you will run out of
 * file handles when sharding.
 */
public void close() {
    final Iterator<? extends SAMFileReader> openReaders = samHeaderMerger.getReaders().iterator();
    while (openReaders.hasNext()) {
        openReaders.next().close();
    }
}
/**
* allows us to be used in the new style for loops
*
* @return this same object; no new iterator is created, so repeated calls
*         all hand back the one underlying traversal
*/
public Iterator<SAMRecord> iterator() {
return this;
}
}
// Should replace picard class with the same name