Switched over to reviewed version of Picard patch. In process, did some optimization to the IntervalSharder

which improved startup time 5-10x when dynamically merging many BAMs.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3331 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-05-08 14:12:22 +00:00
parent d6b036cdab
commit 76efa757f0
13 changed files with 131 additions and 86 deletions

View File

@ -95,9 +95,9 @@ public class BAMFileStat extends CommandLineProgram {
inspector.inspect(System.out,null,null);
}
private class BAMFileIndexContentInspector extends CachingBAMFileIndex {
private class BAMFileIndexContentInspector /*extends CachingBAMFileIndex*/ {
public BAMFileIndexContentInspector(File bamFileIndex) {
super(bamFileIndex);
// super(bamFileIndex);
}
public void inspect(PrintStream outputStream, Integer startPosition, Integer stopPosition) {

View File

@ -29,6 +29,7 @@ import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.apache.log4j.Logger;
import java.util.*;
@ -42,6 +43,8 @@ import net.sf.picard.util.PeekableIterator;
* @version 0.1
*/
public class IntervalSharder {
private static Logger logger = Logger.getLogger(IntervalSharder.class);
public static Iterator<FilePointer> shardIntervals(final BlockDrivenSAMDataSource dataSource, final List<GenomeLoc> loci) {
return new FilePointerIterator(dataSource,loci);
}
@ -101,13 +104,13 @@ public class IntervalSharder {
FilePointer lastFilePointer = null;
BAMOverlap lastBAMOverlap = null;
Map<SAMReaderID,CachingBAMFileIndex> readerToIndexMap = new HashMap<SAMReaderID,CachingBAMFileIndex>();
Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
BinMergingIterator binMerger = new BinMergingIterator();
for(SAMReaderID id: dataSource.getReaderIDs()) {
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
if(referenceSequence == null)
continue;
final CachingBAMFileIndex index = dataSource.getIndex(id);
final BrowseableBAMIndex index = dataSource.getIndex(id);
binMerger.addReader(id,
index,
referenceSequence.getSequenceIndex(),
@ -115,6 +118,7 @@ public class IntervalSharder {
// Cache the reader for later data lookup.
readerToIndexMap.put(id,index);
}
PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
for(GenomeLoc location: loci) {
@ -201,10 +205,9 @@ public class IntervalSharder {
// Lookup the locations for every file pointer in the index.
for(SAMReaderID id: readerToIndexMap.keySet()) {
CachingBAMFileIndex index = readerToIndexMap.get(id);
BrowseableBAMIndex index = readerToIndexMap.get(id);
for(FilePointer filePointer: filePointers)
filePointer.addFileSpans(id,index.getChunksOverlapping(filePointer.overlap.getBin(id)));
index.close();
filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
}
return filePointers;
@ -214,7 +217,7 @@ public class IntervalSharder {
private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
public void addReader(final SAMReaderID id, final CachingBAMFileIndex index, final int referenceSequence, Iterator<Bin> bins) {
public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
binQueue.add(new BinQueueState(id,index,referenceSequence,new LowestLevelBinFilteringIterator(index,bins)));
}
@ -292,85 +295,31 @@ public class IntervalSharder {
if(binQueue.isEmpty())
throw new NoSuchElementException("No more bins are available");
BinQueueState current = binQueue.peek();
return new ReaderBin(current.id,current.index,current.referenceSequence,current.bins.peek());
return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
}
private ReaderBin getNextBin() {
if(binQueue.isEmpty())
throw new NoSuchElementException("No more bins are available");
BinQueueState current = binQueue.remove();
ReaderBin readerBin = new ReaderBin(current.id,current.index,current.referenceSequence,current.bins.next());
if(current.bins.hasNext())
ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
if(current.hasNextBin())
binQueue.add(current);
return readerBin;
}
private class ReaderBin {
public final SAMReaderID id;
public final CachingBAMFileIndex index;
public final int referenceSequence;
public final Bin bin;
public ReaderBin(final SAMReaderID id, final CachingBAMFileIndex index, final int referenceSequence, final Bin bin) {
this.id = id;
this.index = index;
this.referenceSequence = referenceSequence;
this.bin = bin;
}
public int getStart() {
return index.getFirstLocusInBin(bin);
}
public int getStop() {
return index.getLastLocusInBin(bin);
}
}
private class BinQueueState implements Comparable<BinQueueState> {
public final SAMReaderID id;
public final CachingBAMFileIndex index;
public final int referenceSequence;
public final PeekableIterator<Bin> bins;
public BinQueueState(final SAMReaderID id, final CachingBAMFileIndex index, final int referenceSequence, final Iterator<Bin> bins) {
this.id = id;
this.index = index;
this.referenceSequence = referenceSequence;
this.bins = new PeekableIterator<Bin>(bins);
}
public int compareTo(BinQueueState other) {
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
if(!this.bins.hasNext()) return -1;
if(!this.bins.hasNext()) return 1;
int thisStart = this.index.getFirstLocusInBin(this.bins.peek());
int otherStart = other.index.getFirstLocusInBin(other.bins.peek());
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
if(thisStart != otherStart)
return thisStart - otherStart;
int thisStop = this.index.getLastLocusInBin(this.bins.peek());
int otherStop = other.index.getLastLocusInBin(other.bins.peek());
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
return thisStop - otherStop;
}
}
}
/**
* Filters out bins not at the lowest level in the tree.
*/
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
private CachingBAMFileIndex index;
private BrowseableBAMIndex index;
private Iterator<Bin> wrappedIterator;
private Bin nextBin;
public LowestLevelBinFilteringIterator(final CachingBAMFileIndex index, Iterator<Bin> iterator) {
public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
this.index = index;
this.wrappedIterator = iterator;
advance();
@ -396,7 +345,7 @@ public class IntervalSharder {
nextBin = bin;
}
}
}
}
}
/**
@ -452,5 +401,101 @@ class BAMOverlap {
}
}
class ReaderBin {
public final SAMReaderID id;
public final BrowseableBAMIndex index;
public final int referenceSequence;
public final Bin bin;
public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
this.id = id;
this.index = index;
this.referenceSequence = referenceSequence;
this.bin = bin;
}
public int getStart() {
return index.getFirstLocusInBin(bin);
}
public int getStop() {
return index.getLastLocusInBin(bin);
}
}
class BinQueueState implements Comparable<BinQueueState> {
private final SAMReaderID id;
private final BrowseableBAMIndex index;
private final int referenceSequence;
private final PeekableIterator<Bin> bins;
private int firstLocusInCurrentBin;
private int lastLocusInCurrentBin;
public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
this.id = id;
this.index = index;
this.referenceSequence = referenceSequence;
this.bins = new PeekableIterator<Bin>(bins);
refreshLocusInBinCache();
}
public SAMReaderID getReaderID() {
return id;
}
public BrowseableBAMIndex getIndex() {
return index;
}
public int getReferenceSequence() {
return referenceSequence;
}
public boolean hasNextBin() {
return bins.hasNext();
}
public Bin peekNextBin() {
return bins.peek();
}
public Bin nextBin() {
Bin nextBin = bins.next();
refreshLocusInBinCache();
return nextBin;
}
public int compareTo(BinQueueState other) {
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
if(!this.bins.hasNext()) return -1;
if(!this.bins.hasNext()) return 1;
// Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
throw new StingException("Sharding mechanism error - bin->locus cache is invalid.");
}
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
}
private void refreshLocusInBinCache() {
firstLocusInCurrentBin = -1;
lastLocusInCurrentBin = -1;
if(bins.hasNext()) {
Bin bin = bins.peek();
firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
lastLocusInCurrentBin = index.getLastLocusInBin(bin);
}
}
}

View File

@ -129,9 +129,9 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
* @param id Id of the reader.
* @return The index. Will preload the index if necessary.
*/
public CachingBAMFileIndex getIndex(final SAMReaderID id) {
public BrowseableBAMIndex getIndex(final SAMReaderID id) {
SAMReaders readers = resourcePool.getReadersWithoutLocking();
return readers.getReader(id).getIndex(CachingBAMFileIndex.class);
return readers.getReader(id).getBrowseableIndex();
}
/**
@ -181,7 +181,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
* @return ID of the reader.
*/
public SAMReaderID getReaderID(SAMRecord read) {
return resourcePool.getReaderID(read.getReader());
return resourcePool.getReaderID(read.getFileSource().getReader());
}
/**
@ -191,7 +191,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
* @param read The read to add to the shard.
*/
private void addReadToBufferingShard(BAMFormatAwareShard shard,SAMReaderID id,SAMRecord read) {
SAMFileSpan endChunk = read.getFilePointer().getContentsFollowing();
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
shard.addRead(read);
readerPositions.put(id,endChunk);
}
@ -204,7 +204,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
*/
private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) {
for(SAMReaderID id: getReaderIDs()) {
if(readers.getReader(id) == read.getReader())
if(readers.getReader(id) == read.getFileSource().getReader())
return id;
}
throw new StingException("Unable to find id for reader associated with read " + read.getReadName());
@ -405,7 +405,9 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
*/
public SAMReaders(Reads sourceInfo) {
for(File readsFile: sourceInfo.getReadsFiles()) {
SAMFileReader reader = new SAMFileReader(readsFile,CachingBAMFileIndex.class,true);
SAMFileReader reader = new SAMFileReader(readsFile,true);
reader.enableFileSource(true);
reader.enableIndexCaching(true);
reader.setValidationStringency(sourceInfo.getValidationStringency());
// If no read group is present, hallucinate one.

View File

@ -126,8 +126,8 @@ public class IndexDrivenSAMDataSource extends SAMDataSource {
* @return ID of the reader.
*/
public SAMReaderID getReaderID(SAMRecord read) {
if(resourcePool.readerToIDMap.containsKey(read.getReader()))
return resourcePool.readerToIDMap.get(read.getReader());
if(resourcePool.readerToIDMap.containsKey(read.getFileSource().getReader()))
return resourcePool.readerToIDMap.get(read.getFileSource().getReader());
throw new StingException("Unable to find reader id for record.");
}

View File

@ -79,8 +79,8 @@ public class ReadFormattingIterator implements StingSAMIterator {
// if we don't have a read group, set one.
// TODO: Straw poll to see whether this is really required.
if (read.getAttribute(SAMTag.RG.toString()) == null && read.getReader() != null) {
List<SAMReadGroupRecord> readGroups = read.getReader().getFileHeader().getReadGroups();
if (read.getAttribute(SAMTag.RG.toString()) == null && read.getFileSource() != null && read.getFileSource().getReader() != null) {
List<SAMReadGroupRecord> readGroups = read.getFileSource().getReader().getFileHeader().getReadGroups();
if (readGroups.size() == 1) {
read.setAttribute(SAMTag.RG.toString(), readGroups.get(0).getReadGroupId());
read.setAttribute(SAMTag.SM.toString(), readGroups.get(0).getReadGroupId());

View File

@ -315,8 +315,6 @@ public class GATKSAMRecord extends SAMRecord {
public void setValidationStringency(net.sf.samtools.SAMFileReader.ValidationStringency validationStringency) { mRecord.setValidationStringency(validationStringency); }
public SAMFileReader getReader() { return mRecord.getReader(); }
public SAMFileHeader getHeader() { return mRecord.getHeader(); }
public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); }
@ -341,5 +339,5 @@ public class GATKSAMRecord extends SAMRecord {
public String toString() { return mRecord.toString(); }
public SAMFileSpan getFilePointer() { return mRecord.getFilePointer(); }
public SAMFileSource getFileSource() { return mRecord.getFileSource(); }
}

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1348-sharding" status="integration" publication="20100412142400" />
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1377-sharding" status="integration" publication="20100507172500" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.17.373-sharding" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.19.395-sharding" status="release" />
</ivy-module>

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.17.373-sharding" status="release" />
<info organisation="net.sf" module="sam" revision="1.19.395-sharding" status="release" />
</ivy-module>