Significant performance improvements made by subtracting out the contents of the prior highest-level bin.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2859 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-19 16:46:16 +00:00
parent 3e0e7aad2d
commit 71f18e941f
5 changed files with 105 additions and 17 deletions

View File

@@ -183,6 +183,101 @@ public class BAMFileIndex2 extends BAMFileIndex
}
}
/**
 * Perform an overlapping query of all bins bounding the given bin's location,
 * returning the file pointer spans ("chunks") to read, trimmed so that data
 * already covered by the closest preceding bin at the same level is excluded.
 *
 * @param bin The bin over which to perform an overlapping query; may be null.
 * @return The file pointers as a flattened array of alternating chunk
 *         start/end offsets (even index = start, odd index = end), or null
 *         if {@code bin} is null.
 */
long[] getFilePointersBounding(final Bin bin) {
// No bin: nothing to query.
if(bin == null)
return null;
final int referenceSequence = bin.referenceSequence;
final Bin[] allBins = referenceToBins.get(referenceSequence);
final int binLevel = getLevelForBinNumber(bin.binNumber);
final int firstLocusInBin = getFirstLocusInBin(bin);
// Build the "bin tree": this bin plus each enclosing lower-level bin that
// covers this bin's first locus, walking from this bin's level up to level 0.
List<Bin> binTree = new ArrayList<Bin>();
binTree.add(bin);
int currentBinLevel = binLevel;
while(--currentBinLevel >= 0) {
final int binStart = LEVEL_STARTS[currentBinLevel];
// Width (in loci) of a single bin at this level; the count of bins at a
// level is the difference between consecutive LEVEL_STARTS entries.
final int binWidth = BIN_SPAN/(LEVEL_STARTS[currentBinLevel+1]-LEVEL_STARTS[currentBinLevel]);
// Bin number, at this level, of the bin containing the query bin's first locus.
final int binNumber = firstLocusInBin/binWidth + binStart;
// Linear scan: only bins actually present in the index appear in allBins,
// so the ancestor may be absent, in which case nothing is added.
for(Bin referenceBin: allBins) {
if(binNumber == referenceBin.binNumber)
binTree.add(referenceBin);
}
}
// Gather the chunks of every covering bin into one list.
List<Chunk> chunkList = new ArrayList<Chunk>();
for(Bin coveringBin: binTree)
chunkList.addAll(binToChunks.get(coveringBin));
// Find the nearest adjacent bin. This can act as a minimum offset
// (the same-level bin with the largest bin number still below this bin's).
Bin closestAdjacentBin = null;
for(Bin adjacentBin: allBins) {
if(getLevelForBinNumber(adjacentBin.binNumber) != binLevel)
continue;
if(adjacentBin.binNumber<bin.binNumber && (closestAdjacentBin == null || closestAdjacentBin.binNumber < adjacentBin.binNumber))
closestAdjacentBin = adjacentBin;
}
// Find the offset of the closest bin: the largest chunk-end offset of the
// preceding bin. Everything at or before this offset was covered by the
// prior bin's query and can be trimmed from this one.
long adjacentBinOffset = 0;
if(closestAdjacentBin != null) {
for(Chunk chunk: binToChunks.get(closestAdjacentBin)) {
if(adjacentBinOffset < chunk.getChunkEnd())
adjacentBinOffset = chunk.getChunkEnd();
}
}
// Consult the linear index for a minimum file offset for this bin's first
// locus, then let optimizeChunkList drop/merge chunks below that offset.
final int start = getFirstLocusInBin(bin)-1;
final int regionLinearBin = start >> BAM_LIDX_SHIFT;
LinearIndex index = referenceToLinearIndices.get(referenceSequence);
long minimumOffset = 0;
if (regionLinearBin < index.indexEntries.length)
minimumOffset = index.indexEntries[regionLinearBin];
chunkList = optimizeChunkList(chunkList, minimumOffset);
// Flattened representation: even indices are chunk starts, odd indices are chunk ends.
long[] chunkArray = convertToArray(chunkList);
// Trim off anything before the first desired bin.
int location = Arrays.binarySearch(chunkArray,adjacentBinOffset);
// location not found, but insertion point was determined.
long trimmedChunkArray[] = chunkArray;
// If the location of the element is in an even bucket (a start position), trim everything before it.
if(location >= 0) {
if(location%2==0) {
// Offset matches a chunk start: keep that chunk and everything after it.
trimmedChunkArray = new long[chunkArray.length-location];
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
}
else {
// Offset matches a chunk end: that chunk is fully covered by the
// preceding bin, so drop it and keep only the chunks after it.
trimmedChunkArray = new long[chunkArray.length-location-1];
System.arraycopy(chunkArray,location+1,trimmedChunkArray,0,trimmedChunkArray.length);
}
}
else {
// Exact offset not present; recover the insertion point from the
// negative binarySearch return value: -(insertion point) - 1.
location = -(location+1);
if(location < chunkArray.length) {
if(location%2==0) {
// Insertion point falls between chunks: keep from 'location' onward unchanged.
trimmedChunkArray = new long[chunkArray.length-location];
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
}
else {
// Insertion point falls inside a chunk: clip that chunk by replacing
// its start with adjacentBinOffset, then keep its end and all later entries.
trimmedChunkArray = new long[chunkArray.length-location+1];
trimmedChunkArray[0] = adjacentBinOffset;
System.arraycopy(chunkArray,location,trimmedChunkArray,1,trimmedChunkArray.length-1);
}
}
}
return trimmedChunkArray;
}
/**
* Get list of regions of BAM file that may contain SAMRecords for the given range
* @param referenceIndex sequence of desired SAMRecords

View File

@@ -194,14 +194,9 @@ class BAMFileReader2
return bins;
}
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
final SAMFileHeader fileHeader = getFileHeader();
long[] filePointers = null;
int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMFileIndex2 fileIndex = getFileIndex();
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
}
public List<Chunk> getFilePointersBounding(final Bin bin) {
final BAMFileIndex2 fileIndex = getFileIndex();
long[] filePointers = fileIndex.getFilePointersBounding(bin);
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
}

View File

@@ -147,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
return reader.getOverlappingBins(sequence,start,end);
}
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
public List<Chunk> getFilePointersBounding(final Bin bin) {
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
return reader.getFilePointersBounding(sequence,start,end);
return reader.getFilePointersBounding(bin);
}
private Field getField(String fieldName) {

View File

@@ -127,7 +127,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
start = Math.min(loc.getStart(),start);
stop = Math.max(loc.getStop(),stop);
}
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
}

View File

@@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.JVMUtils;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
@@ -70,14 +69,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
/**
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
* @param locus The loci for which to load data.
* @param bin The bin for which to load data.
* @return A map of the file pointers bounding the bin.
*/
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(Bin bin) {
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
for(SAMFileReader reader: headerMerger.getReaders()) {
SAMFileReader2 reader2 = (SAMFileReader2)reader;
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
}
return filePointers;
}
@@ -154,8 +153,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
Map<SAMFileReader,CloseableIterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();
for(Map.Entry<SAMFileReader2,List<Chunk>> chunksByReader: bamAwareShard.getChunks().entrySet()) {
SAMFileReader2 reader = chunksByReader.getKey();
GenomeLoc bounds = bamAwareShard.getBounds();
readerToIteratorMap.put(reader,reader.queryOverlapping(bounds.getContig(),(int)bounds.getStart(),(int)bounds.getStop()));
readerToIteratorMap.put(reader,reader.iterator(bamAwareShard.getChunks().get(reader)));
}
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.