Significant performance improvements made by subtracting out the contents of the prior highest-level bin.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2859 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3e0e7aad2d
commit
71f18e941f
|
|
@ -183,6 +183,101 @@ public class BAMFileIndex2 extends BAMFileIndex
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform an overlapping query of all bins bounding the given location.
|
||||
* @param bin The bin over which to perform an overlapping query.
|
||||
* @return The file pointers
|
||||
*/
|
||||
long[] getFilePointersBounding(final Bin bin) {
|
||||
if(bin == null)
|
||||
return null;
|
||||
|
||||
final int referenceSequence = bin.referenceSequence;
|
||||
final Bin[] allBins = referenceToBins.get(referenceSequence);
|
||||
|
||||
final int binLevel = getLevelForBinNumber(bin.binNumber);
|
||||
final int firstLocusInBin = getFirstLocusInBin(bin);
|
||||
|
||||
List<Bin> binTree = new ArrayList<Bin>();
|
||||
binTree.add(bin);
|
||||
|
||||
int currentBinLevel = binLevel;
|
||||
while(--currentBinLevel >= 0) {
|
||||
final int binStart = LEVEL_STARTS[currentBinLevel];
|
||||
final int binWidth = BIN_SPAN/(LEVEL_STARTS[currentBinLevel+1]-LEVEL_STARTS[currentBinLevel]);
|
||||
final int binNumber = firstLocusInBin/binWidth + binStart;
|
||||
for(Bin referenceBin: allBins) {
|
||||
if(binNumber == referenceBin.binNumber)
|
||||
binTree.add(referenceBin);
|
||||
}
|
||||
}
|
||||
|
||||
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||
for(Bin coveringBin: binTree)
|
||||
chunkList.addAll(binToChunks.get(coveringBin));
|
||||
|
||||
// Find the nearest adjacent bin. This can act as a minimum offset
|
||||
Bin closestAdjacentBin = null;
|
||||
for(Bin adjacentBin: allBins) {
|
||||
if(getLevelForBinNumber(adjacentBin.binNumber) != binLevel)
|
||||
continue;
|
||||
if(adjacentBin.binNumber<bin.binNumber && (closestAdjacentBin == null || closestAdjacentBin.binNumber < adjacentBin.binNumber))
|
||||
closestAdjacentBin = adjacentBin;
|
||||
}
|
||||
|
||||
// Find the offset of the closest bin.
|
||||
long adjacentBinOffset = 0;
|
||||
if(closestAdjacentBin != null) {
|
||||
for(Chunk chunk: binToChunks.get(closestAdjacentBin)) {
|
||||
if(adjacentBinOffset < chunk.getChunkEnd())
|
||||
adjacentBinOffset = chunk.getChunkEnd();
|
||||
}
|
||||
}
|
||||
|
||||
final int start = getFirstLocusInBin(bin)-1;
|
||||
final int regionLinearBin = start >> BAM_LIDX_SHIFT;
|
||||
LinearIndex index = referenceToLinearIndices.get(referenceSequence);
|
||||
long minimumOffset = 0;
|
||||
if (regionLinearBin < index.indexEntries.length)
|
||||
minimumOffset = index.indexEntries[regionLinearBin];
|
||||
|
||||
chunkList = optimizeChunkList(chunkList, minimumOffset);
|
||||
long[] chunkArray = convertToArray(chunkList);
|
||||
|
||||
// Trim off anything before the first desired bin.
|
||||
int location = Arrays.binarySearch(chunkArray,adjacentBinOffset);
|
||||
// location not found, but insertion point was determined.
|
||||
long trimmedChunkArray[] = chunkArray;
|
||||
|
||||
// If the location of the element is in an even bucket (a start position), trim everything before it.
|
||||
if(location >= 0) {
|
||||
if(location%2==0) {
|
||||
trimmedChunkArray = new long[chunkArray.length-location];
|
||||
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||
}
|
||||
else {
|
||||
trimmedChunkArray = new long[chunkArray.length-location-1];
|
||||
System.arraycopy(chunkArray,location+1,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||
}
|
||||
}
|
||||
else {
|
||||
location = -(location+1);
|
||||
if(location < chunkArray.length) {
|
||||
if(location%2==0) {
|
||||
trimmedChunkArray = new long[chunkArray.length-location];
|
||||
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||
}
|
||||
else {
|
||||
trimmedChunkArray = new long[chunkArray.length-location+1];
|
||||
trimmedChunkArray[0] = adjacentBinOffset;
|
||||
System.arraycopy(chunkArray,location,trimmedChunkArray,1,trimmedChunkArray.length-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return trimmedChunkArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of regions of BAM file that may contain SAMRecords for the given range
|
||||
* @param referenceIndex sequence of desired SAMRecords
|
||||
|
|
|
|||
|
|
@ -194,14 +194,9 @@ class BAMFileReader2
|
|||
return bins;
|
||||
}
|
||||
|
||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
long[] filePointers = null;
|
||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
|
||||
}
|
||||
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||
long[] filePointers = fileIndex.getFilePointersBounding(bin);
|
||||
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -147,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
|
|||
return reader.getOverlappingBins(sequence,start,end);
|
||||
}
|
||||
|
||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||
return reader.getFilePointersBounding(sequence,start,end);
|
||||
return reader.getFilePointersBounding(bin);
|
||||
}
|
||||
|
||||
private Field getField(String fieldName) {
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
start = Math.min(loc.getStart(),start);
|
||||
stop = Math.max(loc.getStop(),stop);
|
||||
}
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.JVMUtils;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
|
|
@ -70,14 +69,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
|||
|
||||
/**
|
||||
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
||||
* @param locus The loci for which to load data.
|
||||
* @param bin The bin for which to load data.
|
||||
* @return A map of the file pointers bounding the bin.
|
||||
*/
|
||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
|
||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(Bin bin) {
|
||||
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||
for(SAMFileReader reader: headerMerger.getReaders()) {
|
||||
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
||||
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
|
||||
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
|
||||
}
|
||||
return filePointers;
|
||||
}
|
||||
|
|
@ -154,8 +153,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
|||
Map<SAMFileReader,CloseableIterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();
|
||||
for(Map.Entry<SAMFileReader2,List<Chunk>> chunksByReader: bamAwareShard.getChunks().entrySet()) {
|
||||
SAMFileReader2 reader = chunksByReader.getKey();
|
||||
GenomeLoc bounds = bamAwareShard.getBounds();
|
||||
readerToIteratorMap.put(reader,reader.queryOverlapping(bounds.getContig(),(int)bounds.getStart(),(int)bounds.getStop()));
|
||||
readerToIteratorMap.put(reader,reader.iterator(bamAwareShard.getChunks().get(reader)));
|
||||
}
|
||||
|
||||
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
||||
|
|
|
|||
Loading…
Reference in New Issue