Significant performance improvements made by subtracting out the contents of the prior highest-level bin.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2859 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3e0e7aad2d
commit
71f18e941f
|
|
@ -183,6 +183,101 @@ public class BAMFileIndex2 extends BAMFileIndex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform an overlapping query of all bins bounding the given location.
|
||||||
|
* @param bin The bin over which to perform an overlapping query.
|
||||||
|
* @return The file pointers
|
||||||
|
*/
|
||||||
|
long[] getFilePointersBounding(final Bin bin) {
|
||||||
|
if(bin == null)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
final int referenceSequence = bin.referenceSequence;
|
||||||
|
final Bin[] allBins = referenceToBins.get(referenceSequence);
|
||||||
|
|
||||||
|
final int binLevel = getLevelForBinNumber(bin.binNumber);
|
||||||
|
final int firstLocusInBin = getFirstLocusInBin(bin);
|
||||||
|
|
||||||
|
List<Bin> binTree = new ArrayList<Bin>();
|
||||||
|
binTree.add(bin);
|
||||||
|
|
||||||
|
int currentBinLevel = binLevel;
|
||||||
|
while(--currentBinLevel >= 0) {
|
||||||
|
final int binStart = LEVEL_STARTS[currentBinLevel];
|
||||||
|
final int binWidth = BIN_SPAN/(LEVEL_STARTS[currentBinLevel+1]-LEVEL_STARTS[currentBinLevel]);
|
||||||
|
final int binNumber = firstLocusInBin/binWidth + binStart;
|
||||||
|
for(Bin referenceBin: allBins) {
|
||||||
|
if(binNumber == referenceBin.binNumber)
|
||||||
|
binTree.add(referenceBin);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||||
|
for(Bin coveringBin: binTree)
|
||||||
|
chunkList.addAll(binToChunks.get(coveringBin));
|
||||||
|
|
||||||
|
// Find the nearest adjacent bin. This can act as a minimum offset
|
||||||
|
Bin closestAdjacentBin = null;
|
||||||
|
for(Bin adjacentBin: allBins) {
|
||||||
|
if(getLevelForBinNumber(adjacentBin.binNumber) != binLevel)
|
||||||
|
continue;
|
||||||
|
if(adjacentBin.binNumber<bin.binNumber && (closestAdjacentBin == null || closestAdjacentBin.binNumber < adjacentBin.binNumber))
|
||||||
|
closestAdjacentBin = adjacentBin;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the offset of the closest bin.
|
||||||
|
long adjacentBinOffset = 0;
|
||||||
|
if(closestAdjacentBin != null) {
|
||||||
|
for(Chunk chunk: binToChunks.get(closestAdjacentBin)) {
|
||||||
|
if(adjacentBinOffset < chunk.getChunkEnd())
|
||||||
|
adjacentBinOffset = chunk.getChunkEnd();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final int start = getFirstLocusInBin(bin)-1;
|
||||||
|
final int regionLinearBin = start >> BAM_LIDX_SHIFT;
|
||||||
|
LinearIndex index = referenceToLinearIndices.get(referenceSequence);
|
||||||
|
long minimumOffset = 0;
|
||||||
|
if (regionLinearBin < index.indexEntries.length)
|
||||||
|
minimumOffset = index.indexEntries[regionLinearBin];
|
||||||
|
|
||||||
|
chunkList = optimizeChunkList(chunkList, minimumOffset);
|
||||||
|
long[] chunkArray = convertToArray(chunkList);
|
||||||
|
|
||||||
|
// Trim off anything before the first desired bin.
|
||||||
|
int location = Arrays.binarySearch(chunkArray,adjacentBinOffset);
|
||||||
|
// location not found, but insertion point was determined.
|
||||||
|
long trimmedChunkArray[] = chunkArray;
|
||||||
|
|
||||||
|
// If the location of the element is in an even bucket (a start position), trim everything before it.
|
||||||
|
if(location >= 0) {
|
||||||
|
if(location%2==0) {
|
||||||
|
trimmedChunkArray = new long[chunkArray.length-location];
|
||||||
|
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
trimmedChunkArray = new long[chunkArray.length-location-1];
|
||||||
|
System.arraycopy(chunkArray,location+1,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
location = -(location+1);
|
||||||
|
if(location < chunkArray.length) {
|
||||||
|
if(location%2==0) {
|
||||||
|
trimmedChunkArray = new long[chunkArray.length-location];
|
||||||
|
System.arraycopy(chunkArray,location,trimmedChunkArray,0,trimmedChunkArray.length);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
trimmedChunkArray = new long[chunkArray.length-location+1];
|
||||||
|
trimmedChunkArray[0] = adjacentBinOffset;
|
||||||
|
System.arraycopy(chunkArray,location,trimmedChunkArray,1,trimmedChunkArray.length-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return trimmedChunkArray;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get list of regions of BAM file that may contain SAMRecords for the given range
|
* Get list of regions of BAM file that may contain SAMRecords for the given range
|
||||||
* @param referenceIndex sequence of desired SAMRecords
|
* @param referenceIndex sequence of desired SAMRecords
|
||||||
|
|
|
||||||
|
|
@ -194,14 +194,9 @@ class BAMFileReader2
|
||||||
return bins;
|
return bins;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||||
final SAMFileHeader fileHeader = getFileHeader();
|
|
||||||
long[] filePointers = null;
|
|
||||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
|
||||||
if (referenceIndex != -1) {
|
|
||||||
final BAMFileIndex2 fileIndex = getFileIndex();
|
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||||
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
|
long[] filePointers = fileIndex.getFilePointersBounding(bin);
|
||||||
}
|
|
||||||
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
|
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -147,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
|
||||||
return reader.getOverlappingBins(sequence,start,end);
|
return reader.getOverlappingBins(sequence,start,end);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||||
return reader.getFilePointersBounding(sequence,start,end);
|
return reader.getFilePointersBounding(bin);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Field getField(String fieldName) {
|
private Field getField(String fieldName) {
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
start = Math.min(loc.getStart(),start);
|
start = Math.min(loc.getStart(),start);
|
||||||
stop = Math.max(loc.getStop(),stop);
|
stop = Math.max(loc.getStop(),stop);
|
||||||
}
|
}
|
||||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
|
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||||
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.JVMUtils;
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||||
|
|
@ -70,14 +69,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
||||||
* @param locus The loci for which to load data.
|
* @param bin The bin for which to load data.
|
||||||
* @return A map of the file pointers bounding the bin.
|
* @return A map of the file pointers bounding the bin.
|
||||||
*/
|
*/
|
||||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
|
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(Bin bin) {
|
||||||
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||||
for(SAMFileReader reader: headerMerger.getReaders()) {
|
for(SAMFileReader reader: headerMerger.getReaders()) {
|
||||||
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
||||||
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
|
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
|
||||||
}
|
}
|
||||||
return filePointers;
|
return filePointers;
|
||||||
}
|
}
|
||||||
|
|
@ -154,8 +153,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
Map<SAMFileReader,CloseableIterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();
|
Map<SAMFileReader,CloseableIterator<SAMRecord>> readerToIteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();
|
||||||
for(Map.Entry<SAMFileReader2,List<Chunk>> chunksByReader: bamAwareShard.getChunks().entrySet()) {
|
for(Map.Entry<SAMFileReader2,List<Chunk>> chunksByReader: bamAwareShard.getChunks().entrySet()) {
|
||||||
SAMFileReader2 reader = chunksByReader.getKey();
|
SAMFileReader2 reader = chunksByReader.getKey();
|
||||||
GenomeLoc bounds = bamAwareShard.getBounds();
|
readerToIteratorMap.put(reader,reader.iterator(bamAwareShard.getChunks().get(reader)));
|
||||||
readerToIteratorMap.put(reader,reader.queryOverlapping(bounds.getContig(),(int)bounds.getStart(),(int)bounds.getStop()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue