A few quick fixes to temporarily make the LowMemorySharder return exactly the
same shards as the previous sharder, so that I can directly compare filespans to see where some performance bugs lie. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5395 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0b4e51317b
commit
9384b2ff65
|
|
@ -103,22 +103,46 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
|||
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs()) {
|
||||
GATKBAMIndex index = (GATKBAMIndex)dataSource.getIndex(reader);
|
||||
|
||||
BinTree binTree = getNextOverlappingBinTree(reader,(GATKBAMIndex)dataSource.getIndex(reader),currentLocus);
|
||||
|
||||
// No overlapping data at all.
|
||||
if(binTree != null) {
|
||||
coveredRegionStart = Math.max(coveredRegionStart,binTree.getStart());
|
||||
coveredRegionStop = Math.min(coveredRegionStop,binTree.getStop());
|
||||
coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop);
|
||||
}
|
||||
|
||||
GATKBAMFileSpan fileSpan = generateFileSpan(reader,index,binTree,currentLocus);
|
||||
// Only a partial overlap, and the interval list precedes the bin. Force the bin tree to null.
|
||||
// TODO: the only reason to do this is to generate shards with no data that are placeholders for the interval list. Manage this externally.
|
||||
if(coveredRegion != null && currentLocus.startsBefore(coveredRegion))
|
||||
binTree = null;
|
||||
|
||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||
GATKBAMFileSpan fileSpan = generateFileSpan(reader,index,currentLocus.getContigIndex(),binTree);
|
||||
nextFilePointer.addFileSpans(reader,fileSpan);
|
||||
}
|
||||
|
||||
// Early exit if no bins were found.
|
||||
if(coveredRegion == null) {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.next();
|
||||
if(currentLocus.size() > 16384) {
|
||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
}
|
||||
else {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Early exit if only part of the first interval was found.
|
||||
if(currentLocus.startsBefore(coveredRegion)) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -134,7 +158,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
|||
}
|
||||
|
||||
// Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus,
|
||||
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
|
||||
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
|
||||
if(coveredRegionStop < currentLocus.getStop())
|
||||
currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop());
|
||||
else if(locusIterator.hasNext())
|
||||
|
|
@ -174,47 +198,54 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
|||
if(!binTreeIterator.hasNext())
|
||||
return null;
|
||||
|
||||
// Peek the iterator along until finding the first binTree at or following the current locus.
|
||||
BinTree binTree = binTreeIterator.peek();
|
||||
while(binTree.isBefore(locus)) {
|
||||
binTreeIterator.next(); // Before the point of interest. Consume this one.
|
||||
binTree = binTreeIterator.peek();
|
||||
}
|
||||
|
||||
if(binTree.overlaps(locus)) {
|
||||
while(binTree != null && binTree.isBefore(locus)) {
|
||||
binTreeIterator.next();
|
||||
return binTree;
|
||||
}
|
||||
binTree = binTreeIterator.hasNext() ? binTreeIterator.peek() : null;
|
||||
}
|
||||
|
||||
return null;
|
||||
return (binTree != null && binTree.overlaps(locus)) ? binTree : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a bin list to a file span, trimmed based on the linear index and with overlapping regions removed.
|
||||
* @param index BAM index.
|
||||
* @param binTree Tree of data found to overlap the region. binTree.overlaps(initialRegion) must return true.
|
||||
* @param initialRegion The region to employ when trimming the linear index.
|
||||
* @return File span mapping to given region.
|
||||
*/
|
||||
private GATKBAMFileSpan generateFileSpan(final SAMReaderID reader, final GATKBAMIndex index, final BinTree binTree, final GenomeLoc initialRegion) {
|
||||
private GATKBAMFileSpan generateFileSpan(final SAMReaderID reader, final GATKBAMIndex index, final int referenceSequence, final BinTree binTree) {
|
||||
System.out.printf("Shard %d: index file = %s; reference sequence = %d; span = %d-%d; ",++shardNumber,index.getIndexFile(),referenceSequence,(shardNumber-1)*16384+1,shardNumber*16384);
|
||||
|
||||
// Empty bin trees mean empty file spans.
|
||||
if(binTree == null)
|
||||
if(binTree == null) {
|
||||
System.out.printf("bins = {}; minimumOffset = 0, chunks = {}%n");
|
||||
return new GATKBAMFileSpan();
|
||||
|
||||
}
|
||||
|
||||
System.out.printf("bins = {");
|
||||
List<GATKChunk> chunks = new ArrayList<GATKChunk>(binTree.size());
|
||||
for(GATKBin bin: binTree.getBins()) {
|
||||
if(bin == null)
|
||||
continue;
|
||||
System.out.printf("%d,",bin.getBinNumber());
|
||||
// The optimizer below will mutate the chunk list. Make sure each element is a clone of the reference sequence.
|
||||
for(GATKChunk chunk: bin.getChunkList())
|
||||
chunks.add(chunk.clone());
|
||||
}
|
||||
System.out.printf("}; ");
|
||||
|
||||
final long linearIndexMinimumOffset = binTree.getLinearIndexEntry();
|
||||
System.out.printf("minimumOffset = %d, ",linearIndexMinimumOffset);
|
||||
|
||||
// Optimize the chunk list with a linear index optimization
|
||||
chunks = index.optimizeChunkList(chunks,linearIndexMinimumOffset);
|
||||
|
||||
GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()]));
|
||||
System.out.printf("chunks = {%s}%n",fileSpan);
|
||||
|
||||
return fileSpan;
|
||||
}
|
||||
|
||||
static long shardNumber = 0;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue