NanoScheduler optimization for TraverseReadsNano

-- Pre-read MapData into a list, which is actually faster than dealing with future lock contention issues with lots of map threads
-- Increase the ReadShard default size to 100K reads by default
This commit is contained in:
Mark DePristo 2012-12-19 15:51:51 -05:00
parent f849910c4e
commit b92f563d06
2 changed files with 19 additions and 4 deletions

View File

@ -253,9 +253,10 @@ public class SAMDataSource {
if(readBufferSize != null)
ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests
else {
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
// Choose a sensible default for the read buffer size.
// Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once.
// Now we are simply setting it to 100K reads
ReadShard.setReadBufferSize(100000);
}
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);

View File

@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Iterator;
import java.util.LinkedList;
/**
* A nano-scheduling version of TraverseReads.
@ -53,6 +54,7 @@ import java.util.Iterator;
*/
public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
/** our log, which we want to capture anything from this class */
private final static boolean PRE_READ_ALL_MAP_DATA = true;
protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class);
private static final boolean DEBUG = false;
final NanoScheduler<MapData, MapResult, T> nanoScheduler;
@ -111,7 +113,19 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
* should execute
*/
private Iterator<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
return new Iterator<MapData>() {
final Iterator<MapData> it = makeDataIterator(dataProvider);
if ( PRE_READ_ALL_MAP_DATA ) {
final LinkedList<MapData> l = new LinkedList<MapData>();
while ( it.hasNext() ) l.add(it.next());
return l.iterator();
} else {
return it;
}
}
private Iterator<MapData> makeDataIterator(final ReadShardDataProvider dataProvider) {
return new Iterator<MapData> () {
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);