NanoScheduler optimization for TraverseReadsNano

-- Pre-read MapData into a list, which is actually faster than dealing with future lock contention issues with lots of map threads -- Increase the ReadShard default size to 100K reads by default
2012-12-19 15:51:51 -05:00 · 2012-12-19 15:51:51 -05:00 · b92f563d06
parent f849910c4e
commit b92f563d06
2 changed files with 19 additions and 4 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@ -253,9 +253,10 @@ public class SAMDataSource {
        if(readBufferSize != null)
            ReadShard.setReadBufferSize(readBufferSize);   // TODO: use of non-final static variable here is just awful, especially for parallel tests
        else {
-            // Choose a sensible default for the read buffer size.  For the moment, we're picking 1000 reads per BAM per shard (which effectively
-            // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
-            ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
+            // Choose a sensible default for the read buffer size.
+            // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once.
+            // Now we are simply setting it to 100K reads
+            ReadShard.setReadBufferSize(100000);
        }

        resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java
@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

 import java.util.Iterator;
+import java.util.LinkedList;

 /**
 * A nano-scheduling version of TraverseReads.
@ -53,6 +54,7 @@ import java.util.Iterator;
 */
 public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
    /** our log, which we want to capture anything from this class */
+    private final static boolean PRE_READ_ALL_MAP_DATA = true;
    protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class);
    private static final boolean DEBUG = false;
    final NanoScheduler<MapData, MapResult, T> nanoScheduler;
@ -111,7 +113,19 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
     *          should execute
     */
    private Iterator<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
-        return new Iterator<MapData>() {
+        final Iterator<MapData> it = makeDataIterator(dataProvider);
+        if ( PRE_READ_ALL_MAP_DATA ) {
+            final LinkedList<MapData> l = new LinkedList<MapData>();
+            while ( it.hasNext() ) l.add(it.next());
+            return l.iterator();
+        } else {
+            return it;
+        }
+    }
+
+
+    private Iterator<MapData> makeDataIterator(final ReadShardDataProvider dataProvider) {
+        return new Iterator<MapData> ()  {
            final ReadView reads = new ReadView(dataProvider);
            final ReadReferenceView reference = new ReadReferenceView(dataProvider);
            final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);