Start printing traversal stats after 30 seconds. I can't stand waiting 2 minutes.

This commit is contained in:
Eric Banks 2011-12-22 15:40:59 -05:00
parent 7204fcc2c3
commit 6d260ec6ae
3 changed files with 42 additions and 19 deletions

View File

@ -469,7 +469,7 @@ public class GenomeAnalysisEngine {
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
}
else {
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; // TODO -- make it a multiple of 16K
if(intervals == null)
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
else

View File

@ -30,7 +30,6 @@ import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
@ -46,7 +45,6 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
@ -230,26 +228,51 @@ public class ReferenceDataSource {
/**
* Creates an iterator for processing the entire reference.
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
* @param intervals the list of intervals to use when processing the reference.
* @param maxShardSize The maximum shard size which can be used to create this list.
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
* @param intervals the list of intervals to use when processing the reference.
* @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size.
* @return Creates a schedule for performing a traversal over the entire reference.
*/
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
List<Shard> shards = new ArrayList<Shard>();
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) {
final List<Shard> shards = new ArrayList<Shard>();
final GenomeLocParser parser = intervals.getGenomeLocParser();
GenomeLoc currentInterval = null;
for(GenomeLoc interval: intervals) {
while(interval.size() > maxShardSize) {
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
null));
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
// if the next interval is too big, we can safely shard currentInterval and then break down this one
if (interval.size() > targetShardSize) {
if (currentInterval != null)
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
currentInterval = interval;
while(currentInterval.size() > targetShardSize) {
final GenomeLoc partialInterval = parser.createGenomeLoc(currentInterval.getContig(), currentInterval.getStart(), currentInterval.getStart()+targetShardSize-1);
shards.add(createShardFromInterval(partialInterval, readsDataSource, parser));
currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart()+targetShardSize,currentInterval.getStop());
}
}
// otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly)
else {
if (currentInterval == null) {
currentInterval = interval;
}
else if (currentInterval.compareContigs(interval) != 0 || interval.getStop() - currentInterval.getStart() + 1 > targetShardSize) {
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
currentInterval = interval;
} else {
currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart(),interval.getStop());
}
}
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(interval),
null));
}
if (currentInterval != null)
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
return shards;
}
/**
 * Wraps a single interval (already trimmed to at most the target shard size by the
 * caller) into a LocusShard suitable for traversal.
 * @param interval the interval to wrap.
 * @param readsDataSource the reads datasource to embed in the shard.
 * @param parser parser used to build the shard's location list.
 * @return a new single-interval LocusShard.
 */
private static Shard createShardFromInterval(final GenomeLoc interval, final SAMDataSource readsDataSource, final GenomeLocParser parser) {
    // NOTE(review): removed leftover debug statement
    // System.out.println("Adding shard " + interval) — it wrote to stdout for every
    // shard created, which is noise in a library method.
    return new LocusShard(parser,
                          readsDataSource,
                          Collections.singletonList(interval),
                          null);
}
}

View File

@ -121,7 +121,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
// Only consult the wall clock every N traversal cycles, so progress checking stays
// cheap on the hot path.
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
// Counts cycles since the last wall-clock check.
private int printProgressCheckCounter = 0;
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
// NOTE(review): the next two lines are the before/after pair of a diff hunk — the same
// field declared twice would not compile; the 30s value is the post-commit setting
// (first progress line after 30 seconds instead of 2 minutes).
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;