From 6d260ec6ae0b3bfb65be9408920a6acb58d8e502 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 22 Dec 2011 15:40:59 -0500 Subject: [PATCH] Start printing traversal stats after 30 seconds. I can't stand waiting 2 minutes. --- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../reference/ReferenceDataSource.java | 57 +++++++++++++------ .../gatk/traversals/TraversalEngine.java | 2 +- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index d37116215..f6956f530 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -469,7 +469,7 @@ public class GenomeAnalysisEngine { throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); } else { - final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; + final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; // TODO -- make it a multiple of 16K if(intervals == null) return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); else diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index 2c33a19b8..b6c6a7564 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -30,7 +30,6 @@ import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.CreateSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; @@ -46,7 +45,6 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; import java.util.ArrayList; import java.util.Collections; -import java.util.Iterator; import java.util.List; /** @@ -230,26 +228,51 @@ public class ReferenceDataSource { /** * Creates an iterator for processing the entire reference. - * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. - * @param intervals the list of intervals to use when processing the reference. - * @param maxShardSize The maximum shard size which can be used to create this list. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. * @return Creates a schedule for performing a traversal over the entire reference. */ - public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { - List shards = new ArrayList(); + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { + final List shards = new ArrayList(); + final GenomeLocParser parser = intervals.getGenomeLocParser(); + GenomeLoc currentInterval = null; + for(GenomeLoc interval: intervals) { - while(interval.size() > maxShardSize) { - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), - null)); - interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + // if the next interval is too big, we can safely shard currentInterval and then break down this one + if (interval.size() > targetShardSize) { + if (currentInterval != null) + shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); + currentInterval = interval; + while(currentInterval.size() > targetShardSize) { + final GenomeLoc partialInterval = parser.createGenomeLoc(currentInterval.getContig(), currentInterval.getStart(), currentInterval.getStart()+targetShardSize-1); + shards.add(createShardFromInterval(partialInterval, readsDataSource, parser)); + currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart()+targetShardSize,currentInterval.getStop()); + } + } + // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) + else { + if (currentInterval == null) { + currentInterval = interval; + } + else if (currentInterval.compareContigs(interval) != 0 || interval.getStop() - currentInterval.getStart() + 1 > targetShardSize) { + shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); + currentInterval = interval; + } else { + currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart(),interval.getStop()); + } } - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(interval), - null)); } + if (currentInterval != null) + shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); return shards; } + + private static Shard createShardFromInterval(final GenomeLoc interval, final SAMDataSource readsDataSource, final GenomeLocParser parser) { + System.out.println("Adding shard " + interval); + return new LocusShard(parser, + readsDataSource, + Collections.singletonList(interval), + null); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index fd691735f..4ef255524 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -121,7 +121,7 @@ public abstract class TraversalEngine,Provide private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? - private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;