Start printing traversal stats after 30 seconds. I can't stand waiting 2 minutes.

This commit is contained in:
Eric Banks 2011-12-22 15:40:59 -05:00
parent 7204fcc2c3
commit 6d260ec6ae
3 changed files with 42 additions and 19 deletions

View File

@ -469,7 +469,7 @@ public class GenomeAnalysisEngine {
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
}
else {
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; // TODO -- make it a multiple of 16K
if(intervals == null)
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
else

View File

@ -30,7 +30,6 @@ import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
@ -46,7 +45,6 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
@ -230,26 +228,51 @@ public class ReferenceDataSource {
/**
* Creates an iterator for processing the entire reference.
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
* @param intervals the list of intervals to use when processing the reference.
* @param maxShardSize The maximum shard size which can be used to create this list.
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
* @param intervals the list of intervals to use when processing the reference.
* @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size.
* @return Creates a schedule for performing a traversal over the entire reference.
*/
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
List<Shard> shards = new ArrayList<Shard>();
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) {
final List<Shard> shards = new ArrayList<Shard>();
final GenomeLocParser parser = intervals.getGenomeLocParser();
GenomeLoc currentInterval = null;
for(GenomeLoc interval: intervals) {
while(interval.size() > maxShardSize) {
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
null));
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
// if the next interval is too big, we can safely shard currentInterval and then break down this one
if (interval.size() > targetShardSize) {
if (currentInterval != null)
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
currentInterval = interval;
while(currentInterval.size() > targetShardSize) {
final GenomeLoc partialInterval = parser.createGenomeLoc(currentInterval.getContig(), currentInterval.getStart(), currentInterval.getStart()+targetShardSize-1);
shards.add(createShardFromInterval(partialInterval, readsDataSource, parser));
currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart()+targetShardSize,currentInterval.getStop());
}
}
// otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly)
else {
if (currentInterval == null) {
currentInterval = interval;
}
else if (currentInterval.compareContigs(interval) != 0 || interval.getStop() - currentInterval.getStart() + 1 > targetShardSize) {
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
currentInterval = interval;
} else {
currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart(),interval.getStop());
}
}
shards.add(new LocusShard(intervals.getGenomeLocParser(),
readsDataSource,
Collections.singletonList(interval),
null));
}
if (currentInterval != null)
shards.add(createShardFromInterval(currentInterval, readsDataSource, parser));
return shards;
}
/**
 * Wraps a single interval (already trimmed to at most the target shard size by the
 * caller) into a LocusShard suitable for traversal.
 * @param interval the interval to wrap.
 * @param readsDataSource the reads datasource to embed in the shard.
 * @param parser parser used to build the shard's location list.
 * @return a new single-interval LocusShard.
 */
private static Shard createShardFromInterval(final GenomeLoc interval, final SAMDataSource readsDataSource, final GenomeLocParser parser) {
    // NOTE(review): removed leftover debug statement
    // System.out.println("Adding shard " + interval) — it wrote to stdout for every
    // shard created, which is noise in a library method.
    return new LocusShard(parser,
                          readsDataSource,
                          Collections.singletonList(interval),
                          null);
}
}

View File

@ -121,7 +121,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
// Only consult the wall clock every N traversal cycles, so progress checking stays
// cheap on the hot path.
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
// Counts cycles since the last wall-clock check.
private int printProgressCheckCounter = 0;
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
// NOTE(review): the next two lines are the before/after pair of a diff hunk — the same
// field declared twice would not compile; the 30s value is the post-commit setting
// (first progress line after 30 seconds instead of 2 minutes).
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;