2009-03-27 23:40:45 +08:00
|
|
|
package org.broadinstitute.sting.gatk.traversals;
|
2009-02-27 05:50:29 +08:00
|
|
|
|
2009-03-24 04:27:21 +08:00
|
|
|
import org.apache.log4j.Logger;
|
2009-06-12 02:13:22 +08:00
|
|
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
2009-07-10 05:57:00 +08:00
|
|
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
|
|
|
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2009-03-03 02:18:48 +08:00
|
|
|
|
2009-03-27 23:40:45 +08:00
|
|
|
public abstract class TraversalEngine {
|
2009-03-12 05:43:31 +08:00
|
|
|
// Time in milliseconds since we initialized this engine
|
2009-07-10 05:57:00 +08:00
|
|
|
private long startTime = -1;
|
|
|
|
|
private long lastProgressPrintTime = -1; // When was the last time we printed our progress?
|
2009-03-12 05:43:31 +08:00
|
|
|
|
|
|
|
|
// How long can we go without printing some progress info?
|
2009-07-10 05:57:00 +08:00
|
|
|
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs
|
|
|
|
|
private final long N_RECORDS_TO_PRINT = 1000000;
|
2009-03-12 05:43:31 +08:00
|
|
|
|
|
|
|
|
// Maximum number of reads to process before finishing
|
2009-07-10 06:03:45 +08:00
|
|
|
protected long maximumIterations = -1;
|
2009-03-12 05:43:31 +08:00
|
|
|
|
2009-07-10 05:57:00 +08:00
|
|
|
/** our log, which we want to capture anything from this class */
|
2009-03-27 23:40:45 +08:00
|
|
|
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
2009-03-24 04:27:21 +08:00
|
|
|
|
2009-11-10 13:57:01 +08:00
|
|
|
/** what kind of traversal we're undertaking. This allows us to format output correctly */
|
|
|
|
|
public enum TRAVERSAL_TYPE { READ, LOCUS, LOCUS_WINDOW, DUPLICATE };
|
|
|
|
|
|
2009-07-10 06:03:45 +08:00
|
|
|
/**
|
|
|
|
|
* set the max number of iterations
|
|
|
|
|
* @param maximumIterations the number of iterations
|
|
|
|
|
*/
|
|
|
|
|
public void setMaximumIterations(final int maximumIterations) {
|
|
|
|
|
this.maximumIterations = maximumIterations;
|
2009-03-24 04:27:21 +08:00
|
|
|
}
|
|
|
|
|
|
2009-03-10 22:59:42 +08:00
|
|
|
/**
|
|
|
|
|
* @param curTime (current runtime, in millisecs)
|
2009-07-10 05:57:00 +08:00
|
|
|
*
|
2009-03-10 22:59:42 +08:00
|
|
|
* @return true if the maximum interval (in millisecs) has passed since the last printing
|
|
|
|
|
*/
|
2009-07-10 05:57:00 +08:00
|
|
|
private boolean maxElapsedIntervalForPrinting(final long curTime) {
|
2009-03-10 22:59:42 +08:00
|
|
|
return (curTime - this.lastProgressPrintTime) > MAX_PROGRESS_PRINT_TIME;
|
|
|
|
|
}
|
|
|
|
|
|
2009-03-12 05:43:31 +08:00
|
|
|
/**
|
|
|
|
|
* Forward request to printProgress
|
|
|
|
|
*
|
2009-11-10 13:57:01 +08:00
|
|
|
* @param type the TRAVERSAL_TYPE of the traversal
|
2009-07-10 05:57:00 +08:00
|
|
|
* @param loc the location
|
2009-03-12 05:43:31 +08:00
|
|
|
*/
|
2009-11-10 13:57:01 +08:00
|
|
|
public void printProgress(final TRAVERSAL_TYPE type, GenomeLoc loc) {
|
2009-03-24 04:27:21 +08:00
|
|
|
printProgress(false, type, loc);
|
2009-03-12 05:43:31 +08:00
|
|
|
}
|
2009-02-27 05:50:29 +08:00
|
|
|
|
2009-03-12 05:43:31 +08:00
|
|
|
/**
|
|
|
|
|
* Utility routine that prints out process information (including timing) every N records or
|
|
|
|
|
* every M seconds, for N and M set in global variables.
|
|
|
|
|
*
|
|
|
|
|
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
|
2009-03-24 04:27:21 +08:00
|
|
|
* @param type String to print out describing our atomic traversal type ("read", "locus", etc)
|
|
|
|
|
* @param loc Current location
|
2009-03-12 05:43:31 +08:00
|
|
|
*/
|
2009-11-10 13:57:01 +08:00
|
|
|
private void printProgress(boolean mustPrint, final TRAVERSAL_TYPE type, GenomeLoc loc) {
|
2009-04-10 04:28:17 +08:00
|
|
|
final long nRecords = TraversalStatistics.nRecords;
|
2009-03-10 22:59:42 +08:00
|
|
|
final long curTime = System.currentTimeMillis();
|
|
|
|
|
final double elapsed = (curTime - startTime) / 1000.0;
|
2009-04-03 00:44:12 +08:00
|
|
|
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
|
2009-03-24 04:27:21 +08:00
|
|
|
|
2009-06-22 05:27:40 +08:00
|
|
|
if (mustPrint || nRecords == 1 || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) {
|
2009-03-10 22:59:42 +08:00
|
|
|
this.lastProgressPrintTime = curTime;
|
2009-02-27 05:50:29 +08:00
|
|
|
final double secsPer1MReads = (elapsed * 1000000.0) / nRecords;
|
2009-11-10 13:57:01 +08:00
|
|
|
switch (type) {
|
|
|
|
|
case LOCUS:
|
|
|
|
|
logger.info(String.format("[PROGRESS] Traversed to %s, processing %,d loci in %.2f secs (%.2f secs per 1M loci)",
|
|
|
|
|
loc,
|
|
|
|
|
nRecords,
|
|
|
|
|
elapsed,
|
|
|
|
|
secsPer1MReads));break;
|
|
|
|
|
case READ:
|
|
|
|
|
logger.info(String.format("[PROGRESS] Traversed %,d reads in %.2f secs %s(%.2f secs per 1M reads)",
|
|
|
|
|
nRecords,
|
|
|
|
|
elapsed,
|
|
|
|
|
(loc != null) ? String.format("at location %s ",loc) : "",
|
|
|
|
|
secsPer1MReads));break;
|
|
|
|
|
case DUPLICATE:
|
|
|
|
|
logger.info(String.format("[PROGRESS] Traversed %,d dups in %.2f secs %s(%.2f secs per 1M dups)",
|
|
|
|
|
nRecords,
|
|
|
|
|
elapsed,
|
|
|
|
|
(loc != null) ? String.format("at location %s ",loc) : "",
|
|
|
|
|
secsPer1MReads));break;
|
|
|
|
|
case LOCUS_WINDOW:
|
|
|
|
|
logger.info(String.format("[PROGRESS] Traversed %,d intervals in %.2f secs over interval %s (%.2f secs per 1M intervals)",
|
|
|
|
|
nRecords,
|
|
|
|
|
elapsed,
|
|
|
|
|
loc,
|
|
|
|
|
secsPer1MReads));break;
|
|
|
|
|
default:
|
|
|
|
|
logger.info(String.format("[PROGRESS] Traversed %,d records in %.2f secs (%.2f secs per 1M intervals)%s",
|
|
|
|
|
nRecords,
|
|
|
|
|
elapsed,
|
|
|
|
|
secsPer1MReads,
|
|
|
|
|
(loc != null) ? String.format(", last location seen was %s",loc) : ""));
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-02-27 05:50:29 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-08 22:12:45 +08:00
|
|
|
/**
|
|
|
|
|
* A passthrough method so that subclasses can report which types of traversals they're using.
|
2009-07-10 05:57:00 +08:00
|
|
|
*
|
2009-05-08 22:12:45 +08:00
|
|
|
* @param sum Result of the computation.
|
|
|
|
|
* @param <T> Type of the computation.
|
|
|
|
|
*/
|
2009-07-10 05:57:00 +08:00
|
|
|
public abstract <T> void printOnTraversalDone(T sum);
|
2009-05-08 22:12:45 +08:00
|
|
|
|
2009-03-12 05:43:31 +08:00
|
|
|
/**
|
|
|
|
|
* Called after a traversal to print out information about the traversal process
|
|
|
|
|
*
|
2009-11-10 13:57:01 +08:00
|
|
|
* @param type TRAVERSAL_TYPE describing this type of traversal
|
2009-03-24 04:27:21 +08:00
|
|
|
* @param sum The reduce result of the traversal
|
|
|
|
|
* @param <T> ReduceType of the traversal
|
2009-03-12 05:43:31 +08:00
|
|
|
*/
|
2009-11-10 13:57:01 +08:00
|
|
|
protected <T> void printOnTraversalDone(final TRAVERSAL_TYPE type, T sum) {
|
2009-03-24 04:27:21 +08:00
|
|
|
printProgress(true, type, null);
|
2009-03-27 21:27:04 +08:00
|
|
|
logger.info("Traversal reduce result is " + sum);
|
2009-03-26 22:40:50 +08:00
|
|
|
final long curTime = System.currentTimeMillis();
|
|
|
|
|
final double elapsed = (curTime - startTime) / 1000.0;
|
|
|
|
|
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
2009-07-07 06:50:22 +08:00
|
|
|
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
2009-07-10 05:57:00 +08:00
|
|
|
TraversalStatistics.nSkippedReads,
|
|
|
|
|
TraversalStatistics.nReads,
|
|
|
|
|
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
2009-04-10 04:28:17 +08:00
|
|
|
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
2009-04-17 09:27:36 +08:00
|
|
|
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
|
2009-04-10 04:28:17 +08:00
|
|
|
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
|
|
|
|
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
|
|
|
|
|
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
|
2009-03-12 05:43:31 +08:00
|
|
|
}
|
|
|
|
|
|
2009-07-10 05:57:00 +08:00
|
|
|
/** Initialize the traversal engine. After this point traversals can be run over the data */
|
|
|
|
|
public void initialize() {
|
2009-03-12 05:43:31 +08:00
|
|
|
lastProgressPrintTime = startTime = System.currentTimeMillis();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2009-07-10 05:57:00 +08:00
|
|
|
* this method must be implemented by all traversal engines
|
2009-03-24 04:27:21 +08:00
|
|
|
*
|
2009-07-10 05:57:00 +08:00
|
|
|
* @param walker the walker to run with
|
|
|
|
|
* @param shard a shard of data
|
|
|
|
|
* @param dataProvider the data provider that generates data given the shard
|
|
|
|
|
* @param sum the accumulator
|
|
|
|
|
* @param <M> an object of the map type
|
|
|
|
|
* @param <T> an object of the reduce type
|
2009-03-12 05:43:31 +08:00
|
|
|
*
|
2009-07-10 05:57:00 +08:00
|
|
|
* @return an object of the reduce type
|
2009-03-12 05:43:31 +08:00
|
|
|
*/
|
2009-07-10 05:57:00 +08:00
|
|
|
public abstract <M, T> T traverse(Walker<M, T> walker,
|
|
|
|
|
Shard shard,
|
|
|
|
|
ShardDataProvider dataProvider,
|
|
|
|
|
T sum);
|
2009-03-03 05:51:25 +08:00
|
|
|
}
|