2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
import org.apache.log4j.Logger ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
import org.broadinstitute.sting.gatk.walkers.Walker ;
import org.broadinstitute.sting.utils.GenomeLoc ;
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public abstract class TraversalEngine {
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-07-10 05:57:00 +08:00
private long startTime = - 1 ;
private long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-07-10 05:57:00 +08:00
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000 ; // 10 seconds in millisecs
private final long N_RECORDS_TO_PRINT = 1000000 ;
2009-03-12 05:43:31 +08:00
// Maximum number of reads to process before finishing
2009-07-10 06:03:45 +08:00
protected long maximumIterations = - 1 ;
2009-03-12 05:43:31 +08:00
2009-07-10 05:57:00 +08:00
/** our log, which we want to capture anything from this class */
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2009-07-10 06:03:45 +08:00
/ * *
* set the max number of iterations
* @param maximumIterations the number of iterations
* /
public void setMaximumIterations ( final int maximumIterations ) {
this . maximumIterations = maximumIterations ;
2009-03-24 04:27:21 +08:00
}
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
2009-07-10 05:57:00 +08:00
*
2009-03-10 22:59:42 +08:00
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-07-10 05:57:00 +08:00
private boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
2009-11-10 13:57:01 +08:00
* @param type the TRAVERSAL_TYPE of the traversal
2009-07-10 05:57:00 +08:00
* @param loc the location
2009-03-12 05:43:31 +08:00
* /
2009-11-12 14:18:10 +08:00
public void printProgress ( final String type , GenomeLoc loc ) {
2009-03-24 04:27:21 +08:00
printProgress ( false , type , loc ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-24 04:27:21 +08:00
* @param type String to print out describing our atomic traversal type ( "read" , "locus" , etc )
* @param loc Current location
2009-03-12 05:43:31 +08:00
* /
2009-11-12 14:18:10 +08:00
private void printProgress ( boolean mustPrint , final String type , GenomeLoc loc ) {
2009-04-10 04:28:17 +08:00
final long nRecords = TraversalStatistics . nRecords ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
2009-06-22 05:27:40 +08:00
if ( mustPrint | | nRecords = = 1 | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-11-10 14:18:34 +08:00
if ( loc ! = null )
2009-11-12 14:18:10 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-11-10 14:18:34 +08:00
else
2009-11-12 14:18:10 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-11-10 13:57:01 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-05-08 22:12:45 +08:00
/ * *
* A passthrough method so that subclasses can report which types of traversals they ' re using .
2009-07-10 05:57:00 +08:00
*
2009-05-08 22:12:45 +08:00
* @param sum Result of the computation .
* @param < T > Type of the computation .
* /
2009-07-10 05:57:00 +08:00
public abstract < T > void printOnTraversalDone ( T sum ) ;
2009-05-08 22:12:45 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
*
2009-11-12 14:18:10 +08:00
* @param type describing this type of traversal
2009-03-24 04:27:21 +08:00
* @param sum The reduce result of the traversal
* @param < T > ReduceType of the traversal
2009-03-12 05:43:31 +08:00
* /
2009-11-12 14:18:10 +08:00
protected < T > void printOnTraversalDone ( final String type , T sum ) {
2009-03-24 04:27:21 +08:00
printProgress ( true , type , null ) ;
2009-03-27 21:27:04 +08:00
logger . info ( "Traversal reduce result is " + sum ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2009-07-07 06:50:22 +08:00
logger . info ( String . format ( "Traversal skipped %d valid reads out of %d total (%.2f%%)" ,
2009-07-10 05:57:00 +08:00
TraversalStatistics . nSkippedReads ,
TraversalStatistics . nReads ,
( TraversalStatistics . nSkippedReads * 100.0 ) / TraversalStatistics . nReads ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( " -> %d unmapped reads" , TraversalStatistics . nUnmappedReads ) ) ;
2009-04-17 09:27:36 +08:00
logger . info ( String . format ( " -> %d duplicate reads" , TraversalStatistics . nDuplicates ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( " -> %d non-primary reads" , TraversalStatistics . nNotPrimary ) ) ;
logger . info ( String . format ( " -> %d reads with bad alignments" , TraversalStatistics . nBadAlignments ) ) ;
logger . info ( String . format ( " -> %d reads with indels" , TraversalStatistics . nSkippedIndels ) ) ;
2009-03-12 05:43:31 +08:00
}
2009-07-10 05:57:00 +08:00
/** Initialize the traversal engine. After this point traversals can be run over the data */
public void initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
}
/ * *
2009-07-10 05:57:00 +08:00
* this method must be implemented by all traversal engines
2009-03-24 04:27:21 +08:00
*
2009-07-10 05:57:00 +08:00
* @param walker the walker to run with
* @param shard a shard of data
* @param dataProvider the data provider that generates data given the shard
* @param sum the accumulator
* @param < M > an object of the map type
* @param < T > an object of the reduce type
2009-03-12 05:43:31 +08:00
*
2009-07-10 05:57:00 +08:00
* @return an object of the reduce type
2009-03-12 05:43:31 +08:00
* /
2009-07-10 05:57:00 +08:00
public abstract < M , T > T traverse ( Walker < M , T > walker ,
Shard shard ,
ShardDataProvider dataProvider ,
T sum ) ;
2009-03-03 05:51:25 +08:00
}