2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
import org.apache.log4j.Logger ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.ReadMetrics ;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2010-05-27 06:12:25 +08:00
import org.broadinstitute.sting.utils.Utils ;
import org.broadinstitute.sting.utils.MathUtils ;
import java.util.Map ;
2009-03-03 02:18:48 +08:00
2010-03-12 02:40:31 +08:00
public abstract class TraversalEngine < M , T , WalkerType extends Walker < M , T > , ProviderType extends ShardDataProvider > {
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-07-10 05:57:00 +08:00
private long startTime = - 1 ;
private long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-07-10 05:57:00 +08:00
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000 ; // 10 seconds in millisecs
private final long N_RECORDS_TO_PRINT = 1000000 ;
2009-03-12 05:43:31 +08:00
2009-07-10 05:57:00 +08:00
/** our log, which we want to capture anything from this class */
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2010-08-12 04:17:11 +08:00
/ * *
* Gets the named traversal type associated with the given traversal .
* @return A user - friendly name for the given traversal type .
* /
protected abstract String getTraversalType ( ) ;
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
2009-07-10 05:57:00 +08:00
*
2009-03-10 22:59:42 +08:00
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-07-10 05:57:00 +08:00
private boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
2010-08-12 04:17:11 +08:00
* @param shard the given shard currently being processed .
2009-07-10 05:57:00 +08:00
* @param loc the location
2009-03-12 05:43:31 +08:00
* /
2010-08-12 04:17:11 +08:00
public void printProgress ( Shard shard , GenomeLoc loc ) {
// A bypass is inserted here for unit testing.
// TODO: print metrics outside of the traversal engine to more easily handle cumulative stats.
ReadMetrics cumulativeMetrics = GenomeAnalysisEngine . instance ! = null ? GenomeAnalysisEngine . instance . getCumulativeMetrics ( ) . clone ( ) : new ReadMetrics ( ) ;
cumulativeMetrics . incrementMetrics ( shard . getReadMetrics ( ) ) ;
printProgress ( loc , cumulativeMetrics , false ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
2009-03-24 04:27:21 +08:00
* @param loc Current location
2010-08-12 04:17:11 +08:00
* @param metrics Metrics of reads filtered in / out .
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-12 05:43:31 +08:00
* /
2010-08-12 04:17:11 +08:00
private void printProgress ( GenomeLoc loc , ReadMetrics metrics , boolean mustPrint ) {
final long nRecords = metrics . getNumIterations ( ) ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
2009-06-22 05:27:40 +08:00
if ( mustPrint | | nRecords = = 1 | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-11-10 14:18:34 +08:00
if ( loc ! = null )
2010-08-12 04:17:11 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , getTraversalType ( ) , elapsed , secsPer1MReads , getTraversalType ( ) ) ) ;
2009-11-10 14:18:34 +08:00
else
2010-08-12 04:17:11 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , getTraversalType ( ) , elapsed , secsPer1MReads , getTraversalType ( ) ) ) ;
2009-11-10 13:57:01 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
* /
2010-08-12 04:17:11 +08:00
public void printOnTraversalDone ( ReadMetrics cumulativeMetrics ) {
printProgress ( null , cumulativeMetrics , true ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2010-05-27 06:12:25 +08:00
// count up the number of skipped reads by summing over all filters
long nSkippedReads = 0 L ;
2010-08-12 04:17:11 +08:00
for ( Map . Entry < Class , Long > countsByFilter : cumulativeMetrics . getCountsByFilter ( ) . entrySet ( ) )
nSkippedReads + = countsByFilter . getValue ( ) ;
2010-05-27 06:12:25 +08:00
2009-03-26 22:40:50 +08:00
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2010-05-27 06:12:25 +08:00
logger . info ( String . format ( "%d reads were filtered out during traversal out of %d total (%.2f%%)" ,
nSkippedReads ,
2010-08-12 04:17:11 +08:00
cumulativeMetrics . getNumReadsSeen ( ) ,
100.0 * MathUtils . ratio ( nSkippedReads , cumulativeMetrics . getNumReadsSeen ( ) ) ) ) ;
for ( Map . Entry < Class , Long > filterCounts : cumulativeMetrics . getCountsByFilter ( ) . entrySet ( ) ) {
2010-05-27 06:12:25 +08:00
long count = filterCounts . getValue ( ) ;
logger . info ( String . format ( " -> %d reads (%.2f%% of total) failing %s" ,
2010-08-12 04:17:11 +08:00
count , 100.0 * MathUtils . ratio ( count , cumulativeMetrics . getNumReadsSeen ( ) ) , Utils . getClassName ( filterCounts . getKey ( ) ) ) ) ;
2010-05-27 06:12:25 +08:00
}
2009-03-12 05:43:31 +08:00
}
2009-07-10 05:57:00 +08:00
/** Initialize the traversal engine. After this point traversals can be run over the data */
public void initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
}
/ * *
2009-07-10 05:57:00 +08:00
* this method must be implemented by all traversal engines
2009-03-24 04:27:21 +08:00
*
2009-07-10 05:57:00 +08:00
* @param walker the walker to run with
* @param dataProvider the data provider that generates data given the shard
* @param sum the accumulator
2009-03-12 05:43:31 +08:00
*
2009-07-10 05:57:00 +08:00
* @return an object of the reduce type
2009-03-12 05:43:31 +08:00
* /
2010-03-12 02:40:31 +08:00
public abstract T traverse ( WalkerType walker ,
ProviderType dataProvider ,
T sum ) ;
2009-03-03 05:51:25 +08:00
}