2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
import org.apache.log4j.Logger ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2010-05-27 06:12:25 +08:00
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2010-05-27 06:12:25 +08:00
import org.broadinstitute.sting.utils.Utils ;
import org.broadinstitute.sting.utils.MathUtils ;
import java.util.Map ;
import java.util.List ;
import java.util.Iterator ;
import net.sf.picard.filter.SamRecordFilter ;
import net.sf.samtools.SAMRecord ;
2009-03-03 02:18:48 +08:00
2010-03-12 02:40:31 +08:00
public abstract class TraversalEngine < M , T , WalkerType extends Walker < M , T > , ProviderType extends ShardDataProvider > {
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-07-10 05:57:00 +08:00
private long startTime = - 1 ;
private long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-07-10 05:57:00 +08:00
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000 ; // 10 seconds in millisecs
private final long N_RECORDS_TO_PRINT = 1000000 ;
2009-03-12 05:43:31 +08:00
2009-07-10 05:57:00 +08:00
/** our log, which we want to capture anything from this class */
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
2009-07-10 05:57:00 +08:00
*
2009-03-10 22:59:42 +08:00
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-07-10 05:57:00 +08:00
private boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
2009-11-10 13:57:01 +08:00
* @param type the TRAVERSAL_TYPE of the traversal
2009-07-10 05:57:00 +08:00
* @param loc the location
2009-03-12 05:43:31 +08:00
* /
2009-11-12 14:18:10 +08:00
public void printProgress ( final String type , GenomeLoc loc ) {
2009-03-24 04:27:21 +08:00
printProgress ( false , type , loc ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-24 04:27:21 +08:00
* @param type String to print out describing our atomic traversal type ( "read" , "locus" , etc )
* @param loc Current location
2009-03-12 05:43:31 +08:00
* /
2009-11-12 14:18:10 +08:00
private void printProgress ( boolean mustPrint , final String type , GenomeLoc loc ) {
2009-04-10 04:28:17 +08:00
final long nRecords = TraversalStatistics . nRecords ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
2009-06-22 05:27:40 +08:00
if ( mustPrint | | nRecords = = 1 | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-11-10 14:18:34 +08:00
if ( loc ! = null )
2009-11-12 14:18:10 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-11-10 14:18:34 +08:00
else
2009-11-12 14:18:10 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-11-10 13:57:01 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-05-08 22:12:45 +08:00
/ * *
* A passthrough method so that subclasses can report which types of traversals they ' re using .
2009-07-10 05:57:00 +08:00
*
2009-05-08 22:12:45 +08:00
* @param sum Result of the computation .
* /
2010-03-12 02:40:31 +08:00
public abstract void printOnTraversalDone ( T sum ) ;
2009-05-08 22:12:45 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
*
2009-11-12 14:18:10 +08:00
* @param type describing this type of traversal
2009-03-24 04:27:21 +08:00
* @param sum The reduce result of the traversal
2009-03-12 05:43:31 +08:00
* /
2010-03-12 02:40:31 +08:00
protected void printOnTraversalDone ( final String type , T sum ) {
2009-03-24 04:27:21 +08:00
printProgress ( true , type , null ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2010-05-27 06:12:25 +08:00
// count up the number of skipped reads by summing over all filters
long nSkippedReads = 0 L ;
for ( long counts : TraversalStatistics . counter . values ( ) )
nSkippedReads + = counts ;
2009-03-26 22:40:50 +08:00
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2010-05-27 06:12:25 +08:00
logger . info ( String . format ( "%d reads were filtered out during traversal out of %d total (%.2f%%)" ,
nSkippedReads ,
2009-07-10 05:57:00 +08:00
TraversalStatistics . nReads ,
2010-05-27 06:12:25 +08:00
100.0 * MathUtils . ratio ( nSkippedReads , TraversalStatistics . nReads ) ) ) ;
for ( Map . Entry < Class , Long > filterCounts : TraversalStatistics . counter . entrySet ( ) ) {
long count = filterCounts . getValue ( ) ;
logger . info ( String . format ( " -> %d reads (%.2f%% of total) failing %s" ,
count , 100.0 * MathUtils . ratio ( count , TraversalStatistics . nReads ) , Utils . getClassName ( filterCounts . getKey ( ) ) ) ) ;
}
2009-03-12 05:43:31 +08:00
}
2009-07-10 05:57:00 +08:00
/** Initialize the traversal engine. After this point traversals can be run over the data */
public void initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
}
/ * *
2009-07-10 05:57:00 +08:00
* this method must be implemented by all traversal engines
2009-03-24 04:27:21 +08:00
*
2009-07-10 05:57:00 +08:00
* @param walker the walker to run with
* @param dataProvider the data provider that generates data given the shard
* @param sum the accumulator
2009-03-12 05:43:31 +08:00
*
2009-07-10 05:57:00 +08:00
* @return an object of the reduce type
2009-03-12 05:43:31 +08:00
* /
2010-03-12 02:40:31 +08:00
public abstract T traverse ( WalkerType walker ,
ProviderType dataProvider ,
T sum ) ;
2010-05-27 06:12:25 +08:00
public static Iterator < SAMRecord > addMandatoryFilteringIterators ( Iterator < SAMRecord > iter , List < SamRecordFilter > filters ) {
for ( SamRecordFilter filter : filters ) {
//logger.debug("Adding filter " + filter.getClass());
iter = new CountingFilteringIterator ( iter , filter ) ;
}
return new CountingFilteringIterator ( iter ) ; // special case to count all reads
}
2009-03-03 05:51:25 +08:00
}