2010-10-18 07:21:23 +08:00
/*
 * Copyright (c) 2010, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
import org.apache.log4j.Logger ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.ReadMetrics ;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2010-05-27 06:12:25 +08:00
import org.broadinstitute.sting.utils.Utils ;
import org.broadinstitute.sting.utils.MathUtils ;
2011-01-03 06:18:12 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-05-27 06:12:25 +08:00
2011-01-03 06:18:12 +08:00
import java.io.File ;
import java.io.FileNotFoundException ;
import java.io.FileOutputStream ;
import java.io.PrintStream ;
import java.util.Arrays ;
2010-05-27 06:12:25 +08:00
import java.util.Map ;
2009-03-03 02:18:48 +08:00
2010-03-12 02:40:31 +08:00
public abstract class TraversalEngine < M , T , WalkerType extends Walker < M , T > , ProviderType extends ShardDataProvider > {
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-07-10 05:57:00 +08:00
private long startTime = - 1 ;
private long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2011-01-03 06:18:12 +08:00
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000 ; // in seconds
2009-07-10 05:57:00 +08:00
private final long N_RECORDS_TO_PRINT = 1000000 ;
2009-03-12 05:43:31 +08:00
2011-01-03 06:18:12 +08:00
// for performance log
private static final boolean PERFORMANCE_LOG_ENABLED = false ;
private PrintStream performanceLog = null ;
private long lastPerformanceLogPrintTime = - 1 ; // When was the last time we printed to the performance log?
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = 1 * 1000 ; // in seconds
2009-07-10 05:57:00 +08:00
/** our log, which we want to capture anything from this class */
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2010-11-11 01:59:50 +08:00
protected GenomeAnalysisEngine engine ;
2010-09-25 10:49:30 +08:00
2010-08-12 04:17:11 +08:00
/ * *
* Gets the named traversal type associated with the given traversal .
* @return A user - friendly name for the given traversal type .
* /
protected abstract String getTraversalType ( ) ;
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
2011-01-03 06:18:12 +08:00
* @param lastPrintTime the last time we printed , in machine milliseconds
* @param printFreq maximum permitted difference between last print and current times
2009-07-10 05:57:00 +08:00
*
2009-03-10 22:59:42 +08:00
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2011-01-03 06:18:12 +08:00
private boolean maxElapsedIntervalForPrinting ( final long curTime , long lastPrintTime , long printFreq ) {
return ( curTime - lastPrintTime ) > printFreq ;
2009-03-10 22:59:42 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
2010-08-12 04:17:11 +08:00
* @param shard the given shard currently being processed .
2009-07-10 05:57:00 +08:00
* @param loc the location
2009-03-12 05:43:31 +08:00
* /
2010-08-12 04:17:11 +08:00
public void printProgress ( Shard shard , GenomeLoc loc ) {
// A bypass is inserted here for unit testing.
// TODO: print metrics outside of the traversal engine to more easily handle cumulative stats.
2010-11-11 01:59:50 +08:00
ReadMetrics cumulativeMetrics = engine . getCumulativeMetrics ( ) ! = null ? engine . getCumulativeMetrics ( ) . clone ( ) : new ReadMetrics ( ) ;
2010-08-12 04:17:11 +08:00
cumulativeMetrics . incrementMetrics ( shard . getReadMetrics ( ) ) ;
printProgress ( loc , cumulativeMetrics , false ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
2009-03-24 04:27:21 +08:00
* @param loc Current location
2010-08-12 04:17:11 +08:00
* @param metrics Metrics of reads filtered in / out .
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-12 05:43:31 +08:00
* /
2010-08-12 04:17:11 +08:00
private void printProgress ( GenomeLoc loc , ReadMetrics metrics , boolean mustPrint ) {
final long nRecords = metrics . getNumIterations ( ) ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2011-01-03 06:18:12 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / Math . max ( nRecords , 1 ) ;
2009-03-24 04:27:21 +08:00
2011-01-03 06:18:12 +08:00
if ( mustPrint
| | nRecords = = 1
| | nRecords % N_RECORDS_TO_PRINT = = 0
| | maxElapsedIntervalForPrinting ( curTime , lastProgressPrintTime , MAX_PROGRESS_PRINT_TIME ) ) {
lastProgressPrintTime = curTime ;
2010-10-20 10:47:34 +08:00
if ( nRecords = = 1 )
logger . info ( "[INITIALIZATION COMPLETE; TRAVERSAL STARTING]" ) ;
else {
if ( loc ! = null )
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , getTraversalType ( ) , elapsed , secsPer1MReads , getTraversalType ( ) ) ) ;
else
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , getTraversalType ( ) , elapsed , secsPer1MReads , getTraversalType ( ) ) ) ;
}
2011-01-03 06:18:12 +08:00
}
//
// code to process the performance log
// TODO -- should be integrated into command line system [hard coded off now]
// TODO -- should write a unique log name as an option?
// TODO -- should be controlled by Queue so that .out and .performance.log comes out
//
if ( PERFORMANCE_LOG_ENABLED & & performanceLog = = null ) {
try {
// todo -- temp for testing
performanceLog = new PrintStream ( new FileOutputStream ( "performance.log" ) ) ;
performanceLog . println ( Utils . join ( "\t" , Arrays . asList ( "elapsed.time" , "units.processed" , "processing.speed" ) ) ) ;
} catch ( FileNotFoundException e ) {
throw new UserException . CouldNotCreateOutputFile ( new File ( "performance.log" ) , e ) ;
}
}
if ( performanceLog ! = null & & maxElapsedIntervalForPrinting ( curTime , lastPerformanceLogPrintTime , PERFORMANCE_LOG_PRINT_FREQUENCY ) ) {
lastPerformanceLogPrintTime = curTime ;
if ( nRecords > 1 ) performanceLog . printf ( "%.2f\t%d\t%.2f%n" , elapsed , nRecords , secsPer1MReads ) ;
}
2009-02-27 05:50:29 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
* /
2010-08-12 04:17:11 +08:00
public void printOnTraversalDone ( ReadMetrics cumulativeMetrics ) {
printProgress ( null , cumulativeMetrics , true ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2010-05-27 06:12:25 +08:00
// count up the number of skipped reads by summing over all filters
long nSkippedReads = 0 L ;
2010-08-12 04:17:11 +08:00
for ( Map . Entry < Class , Long > countsByFilter : cumulativeMetrics . getCountsByFilter ( ) . entrySet ( ) )
nSkippedReads + = countsByFilter . getValue ( ) ;
2010-05-27 06:12:25 +08:00
2010-08-29 06:53:32 +08:00
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2010-10-27 04:22:16 +08:00
if ( cumulativeMetrics . getNumReadsSeen ( ) > 0 )
logger . info ( String . format ( "%d reads were filtered out during traversal out of %d total (%.2f%%)" ,
nSkippedReads ,
cumulativeMetrics . getNumReadsSeen ( ) ,
100.0 * MathUtils . ratio ( nSkippedReads , cumulativeMetrics . getNumReadsSeen ( ) ) ) ) ;
2010-08-12 04:17:11 +08:00
for ( Map . Entry < Class , Long > filterCounts : cumulativeMetrics . getCountsByFilter ( ) . entrySet ( ) ) {
2010-05-27 06:12:25 +08:00
long count = filterCounts . getValue ( ) ;
logger . info ( String . format ( " -> %d reads (%.2f%% of total) failing %s" ,
2010-08-12 04:17:11 +08:00
count , 100.0 * MathUtils . ratio ( count , cumulativeMetrics . getNumReadsSeen ( ) ) , Utils . getClassName ( filterCounts . getKey ( ) ) ) ) ;
2010-05-27 06:12:25 +08:00
}
2011-01-03 06:18:12 +08:00
if ( performanceLog ! = null ) performanceLog . close ( ) ;
2009-03-12 05:43:31 +08:00
}
2010-09-25 10:49:30 +08:00
/ * *
* Initialize the traversal engine . After this point traversals can be run over the data
* @param engine GenomeAnalysisEngine for this traversal
* /
public void initialize ( GenomeAnalysisEngine engine ) {
this . engine = engine ;
2010-10-20 10:47:34 +08:00
}
/ * *
* Should be called to indicate that we ' re going to process records and the timer should start ticking
* /
public void startTimers ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
}
/ * *
2009-07-10 05:57:00 +08:00
* this method must be implemented by all traversal engines
2009-03-24 04:27:21 +08:00
*
2009-07-10 05:57:00 +08:00
* @param walker the walker to run with
* @param dataProvider the data provider that generates data given the shard
* @param sum the accumulator
2009-03-12 05:43:31 +08:00
*
2009-07-10 05:57:00 +08:00
* @return an object of the reduce type
2009-03-12 05:43:31 +08:00
* /
2010-03-12 02:40:31 +08:00
public abstract T traverse ( WalkerType walker ,
ProviderType dataProvider ,
T sum ) ;
2009-03-03 05:51:25 +08:00
}