From 54660a8c25bb1179bac9261305772728156bdd51 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 20 Apr 2011 03:22:48 +0000 Subject: [PATCH] Fix requested by Lee Lichtenstein: first check to see whether it's time for a progress message, then aggregate metrics. Makes the overhead of printProgress in RealignerTargetCreator go from >20% to ~3%. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5663 348d0f76-0448-11de-a6fe-93d51630548a --- .../executive/HierarchicalMicroScheduler.java | 2 +- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../sting/gatk/executive/MicroScheduler.java | 4 +- .../gatk/traversals/TraversalEngine.java | 102 +++++++++--------- .../traversals/TraverseReadsUnitTest.java | 1 - 5 files changed, 53 insertions(+), 58 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 384f8cdf7..8a27e008c 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -176,7 +176,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar outputTracker.bypassThreadLocalStorage(true); try { walker.onTraversalDone(result); - printOnTraversalDone(result,engine.getCumulativeMetrics()); + printOnTraversalDone(result); } finally { outputTracker.bypassThreadLocalStorage(false); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 9d8a57df0..a1c19daff 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -84,7 +84,7 @@ public class LinearMicroScheduler extends MicroScheduler { Object result = accumulator.finishTraversal(); - printOnTraversalDone(result,engine.getCumulativeMetrics()); + printOnTraversalDone(result); outputTracker.close(); cleanup(); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index eb02a591a..ca823235d 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -217,8 +217,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * Print summary information for the analysis. * @param sum The final reduce output. */ - protected void printOnTraversalDone(Object sum, ReadMetrics metrics) { - traversalEngine.printOnTraversalDone(metrics); + protected void printOnTraversalDone(Object sum) { + traversalEngine.printOnTraversalDone(); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 359bbe6b3..40ae8074f 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -119,6 +119,7 @@ public abstract class TraversalEngine,Provide private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + private boolean progressMeterInitialized = false; // for performance log private static final boolean PERFORMANCE_LOG_ENABLED = true; @@ -229,10 +230,7 @@ public abstract class TraversalEngine,Provide */ public void printProgress(Shard shard, GenomeLoc loc) { // A bypass is inserted here for unit testing. - // TODO: print metrics outside of the traversal engine to more easily handle cumulative stats. - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics().clone() : new ReadMetrics(); - cumulativeMetrics.incrementMetrics(shard.getReadMetrics()); - printProgress(loc, cumulativeMetrics, false); + printProgress(loc,shard.getReadMetrics(),false); } /** @@ -240,66 +238,62 @@ public abstract class TraversalEngine,Provide * every M seconds, for N and M set in global variables. * * @param loc Current location, can be null if you are at the end of the traversal - * @param metrics Metrics of reads filtered in/out. + * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) { - final long nRecords = metrics.getNumIterations(); - - if ( nRecords == 1 && mustPrint == false ) { + if(!progressMeterInitialized && mustPrint == false ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", getTraversalType(), getTraversalType())); - + progressMeterInitialized = true; } - else { - final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); - boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); - if ( printProgress || printLog ) { - ProcessingHistory last = updateHistory(loc, metrics); + final long curTime = timer.currentTime(); + boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); + boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); - final MyTime elapsed = new MyTime(last.elapsedSeconds); - final MyTime bpRate = new MyTime(secondsPerMillionBP(last)); - final MyTime unitRate = new MyTime(secondsPerMillionElements(last)); - final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); - final MyTime estTotalRuntime = new MyTime(elapsed.t / fractionGenomeTargetCompleted); - final MyTime timeToCompletion = new MyTime(estTotalRuntime.t - elapsed.t); + if ( printProgress || printLog ) { + // getting and appending metrics data actually turns out to be quite a heavyweight + // operation. Postpone it until after determining whether to print the log message. + ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics() : new ReadMetrics(); + if(metrics != null) + cumulativeMetrics.incrementMetrics(metrics); - if ( printProgress ) { - lastProgressPrintTime = curTime; - - // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates - if ( estTotalRuntime.t > TWELVE_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds - else if ( estTotalRuntime.t > TWO_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds - else - PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + final long nRecords = cumulativeMetrics.getNumIterations(); -// String common = String.format("%4.1e %s in %s, %s per 1M %s, %4.1f%% complete, est. runtime %s, %s remaining", -// nRecords*1.0, getTraversalType(), elapsed, unitRate, -// getTraversalType(), 100*fractionGenomeTargetCompleted, -// estTotalRuntime, timeToCompletion); -// -// if (loc != null) -// logger.info(String.format("%20s: processing %s", loc, common)); -// else -// logger.info(String.format("Processing %s", common)); - logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", - loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, - 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); + ProcessingHistory last = updateHistory(loc,cumulativeMetrics); - } + final MyTime elapsed = new MyTime(last.elapsedSeconds); + final MyTime bpRate = new MyTime(secondsPerMillionBP(last)); + final MyTime unitRate = new MyTime(secondsPerMillionElements(last)); + final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); + final MyTime estTotalRuntime = new MyTime(elapsed.t / fractionGenomeTargetCompleted); + final MyTime timeToCompletion = new MyTime(estTotalRuntime.t - elapsed.t); - if ( printLog ) { - lastPerformanceLogPrintTime = curTime; - synchronized(performanceLogLock) { - performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", - elapsed.t, nRecords, unitRate.t, last.bpProcessed, bpRate.t, - fractionGenomeTargetCompleted, estTotalRuntime.t, timeToCompletion.t); - } + if ( printProgress ) { + lastProgressPrintTime = curTime; + + // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates + if ( estTotalRuntime.t > TWELVE_HOURS_IN_SECONDS ) + PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds + else if ( estTotalRuntime.t > TWO_HOURS_IN_SECONDS ) + PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds + else + PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + + logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", + loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, + 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); + + } + + if ( printLog ) { + lastPerformanceLogPrintTime = curTime; + synchronized(performanceLogLock) { + performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", + elapsed.t, nRecords, unitRate.t, last.bpProcessed, bpRate.t, + fractionGenomeTargetCompleted, estTotalRuntime.t, timeToCompletion.t); } } } @@ -345,11 +339,13 @@ public abstract class TraversalEngine,Provide /** * Called after a traversal to print out information about the traversal process */ - public void printOnTraversalDone(ReadMetrics cumulativeMetrics) { - printProgress(null, cumulativeMetrics, true); + public void printOnTraversalDone() { + printProgress(null, null, true); final double elapsed = timer.getElapsedTime(); + ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + // count up the number of skipped reads by summing over all filters long nSkippedReads = 0L; for ( Map.Entry countsByFilter: cumulativeMetrics.getCountsByFilter().entrySet()) diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 446df326c..c0d32a05b 100755 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -138,7 +138,6 @@ public class TraverseReadsUnitTest extends BaseTest { dataProvider.close(); } - traversalEngine.printOnTraversalDone(new ReadMetrics()); countReadWalker.onTraversalDone(accumulator); if (!(accumulator instanceof Integer)) {