From 7ae19953f50c3e6ebe8462770bd80978313a6e1d Mon Sep 17 00:00:00 2001 From: Nicholas Clarke Date: Fri, 14 Feb 2014 09:35:06 +0000 Subject: [PATCH] Squashed commit of the following: commit 5e73b94eed3d1fc75c88863c2cf07d5972eb348b Merge: e12593a d04a585 Author: Nicholas Clarke Date: Fri Feb 14 09:25:22 2014 +0000 Merge pull request #1 from broadinstitute/checkpoint SimpleTimer passes tests, with formatting commit d04a58533f1bf5e39b0b43018c9db3302943d985 Author: kshakir Date: Fri Feb 14 14:46:01 2014 +0800 SimpleTimer passes tests, with formatting Fixed getNanoOffset() to offset nano to nano, instead of nano to seconds. Updated warning message with comma separated numbers, and exact values of offsets. commit e12593ae66a5e6f0819316f2a580dbc7ae5896ad Author: Nicholas Clarke Date: Wed Feb 12 13:27:07 2014 +0000 Remove instance of 'Timer'. commit 47a73e0b123d4257b57cfc926a5bdd75d709fcf9 Author: Nicholas Clarke Date: Wed Feb 12 12:19:00 2014 +0000 Revert a couple of changes that survived somehow. - CheckpointableTimer,Timer -> SimpleTimer commit d86d9888ae93400514a8119dc2024e0a101f7170 Author: Nicholas Clarke Date: Mon Jan 20 14:13:09 2014 +0000 Revised commits following comments. - All utility merged into `SimpleTimer`. - All tests merged into `SimpleTimerUnitTest`. - Behaviour of `getElapsedTime` should now be consistent with `stop`. - Use 'TimeUnit' class for all unit conversions. - A bit more tidying. commit 354ee49b7fc880e944ff9df4343a86e9a5d477c7 Author: Nicholas Clarke Date: Fri Jan 17 17:04:39 2014 +0000 Add a new CheckpointableTimerUnitTest. Revert SimpleTimerUnitTest to the version before any changes were made. commit 2ad1b6c87c158399ededd706525c776372bbaf6e Author: Nicholas Clarke Date: Tue Jan 14 16:11:18 2014 +0000 Add test specifically checking behaviour under checkpoint/restart. Slight alteration to the checkpointable timer based on observations during the testing - it seems that there's a fair amount of drift between the sources anyway, so each time we stop we resynchronise the offset. Hopefully this should avoid gradual drift building up and presenting as checkpoint/restart drift. commit 1c98881594dc51e4e2365ac95b31d410326d8b53 Author: Nicholas Clarke Date: Tue Jan 14 14:11:31 2014 +0000 Should use consistent time units commit 6f70d42d660b31eee4c2e9d918e74c4129f46036 Author: Nicholas Clarke Date: Tue Jan 14 14:01:10 2014 +0000 Add a new timer supporting checkpoint mechanisms. The issue with this is that the current timer is locked to JVM nanoTime. This can be reset after a checkpoint/restart and result in negative elapsed times, which causes an error. This patch addresses the issue in two ways: - Moves the check on timer information in GenomeAnalysisEngine.java to only occur if a time limit has been set. - Create a new timer (CheckpointableTimer) which keeps track of the relation between system and nano time. If this changes drastically, then the assumption is that there has been a JVM restart owing to checkpoint/restart. Any time straddling a checkpoint/restart event will not be counted towards total running time. Signed-off-by: Khalid Shakir --- .../sting/gatk/GenomeAnalysisEngine.java | 7 +- .../sting/utils/SimpleTimer.java | 83 ++++++++++++++++++- .../sting/utils/SimpleTimerUnitTest.java | 50 +++++++++++ 3 files changed, 134 insertions(+), 6 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0f7333704..8df294b21 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -1205,12 +1205,11 @@ public class GenomeAnalysisEngine { // not yet initialized or not set because of testing return false; - final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); - if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); - if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) return false; - else { + else { + final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); + if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); return runtime > maxRuntimeNano; } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java index 69a2f0c8e..59516f196 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java @@ -29,7 +29,11 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; + +import java.text.NumberFormat; import java.util.concurrent.TimeUnit; +import static java.lang.Math.abs; /** * A useful simple system for timing code with nano second resolution @@ -39,14 +43,42 @@ import java.util.concurrent.TimeUnit; * calls to avoid meaningless results of having multiple starts and stops * called sequentially. * + * This timer has been modified to provide better semantics for dealing with + * system-level checkpoint and restarting. Such events can cause the internal JVM + * clock to be reset, breaking timings based upon it. Whilst this is difficult to + * counter without getting explicit notice of checkpoint events, we try to moderate + * the symptoms through tracking the offset between the system clock and the JVM clock. + * If this offset grows drastically (greater than CLOCK_DRIFT), we infer a JVM restart + * and reset the timer. + * * User: depristo * Date: Dec 10, 2010 * Time: 9:07:44 AM */ public class SimpleTimer { + private final static Logger logger = Logger.getLogger(SimpleTimer.class); protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private static final long MILLI_TO_NANO= TimeUnit.MILLISECONDS.toNanos(1); + private static final ThreadLocal NUMBER_FORMAT = new ThreadLocal() { + @Override + protected NumberFormat initialValue() { + return NumberFormat.getIntegerInstance(); + } + }; + + /** + * Allowable clock drift in nanoseconds. + */ + private static final long CLOCK_DRIFT = TimeUnit.SECONDS.toNanos(5); private final String name; + /** + * The difference between system time and JVM time at last sync. + * This is used to detect JVM checkpoint/restart events, and should be + * reset when a JVM checkpoint/restart is detected. + */ + private long nanoTimeOffset; + /** * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the * sum of times between starts/restrats and stops. @@ -77,6 +109,8 @@ public class SimpleTimer { public SimpleTimer(final String name) { if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); this.name = name; + + this.nanoTimeOffset = getNanoOffset(); } /** @@ -108,6 +142,7 @@ public class SimpleTimer { public synchronized SimpleTimer restart() { running = true; startTimeNano = currentTimeNano(); + nanoTimeOffset = getNanoOffset(); return this; } @@ -134,6 +169,9 @@ public class SimpleTimer { /** * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * This method calls `ensureClockSync` to make sure that the JVM and system clocks + * are roughly in sync since the start of the timer. If they are not, then the time + * elapsed since the previous 'stop' will not be added to the timer. * * It's ok to call stop on a timer that's not running. It has no effect on the timer. * @@ -143,7 +181,9 @@ public class SimpleTimer { public synchronized SimpleTimer stop() { if ( running ) { running = false; - elapsedTimeNano += currentTimeNano() - startTimeNano; + if (ensureClockSync()) { + elapsedTimeNano += currentTimeNano() - startTimeNano; + } } return this; } @@ -168,7 +208,11 @@ public class SimpleTimer { * @return the elapsed time in nanoseconds */ public synchronized long getElapsedTimeNano() { - return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; + if (running && ensureClockSync()) { + return currentTimeNano() - startTimeNano + elapsedTimeNano; + } else { + return elapsedTimeNano; + } } /** @@ -179,4 +223,39 @@ public class SimpleTimer { public synchronized void addElapsed(final SimpleTimer toAdd) { elapsedTimeNano += toAdd.getElapsedTimeNano(); } + + /** + * Get the current offset of nano time from system time. + */ + private static long getNanoOffset() { + return System.nanoTime() - (System.currentTimeMillis() * MILLI_TO_NANO); + } + + /** + * Ensure that the JVM time has remained in sync with system time. + * This will also reset the clocks to avoid gradual drift. + * + * @return true if the clocks are in sync, false otherwise + */ + private boolean ensureClockSync() { + final long currentOffset = getNanoOffset(); + final long diff = abs(currentOffset - nanoTimeOffset); + final boolean ret = (diff <= CLOCK_DRIFT); + if (!ret) { + final NumberFormat numberFormat = NUMBER_FORMAT.get(); + final String msg = String.format( + "Clock drift of %s - %s = %s nanoseconds detected, vs. max allowable drift of %s. " + + "Assuming checkpoint/restart event.", + numberFormat.format(currentOffset), + numberFormat.format(nanoTimeOffset), + numberFormat.format(diff), + numberFormat.format(CLOCK_DRIFT)); + // Log message + logger.warn(msg); + } + // Reset the drift meter to stay in sync. + this.nanoTimeOffset = currentOffset; + return ret; + } + } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index f92cd4bcf..85b79a00f 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -29,6 +29,8 @@ import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.Test; +import java.lang.reflect.Field; + import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; @@ -123,6 +125,54 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); } + @Test + public void testCheckpointRestart() throws Exception { + SimpleTimer t = new SimpleTimer(NAME); + + final Field offsetField = t.getClass().getDeclaredField("nanoTimeOffset"); + offsetField.setAccessible(true); + long offset = ((Long) offsetField.get(t)).longValue(); + + t.start(); + idleLoop(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Time over restart is not zero."); + + t.start(); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + double elapsed = t.getElapsedTime(); + Assert.assertTrue(elapsed >= 0.0, "Elapsed time is zero."); + t.restart(); + // Make the clock jump again by just a little + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(1)); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertTrue(t.getElapsedTime() > elapsed, "Small clock drift causing reset."); + elapsed = t.getElapsedTime(); + // Now a bigger jump, into the future this time. + t.restart(); + // Make the clock jump again by a lot + offsetField.set(t, offset - TimeUnit.SECONDS.toNanos(10)); + t.stop(); + Assert.assertEquals(t.getElapsedTime(), elapsed, "Time added over checkpoint/restart."); + + // Test without stopping + t.start(); + offset = ((Long) offsetField.get(t)).longValue(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Elapsed time after C/R is not zero."); + idleLoop(); + Assert.assertTrue(t.getElapsedTime() > 0.0, "Elapsed time zero after re-sync."); + + } + private static void idleLoop() { for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time }