From d53d5ffbf6b97e76c2353d3f0670a793571c23ae Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 4 Aug 2010 21:39:02 +0000 Subject: [PATCH] A utility class that computes running average and standard deviation for a stream of numbers it is being fed with. Updates mean/stddev on the fly and does not cache the observations, so it uses no memory and also should be stable against overflow/loss of precision. Simple unit test is also provided (does *not* stress-test the engine with millions of numbers though). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3944 348d0f76-0448-11de-a6fe-93d51630548a --- .../broadinstitute/sting/utils/MathUtils.java | 24 +++++++++++++++++++ .../sting/utils/MathUtilsUnitTest.java | 15 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 209588fff..395326d15 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -714,7 +714,31 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); } + /** A utility class that computes on the fly average and standard deviation for a stream of numbers. + * The number of observations does not have to be known in advance, and can be also very big (so that + * it could overflow any naive summation-based scheme or cause loss of precision). + * Instead, adding a new number observed + * to a sample with add(observed) immediately updates the instance of this object so that + * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 + * (see also e.g. http://www.johndcook.com/standard_deviation.html for online reference). + */ + public static class RunningAverage { + private double mean = 0.0; + private double s = 0.0; + private long obs_count = 0; + public void add(double obs) { + obs_count++; + double oldMean = mean; + mean += ( obs - mean ) / obs_count; // update mean + s += ( obs - oldMean ) * ( obs - mean ); + } + + public double mean() { return mean; } + public double stddev() { return Math.sqrt(s/(obs_count - 1)); } + public long observationCount() { return obs_count; } + } + // // useful common utility routines // diff --git a/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index b6c5d6720..d74e09128 100755 --- a/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -123,5 +123,20 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(BigFiveAlpha.containsAll(FiveAlpha)); Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); } + + /** Tests that we correctly compute mean and standard deviation from a stream of numbers */ + @Test + public void testRunningAverage() { + logger.warn("Executing testRunningAverage"); + + int [] numbers = {1,2,4,5,3,128,25678,-24}; + MathUtils.RunningAverage r = new MathUtils.RunningAverage(); + + for ( int i = 0 ; i < numbers.length ; i++ ) r.add((double)numbers[i]); + + Assert.assertEquals(r.observationCount(),(long)numbers.length); + Assert.assertTrue(r.mean()- 3224.625 < 2e-10 ); + Assert.assertTrue(r.stddev()-9072.6515881128 < 2e-10); + } } \ No newline at end of file