Added knuth-shuffle (KS) and randomSubset using KS to MathUtils

* Knuth-shuffle is a simple, yet effective array permutator (hope this is good english). * added a simple randomSubset that returns a random subset without repeats of any given array with the same probability for every permutation. * added unit tests to both functions
2011-12-29 00:41:59 -05:00 · 2011-12-29 00:41:59 -05:00 · cd68cc239b
parent 94791a2a75
commit cd68cc239b
2 changed files with 545 additions and 365 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@ -25,6 +25,7 @@
 package org.broadinstitute.sting.utils;
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -49,8 +50,11 @@ public class MathUtils {
     */
-    /** Private constructor.  No instantiating this class! */
+    /**
-    private MathUtils() {}
+     * Private constructor.  No instantiating this class!
     */
    private MathUtils() {
    }
    @Requires({"d > 0.0"})
    public static int fastPositiveRound(double d) {
@ -100,8 +104,12 @@ public class MathUtils {
    public static double variance(Collection<Number> numbers, Number mean, boolean ignoreNan) {
        double mn = mean.doubleValue();
        double var = 0;
-        for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; }
+        for (Number n : numbers) {
-        if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); }
+            var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0;
        }
        if (ignoreNan) {
            return var / (nonNanSize(numbers) - 1);
        }
        return var / (numbers.size() - 1);
    }
@ -126,6 +134,7 @@ public class MathUtils {
    /**
     * Calculates the log10 cumulative sum of an array with log10 probabilities
     *
     * @param log10p the array with log10 probabilites
     * @param upTo   index in the array to calculate the cumsum up to
     * @return the log10 of the cumulative sum
@ -136,6 +145,7 @@ public class MathUtils {
    /**
     * Converts a real space array of probabilities into a log10 array
     *
     * @param prRealSpace
     * @return
     */
@ -219,7 +229,9 @@ public class MathUtils {
     * @param b the second double value
     * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a.
     */
-    public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); }
+    public static byte compareDoubles(double a, double b) {
        return compareDoubles(a, b, 1e-6);
    }
    /**
     * Compares double values for equality (within epsilon), or inequality.
@ -229,10 +241,13 @@ public class MathUtils {
     * @param epsilon the precision within which two double values will be considered equal
     * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a.
     */
-    public static byte compareDoubles(double a, double b, double epsilon)
+    public static byte compareDoubles(double a, double b, double epsilon) {
-    {
+        if (Math.abs(a - b) < epsilon) {
-        if (Math.abs(a - b) < epsilon) { return 0; }
+            return 0;
-        if (a > b) { return -1; }
+        }
        if (a > b) {
            return -1;
        }
        return 1;
    }
@ -243,7 +258,9 @@ public class MathUtils {
     * @param b the second float value
     * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a.
     */
-    public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); }
+    public static byte compareFloats(float a, float b) {
        return compareFloats(a, b, 1e-6f);
    }
    /**
     * Compares float values for equality (within epsilon), or inequality.
@ -253,15 +270,17 @@ public class MathUtils {
     * @param epsilon the precision within which two float values will be considered equal
     * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a.
     */
-    public static byte compareFloats(float a, float b, float epsilon)
+    public static byte compareFloats(float a, float b, float epsilon) {
-    {
+        if (Math.abs(a - b) < epsilon) {
-        if (Math.abs(a - b) < epsilon) { return 0; }
+            return 0;
-        if (a > b) { return -1; }
+        }
        if (a > b) {
            return -1;
        }
        return 1;
    }
-    public static double NormalDistribution(double mean, double sd, double x)
+    public static double NormalDistribution(double mean, double sd, double x) {
    {
        double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI));
        double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd)));
        return a * b;
@ -270,17 +289,17 @@ public class MathUtils {
    public static double binomialCoefficient(int n, int k) {
        return Math.pow(10, log10BinomialCoefficient(n, k));
    }
    /**
     * Computes a binomial probability.  This is computed using the formula
-     *
+     * <p/>
     * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k )
-     *
+     * <p/>
     * where n is the number of trials, k is the number of successes, and p is the probability of success
     *
     * @param n number of Bernoulli trials
     * @param k number of successes
     * @param p probability of success
     *
     * @return the binomial probability of the specified configuration.  Computes values down to about 1e-237.
     */
    public static double binomialProbability(int n, int k, double p) {
@ -289,6 +308,7 @@ public class MathUtils {
    /**
     * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space.
     *
     * @param start   - start of the cumulant sum (over hits)
     * @param end     - end of the cumulant sum (over hits)
     * @param total   - number of attempts for the number of hits
@ -318,9 +338,9 @@ public class MathUtils {
    /**
     * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers.
     * This is computed using the formula:
-     *
+     * <p/>
     * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
-     *
+     * <p/>
     * where xi represents the number of times outcome i was observed, n is the number of total observations.
     * In this implementation, the value of n is inferred as the sum over i of xi.
     *
@ -339,9 +359,9 @@ public class MathUtils {
    /**
     * Computes a multinomial probability efficiently avoiding overflow even for large numbers.
     * This is computed using the formula:
-     *
+     * <p/>
     * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk)
-     *
+     * <p/>
     * where xi represents the number of times outcome i was observed, n is the number of total observations, and
     * pi represents the probability of the i-th outcome to occur.  In this implementation, the value of n is
     * inferred as the sum over i of xi.
@ -365,6 +385,7 @@ public class MathUtils {
    /**
     * calculate the Root Mean Square of an array of integers
     *
     * @param x an byte[] of numbers
     * @return the RMS of the specified numbers.
     */
@ -382,6 +403,7 @@ public class MathUtils {
    /**
     * calculate the Root Mean Square of an array of integers
     *
     * @param x an int[] of numbers
     * @return the RMS of the specified numbers.
     */
@ -398,6 +420,7 @@ public class MathUtils {
    /**
     * calculate the Root Mean Square of an array of doubles
     *
     * @param x a double[] of numbers
     * @return the RMS of the specified numbers.
     */
@ -444,7 +467,6 @@ public class MathUtils {
     *
     * @param array             the array to be normalized
     * @param takeLog10OfOutput if true, the output will be transformed back into log10 units
     *
     * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed
     */
    public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) {
@ -504,11 +526,11 @@ public class MathUtils {
        return normalized;
    }
    /**
     * normalizes the log10-based array.  ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
     *
     * @param array the array to be normalized
     *
     * @return a newly allocated array corresponding the normalized values in array
     */
    public static double[] normalizeFromLog10(double[] array) {
@ -749,7 +771,9 @@ public class MathUtils {
    }
-    /** Draw N random elements from list. */
+    /**
     * Draw N random elements from list.
     */
    public static <T> List<T> randomSubset(List<T> list, int N) {
        if (list.size() <= N) {
            return list;
@ -770,6 +794,25 @@ public class MathUtils {
        return ans;
    }
    /**
     * Draw N random elements from an array.
     *
     * @param array your objects
     * @param n     number of elements to select at random from the list
     * @return a new list with the N randomly chosen elements from list
     */
    @Requires({"array != null", "n>=0"})
    @Ensures({"result != null", "result.length == Math.min(n, array.length)"})
    public static Object[] randomSubset(final Object[] array, final int n) {
        if (array.length <= n)
            return array.clone();
        Object[] shuffledArray = arrayShuffle(array);
        Object[] result = new Object[n];
        System.arraycopy(shuffledArray, 0, result, 0, n);
        return result;
    }
    public static double percentage(double x, double base) {
        return (base > 0 ? (x / base) * 100.0 : 0);
    }
@ -872,7 +915,7 @@ public class MathUtils {
    /**
     * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times
-
+     *
     * @param indices the list of indices for elements to extract
     * @param list    the list from which the elements should be extracted
     * @param <T>     the template type of the ArrayList
@ -967,7 +1010,8 @@ public class MathUtils {
        return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
    }
-    /** A utility class that computes on the fly average and standard deviation for a stream of numbers.
+    /**
     * A utility class that computes on the fly average and standard deviation for a stream of numbers.
     * The number of observations does not have to be known in advance, and can be also very big (so that
     * it could overflow any naive summation-based scheme or cause loss of precision).
     * Instead, adding a new number <code>observed</code>
@ -993,10 +1037,21 @@ public class MathUtils {
            }
        }
-        public double mean() { return mean; }
+        public double mean() {
-        public double stddev() { return Math.sqrt(s/(obs_count - 1)); }
+            return mean;
-        public double var() { return s/(obs_count - 1); }
+        }
-        public long observationCount() { return obs_count; }
+
        public double stddev() {
            return Math.sqrt(s / (obs_count - 1));
        }
        public double var() {
            return s / (obs_count - 1);
        }
        public long observationCount() {
            return obs_count;
        }
        public RunningAverage clone() {
            RunningAverage ra = new RunningAverage();
@ -1018,14 +1073,29 @@ public class MathUtils {
    //
    // useful common utility routines
    //
-    public static double rate(long n, long d)           { return n / (1.0 * Math.max(d, 1)); }
+    public static double rate(long n, long d) {
-    public static double rate(int n, int d)             { return n / (1.0 * Math.max(d, 1)); }
+        return n / (1.0 * Math.max(d, 1));
    }
-    public static long inverseRate(long n, long d)      { return n == 0 ? 0 : d / Math.max(n, 1); }
+    public static double rate(int n, int d) {
-    public static long inverseRate(int n, int d)        { return n == 0 ? 0 : d / Math.max(n, 1); }
+        return n / (1.0 * Math.max(d, 1));
    }
-    public static double ratio(int num, int denom)      { return ((double)num) / (Math.max(denom, 1)); }
+    public static long inverseRate(long n, long d) {
-    public static double ratio(long num, long denom)    { return ((double)num) / (Math.max(denom, 1)); }
+        return n == 0 ? 0 : d / Math.max(n, 1);
    }
    public static long inverseRate(int n, int d) {
        return n == 0 ? 0 : d / Math.max(n, 1);
    }
    public static double ratio(int num, int denom) {
        return ((double) num) / (Math.max(denom, 1));
    }
    public static double ratio(long num, long denom) {
        return ((double) num) / (Math.max(denom, 1));
    }
    public static final double[] log10Cache;
    public static final double[] jacobianLogTable;
@ -1093,8 +1163,7 @@ public class MathUtils {
        else if (diff >= 0) {
            int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
            return x + jacobianLogTable[ind];
-        }
+        } else {
        else {
            int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
            return y + jacobianLogTable[ind];
        }
@ -1110,6 +1179,7 @@ public class MathUtils {
    /**
     * Returns the phred scaled value of probability p
     *
     * @param p probability (between 0 and 1).
     * @return phred scaled probability of p
     */
@ -1123,6 +1193,7 @@ public class MathUtils {
    /**
     * Converts LN to LOG10
     *
     * @param ln log(x)
     * @return log10(x)
     */
@ -1240,14 +1311,28 @@ public class MathUtils {
        else if (ix < 0x40000000) {
            if (ix <= 0x3feccccc) {     /* lgamma(x) = lgamma(x+1)-log(x) */
                r = -Math.log(x);
-                if(ix>=0x3FE76944) {y = one-x; i= 0;}
+                if (ix >= 0x3FE76944) {
-                else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
+                    y = one - x;
-                else {y = x; i=2;}
+                    i = 0;
                } else if (ix >= 0x3FCDA661) {
                    y = x - (tc - one);
                    i = 1;
                } else {
                    y = x;
                    i = 2;
                }
            } else {
                r = zero;
-                if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
+                if (ix >= 0x3FFBB4C3) {
-                else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
+                    y = 2.0 - x;
-                else {y=x-one;i=2;}
+                    i = 0;
                } /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) {
                    y = x - tc;
                    i = 1;
                } /* [1.23,1.73] */ else {
                    y = x - one;
                    i = 2;
                }
            }
            switch (i) {
@ -1256,7 +1341,8 @@ public class MathUtils {
                    p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
                    p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
                    p = y * p1 + p2;
-                r  += (p-0.5*y); break;
+                    r += (p - 0.5 * y);
                    break;
                case 1:
                    z = y * y;
                    w = z * y;
@ -1264,14 +1350,14 @@ public class MathUtils {
                    p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
                    p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
                    p = z * p1 - (tt - w * (p2 + y * p3));
-                r += (tf + p); break;
+                    r += (tf + p);
                    break;
                case 2:
                    p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
                    p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
                    r += (-0.5 * y + p1 / p2);
            }
-        }
+        } else if (ix < 0x40200000) {             /* x < 8.0 */
        else if(ix<0x40200000) { 			/* x < 8.0 */
            i = (int) x;
            t = zero;
            y = x - (double) i;
@ -1280,12 +1366,18 @@ public class MathUtils {
            r = half * y + p / q;
            z = one;    /* lgamma(1+s) = log(s) + lgamma(s) */
            switch (i) {
-            case 7: z *= (y+6.0);	/* FALLTHRU */
+                case 7:
-            case 6: z *= (y+5.0);	/* FALLTHRU */
+                    z *= (y + 6.0);    /* FALLTHRU */
-            case 5: z *= (y+4.0);	/* FALLTHRU */
+                case 6:
-            case 4: z *= (y+3.0);	/* FALLTHRU */
+                    z *= (y + 5.0);    /* FALLTHRU */
-            case 3: z *= (y+2.0);	/* FALLTHRU */
+                case 5:
-                r += Math.log(z); break;
+                    z *= (y + 4.0);    /* FALLTHRU */
                case 4:
                    z *= (y + 3.0);    /* FALLTHRU */
                case 3:
                    z *= (y + 2.0);    /* FALLTHRU */
                    r += Math.log(z);
                    break;
            }
            /* 8.0 <= x < 2**58 */
        } else if (ix < 0x43900000) {
@ -1372,4 +1464,40 @@ public class MathUtils {
    public static double log10Factorial(int x) {
        return log10Gamma(x + 1);
    }
    /**
     * Adds two arrays together and returns a new array with the sum.
     *
     * @param a one array
     * @param b another array
     * @return a new array with the sum of a and b
     */
    @Requires("a.length == b.length")
    @Ensures("result.length == a.length")
    public static int[] addArrays(int[] a, int[] b) {
        int[] c = new int[a.length];
        for (int i = 0; i < a.length; i++)
            c[i] = a[i] + b[i];
        return c;
    }
    /**
     * Quick implementation of the Knuth-shuffle algorithm to generate a random
     * permutation of the given array.
     *
     * @param array the original array
     * @return a new array with the elements shuffled
     */
    public static Object[] arrayShuffle(Object[] array) {
        int n = array.length;
        Object[] shuffled = array.clone();
        for (int i = 0; i < n; i++) {
            int j = i + GenomeAnalysisEngine.getRandomGenerator().nextInt(n - i);
            Object tmp = shuffled[i];
            shuffled[i] = shuffled[j];
            shuffled[j] = tmp;
        }
        return shuffled;
    }
 }
--- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
@ -26,22 +26,20 @@
 package org.broadinstitute.sting.utils;
 import org.broadinstitute.sting.BaseTest;
 import org.testng.Assert;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.Test;
 import org.broadinstitute.sting.BaseTest;
-
+import java.util.*;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Collections;
 /**
 * Basic unit test for MathUtils
 */
 public class MathUtilsUnitTest extends BaseTest {
    @BeforeClass
-    public void init() { }
+    public void init() {
    }
    /**
     * Tests that we get the right values from the binomial distribution
@ -123,7 +121,9 @@ public class MathUtilsUnitTest extends BaseTest {
        Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha));
    }
-    /** Tests that we correctly compute mean and standard deviation from a stream of numbers */
+    /**
     * Tests that we correctly compute mean and standard deviation from a stream of numbers
     */
    @Test
    public void testRunningAverage() {
        logger.warn("Executing testRunningAverage");
@ -174,4 +174,56 @@ public class MathUtilsUnitTest extends BaseTest {
        Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1);
    }
    @Test(enabled = true)
    public void testRandomSubset() {
        Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
        Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0);
        Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1);
        Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2);
        Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3);
        Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4);
        Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5);
        Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6);
        Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7);
        Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8);
        Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9);
        Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10);
        Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10);
        for (int i = 0; i < 25; i++)
            Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5)));
    }
    @Test(enabled = true)
    public void testArrayShuffle() {
        Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
        for (int i = 0; i < 25; i++) {
            Object[] t = MathUtils.arrayShuffle(x);
            Assert.assertTrue(hasUniqueElements(t));
            Assert.assertTrue(hasAllElements(x, t));
        }
    }
    private boolean hasUniqueElements(Object[] x) {
        for (int i = 0; i < x.length; i++)
            for (int j = i + 1; j < x.length; j++)
                if (x[i].equals(x[j]) || x[i] == x[j])
                    return false;
        return true;
    }
    private boolean hasAllElements(final Object[] expected, final Object[] actual) {
        HashSet<Object> set = new HashSet<Object>();
        set.addAll(Arrays.asList(expected));
        set.removeAll(Arrays.asList(actual));
        return set.isEmpty();
    }
    private void p (Object []x) {
        for (Object v: x)
            System.out.print((Integer) v + " ");
        System.out.println();
    }
 }