Minor changes to MannWhitneyU:

- Comment fixes to better explain why two-sided test wants to use the LOWER (not higher) value for U - Much more direct testing of MWU functions - Uniform approximation was always using the < cumulant (sometimes the > cumulant should be used instead) - Uniform approximation currently not used (regime in which it was being used was not the right one -- not necessarily bad, but not an improvement over normal) + this particular approximation is for major imbalances of the form m >> n. Code may be altered in the future to use this method for this particular regime, if the method's not too slow. - Hook into one-sided test. RegionalAssociationRecalibrator: NaNs were being caused by presence of Infinity and -Infinity values out of the walker. Currently I'm just re-setting them to arbitrary post-whitened values, but the walker will be changed to prevent output of these values, and the "fix" will undone. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5539 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-30 17:03:02 +00:00 · 2011-03-30 17:03:02 +00:00 · 328f89f66a
parent fff11a3279
commit 328f89f66a
3 changed files with 102 additions and 41 deletions
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java
@ -207,6 +207,9 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
                if ( Double.compare(v,Double.NaN) == 0 || Double.compare(v1,Double.NaN) == 0 ) {
                    return 0.0;
                }
+
+		if ( Double.compare(v,Double.POSITIVE_INFINITY) == 0 ) { return 50.0; } // todo -- fixme
+		if ( Double.compare(v,Double.NEGATIVE_INFINITY) == 0 ) { return -50.0; } // todo -- fixme, really should be an exception
                return v - v1;
            }
        };
@ -244,14 +247,6 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati

        public double[] whiten(double[] value) {
            DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
-            double[] h = ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
-            for ( double d : h ) {
-                if ( d == Double.NaN ) {
-                    logger.debug(vec.toString());
-                    logger.debug(means.toString());
-                }
-            }
-
            return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
        }
    }
--- a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
+++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
@ -70,8 +70,8 @@ public class MannWhitneyU {

    /**
     * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
     * @param u - the Mann-Whitney U value
     * @param twoSided - is the test twosided
     * @return the (possibly approximate) p-value associated with the MWU test
@ -84,13 +84,15 @@ public class MannWhitneyU {
        } else if ( n > 8 && m > 8 ) {
            // large m and n - normal approx
            pval = calculatePNormalApproximation(n,m,u);
-        } else if ( n > 4 && m > 7 ) {
+        } else if ( n > 5 && m > 7 ) {
            // large m, small n - sum uniform approx
-            pval = calculatePUniformApproximation(n,m,u);
+            // todo -- find the appropriate regimes where this approximation is actually better
+            // pval = calculatePUniformApproximation(n,m,u);
+            pval = calculatePNormalApproximation(n,m,u);
        } else if ( n > 8 || m > 8 ) {
            pval = calculatePFromTable(n,m,u);
        } else {
-            // small m [possibly small n] - full approx
+            // small m and n [possibly small n] - full approx
            pval = calculatePRecursively(n,m,u);
        }

@ -99,15 +101,14 @@ public class MannWhitneyU {

    public static double calculatePFromTable(int n, int m, long u) {
        // todo -- actually use a table for:
-        // todo      - n small, m large
        // todo      - n large, m small
        return calculatePUniformApproximation(n,m,u);
    }

    /**
     * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
     * @param u - the Mann-Whitney U value
     * @return p-value associated with the normal approximation
     */
@ -118,8 +119,8 @@ public class MannWhitneyU {

    /**
     * Calculates the Z-score approximation of the u-statistic
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
     * @param u - the Mann-Whitney U value
     * @return z-score associated with the normal approximation
     */
@ -132,21 +133,25 @@ public class MannWhitneyU {

    /**
     * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
-     * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
-     * @param n -
-     * @param m -
-     * @param u -
-     * @return
+     * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF)
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
+     * @param u - mann-whitney u value
+     * @return p-value according to sum of uniform approx
     */
    public static double calculatePUniformApproximation(int n, int m, long u) {
        long R = u + (n*(n+1))/2;
        double a = Math.sqrt(m*(n+m+1));
        double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
-        double z = b + R/a;
-        if ( z < 0 ) { return 0.0; }
-        else if ( z > n ) { return 1.0; }
+        double z = b + ((double)R)/a;
+        if ( z < 0 ) { return 1.0; }
+        else if ( z > n ) { return 0.0; }
        else {
-            return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            if ( z > ((double) n) /2 ) {
+                return 1.0-1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            } else {
+                return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            }
        }
    }

@ -167,11 +172,13 @@ public class MannWhitneyU {
    /**
     * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
     * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
-     * later on simply by multiplying the p-value by 2
-     * @param observed
+     * later on simply by multiplying the p-value by 2.
+     *
+     * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen
+     * @param observed - the observed data
     * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
     */
-    public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
+    public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed) {
        int set1SeenSoFar = 0;
        int set2SeenSoFar = 0;
        long uSet1DomSet2 = 0;
@ -201,36 +208,45 @@ public class MannWhitneyU {

    /**
     * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates
-     * the other U-set
+     * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1.
     * @param observed - the observed data points, tagged by each set
     * @param dominator - the set that is hypothesized to be stochastically dominating
     * @return the u-statistic associated with the hypothesis
     */
    public static long calculateOneSidedU(TreeSet<Pair<Number,USet>> observed,USet dominator) {
-        long domninatorBeforeOther = 0l;
-        int domSeenSoFar = 0;
+        long otherBeforeDominator = 0l;
+        int otherSeenSoFar = 0;
        for ( Pair<Number,USet> dataPoint : observed ) {
-            if ( dataPoint.second == dominator ) {
-                ++domSeenSoFar;
+            if ( dataPoint.second != dominator ) {
+                ++otherSeenSoFar;
            } else {
-                domninatorBeforeOther += domSeenSoFar;
+                otherBeforeDominator += otherSeenSoFar;
            }
        }

-        return domninatorBeforeOther;
+        return otherBeforeDominator;
    }

    /**
     * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
     * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
     * calls into that recursive calculation.
-     * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
+     * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
     * @param m: number of set-two entries
     * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
     * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
     */
    public static double calculatePRecursively(int n, int m, long u) {
-        if ( m > 7 && n > 4 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
+        if ( m > 8 && n > 5 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
+        return cpr(n,m,u);
+    }
+
+    /**
+     * Hook into CPR with sufficient warning (for testing purposes)
+     * @deprecated - for testing only (really)
+     */
+    @Deprecated
+    public static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) {
        return cpr(n,m,u);
    }

@ -264,6 +280,15 @@ public class MannWhitneyU {
        return observations;
    }

+    /**
+     * hook into the set sizes, for testing purposes only
+     * @return size set 1, size set 2
+     * @deprecated - only for testing
+     */
+    public Pair<Integer,Integer> getSetSizes() {
+        return new Pair<Integer,Integer>(sizeSet1,sizeSet2);
+    }
+
    /**
     * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
     * and to ensure that rank ties are broken at random.
--- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
+++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
@ -113,8 +113,49 @@ public class RegionalAssociationUnitTest extends BaseTest {
        mwu.add(9,MannWhitneyU.USet.SET1);
        mwu.add(10,MannWhitneyU.USet.SET1);
        mwu.add(11,MannWhitneyU.USet.SET2);
-        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l);
-        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
+
+        MannWhitneyU mwu2 = new MannWhitneyU();
+        for ( int dp : new int[]{2,4,5,6,8} ) {
+            mwu2.add(dp,MannWhitneyU.USet.SET1);
+        }
+
+        for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
+            mwu2.add(dp,MannWhitneyU.USet.SET2);
+        }
+
+        // tests using the hypothesis that set 2 dominates set 1 (U value = 10)
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
+        Pair<Integer,Integer> sizes = mwu2.getSetSizes();
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
+        Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
+
+        // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
+
+        logger.warn("Set 1");
+        Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
+        logger.warn("Set 2");
+        Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
+
+        MannWhitneyU mwu3 = new MannWhitneyU();
+        for ( int dp : new int[]{0,2,4} ) {
+            mwu3.add(dp,MannWhitneyU.USet.SET1);
+        }
+        for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
+            mwu3.add(dp,MannWhitneyU.USet.SET2);
+        }
+        long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
+        Pair<Integer,Integer> nums = mwu3.getSetSizes();
+        Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);

    }