From 328f89f66ace0a43409a6edd6616dcfa65024ba9 Mon Sep 17 00:00:00 2001
From: chartl <chartl@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Wed, 30 Mar 2011 17:03:02 +0000
Subject: [PATCH] Minor changes to MannWhitneyU:  - Comment fixes to better
 explain why two-sided test wants to use the LOWER (not higher) value for U  -
 Much more direct testing of MWU functions  - Uniform approximation was always
 using the < cumulant (sometimes the > cumulant should be used instead)  -
 Uniform approximation currently not used (regime in which it was being used
 was not the right one -- not necessarily bad, but not an improvement over
 normal)     + this particular approximation is for major imbalances of the
 form m >> n. Code may be altered in the future to use this method for this
 particular regime, if the method's not too slow.  - Hook into one-sided test.

RegionalAssociationRecalibrator: NaNs were being caused by presence of Infinity and -Infinity values out of the walker. Currently I'm just re-setting them to arbitrary post-whitened values, but the walker will be changed to prevent output of these values, and the "fix" will undone.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5539 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../RegionalAssociationRecalibrator.java      | 11 +--
 .../sting/utils/MannWhitneyU.java             | 87 ++++++++++++-------
 .../walkers/RegionalAssociationUnitTest.java  | 45 +++++++++-
 3 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java
index d7931a0f0..56acdbe77 100755
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java
@@ -207,6 +207,9 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
                 if ( Double.compare(v,Double.NaN) == 0 || Double.compare(v1,Double.NaN) == 0 ) {
                     return 0.0;
                 }
+
+		if ( Double.compare(v,Double.POSITIVE_INFINITY) == 0 ) { return 50.0; } // todo -- fixme
+		if ( Double.compare(v,Double.NEGATIVE_INFINITY) == 0 ) { return -50.0; } // todo -- fixme, really should be an exception
                 return v - v1;
             }
         };
@@ -244,14 +247,6 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
 
         public double[] whiten(double[] value) {
             DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
-            double[] h = ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
-            for ( double d : h ) {
-                if ( d == Double.NaN ) {
-                    logger.debug(vec.toString());
-                    logger.debug(means.toString());
-                }
-            }
-
             return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
         }
     }
diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
index 27ee25e5b..a9f066421 100755
--- a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
+++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
@@ -70,8 +70,8 @@ public class MannWhitneyU {
 
     /**
      * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
      * @param u - the Mann-Whitney U value
      * @param twoSided - is the test twosided
      * @return the (possibly approximate) p-value associated with the MWU test
@@ -84,13 +84,15 @@ public class MannWhitneyU {
         } else if ( n > 8 && m > 8 ) {
             // large m and n - normal approx
             pval = calculatePNormalApproximation(n,m,u);
-        } else if ( n > 4 && m > 7 ) {
+        } else if ( n > 5 && m > 7 ) {
             // large m, small n - sum uniform approx
-            pval = calculatePUniformApproximation(n,m,u);
+            // todo -- find the appropriate regimes where this approximation is actually better
+            // pval = calculatePUniformApproximation(n,m,u);
+            pval = calculatePNormalApproximation(n,m,u);
         } else if ( n > 8 || m > 8 ) {
             pval = calculatePFromTable(n,m,u);
         } else {
-            // small m [possibly small n] - full approx
+            // small m and n [possibly small n] - full approx
             pval = calculatePRecursively(n,m,u);
         }
 
@@ -99,15 +101,14 @@ public class MannWhitneyU {
 
     public static double calculatePFromTable(int n, int m, long u) {
         // todo -- actually use a table for:
-        // todo      - n small, m large
         // todo      - n large, m small
         return calculatePUniformApproximation(n,m,u);
     }
 
     /**
      * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
      * @param u - the Mann-Whitney U value
      * @return p-value associated with the normal approximation
      */
@@ -118,8 +119,8 @@ public class MannWhitneyU {
 
     /**
      * Calculates the Z-score approximation of the u-statistic
-     * @param n - The number of entries in the DOMINATED set
-     * @param m - The number of entries in the DOMINANT set
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
      * @param u - the Mann-Whitney U value
      * @return z-score associated with the normal approximation
      */
@@ -132,21 +133,25 @@ public class MannWhitneyU {
 
     /**
      * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
-     * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
-     * @param n -
-     * @param m -
-     * @param u -
-     * @return
+     * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF)
+     * @param n - The number of entries in the stochastically smaller (dominant) set
+     * @param m - The number of entries in the stochastically larger (dominated) set
+     * @param u - mann-whitney u value
+     * @return p-value according to sum of uniform approx
      */
     public static double calculatePUniformApproximation(int n, int m, long u) {
         long R = u + (n*(n+1))/2;
         double a = Math.sqrt(m*(n+m+1));
         double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
-        double z = b + R/a;
-        if ( z < 0 ) { return 0.0; }
-        else if ( z > n ) { return 1.0; }
+        double z = b + ((double)R)/a;
+        if ( z < 0 ) { return 1.0; }
+        else if ( z > n ) { return 0.0; }
         else {
-            return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            if ( z > ((double) n) /2 ) {
+                return 1.0-1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            } else {
+                return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
+            }
         }
     }
 
@@ -167,11 +172,13 @@ public class MannWhitneyU {
     /**
      * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
      * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
-     * later on simply by multiplying the p-value by 2
-     * @param observed
+     * later on simply by multiplying the p-value by 2.
+     *
+     * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen
+     * @param observed - the observed data
      * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
      */
-    public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
+    public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed) {
         int set1SeenSoFar = 0;
         int set2SeenSoFar = 0;
         long uSet1DomSet2 = 0;
@@ -201,36 +208,45 @@ public class MannWhitneyU {
 
     /**
      * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates
-     * the other U-set
+     * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1.
      * @param observed - the observed data points, tagged by each set
      * @param dominator - the set that is hypothesized to be stochastically dominating
      * @return the u-statistic associated with the hypothesis
      */
     public static long calculateOneSidedU(TreeSet<Pair<Number,USet>> observed,USet dominator) {
-        long domninatorBeforeOther = 0l;
-        int domSeenSoFar = 0;
+        long otherBeforeDominator = 0l;
+        int otherSeenSoFar = 0;
         for ( Pair<Number,USet> dataPoint : observed ) {
-            if ( dataPoint.second == dominator ) {
-                ++domSeenSoFar;
+            if ( dataPoint.second != dominator ) {
+                ++otherSeenSoFar;
             } else {
-                domninatorBeforeOther += domSeenSoFar;
+                otherBeforeDominator += otherSeenSoFar;
             }
         }
 
-        return domninatorBeforeOther;
+        return otherBeforeDominator;
     }
 
     /**
      * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
      * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
      * calls into that recursive calculation.
-     * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
+     * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
      * @param m: number of set-two entries
      * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
      * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
      */
     public static double calculatePRecursively(int n, int m, long u) {
-        if ( m > 7 && n > 4 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
+        if ( m > 8 && n > 5 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
+        return cpr(n,m,u);
+    }
+
+    /**
+     * Hook into CPR with sufficient warning (for testing purposes)
+     * @deprecated - for testing only (really)
+     */
+    @Deprecated
+    public static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) {
         return cpr(n,m,u);
     }
 
@@ -264,6 +280,15 @@ public class MannWhitneyU {
         return observations;
     }
 
+    /**
+     * hook into the set sizes, for testing purposes only
+     * @return size set 1, size set 2
+     * @deprecated - only for testing
+     */
+    public Pair<Integer,Integer> getSetSizes() {
+        return new Pair<Integer,Integer>(sizeSet1,sizeSet2);
+    }
+
     /**
      * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
      * and to ensure that rank ties are broken at random.
diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
index a9483a291..74b81643a 100755
--- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
+++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
@@ -113,8 +113,49 @@ public class RegionalAssociationUnitTest extends BaseTest {
         mwu.add(9,MannWhitneyU.USet.SET1);
         mwu.add(10,MannWhitneyU.USet.SET1);
         mwu.add(11,MannWhitneyU.USet.SET2);
-        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l);
-        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
+
+        MannWhitneyU mwu2 = new MannWhitneyU();
+        for ( int dp : new int[]{2,4,5,6,8} ) {
+            mwu2.add(dp,MannWhitneyU.USet.SET1);
+        }
+
+        for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
+            mwu2.add(dp,MannWhitneyU.USet.SET2);
+        }
+
+        // tests using the hypothesis that set 2 dominates set 1 (U value = 10)
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
+        Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
+        Pair<Integer,Integer> sizes = mwu2.getSetSizes();
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
+        Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
+
+        // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
+
+        logger.warn("Set 1");
+        Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
+        logger.warn("Set 2");
+        Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
+
+        MannWhitneyU mwu3 = new MannWhitneyU();
+        for ( int dp : new int[]{0,2,4} ) {
+            mwu3.add(dp,MannWhitneyU.USet.SET1);
+        }
+        for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
+            mwu3.add(dp,MannWhitneyU.USet.SET2);
+        }
+        long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
+        Pair<Integer,Integer> nums = mwu3.getSetSizes();
+        Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
+        Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
 
     }