From 328f89f66ace0a43409a6edd6616dcfa65024ba9 Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 30 Mar 2011 17:03:02 +0000 Subject: [PATCH] Minor changes to MannWhitneyU: - Comment fixes to better explain why two-sided test wants to use the LOWER (not higher) value for U - Much more direct testing of MWU functions - Uniform approximation was always using the < cumulant (sometimes the > cumulant should be used instead) - Uniform approximation currently not used (regime in which it was being used was not the right one -- not necessarily bad, but not an improvement over normal) + this particular approximation is for major imbalances of the form m >> n. Code may be altered in the future to use this method for this particular regime, if the method's not too slow. - Hook into one-sided test. RegionalAssociationRecalibrator: NaNs were being caused by presence of Infinity and -Infinity values out of the walker. Currently I'm just re-setting them to arbitrary post-whitened values, but the walker will be changed to prevent output of these values, and the "fix" will undone. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5539 348d0f76-0448-11de-a6fe-93d51630548a --- .../RegionalAssociationRecalibrator.java | 11 +-- .../sting/utils/MannWhitneyU.java | 87 ++++++++++++------- .../walkers/RegionalAssociationUnitTest.java | 45 +++++++++- 3 files changed, 102 insertions(+), 41 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java index d7931a0f0..56acdbe77 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java @@ -207,6 +207,9 @@ public class RegionalAssociationRecalibrator extends RodWalker 8 && m > 8 ) { // large m and n - normal approx pval = calculatePNormalApproximation(n,m,u); - } else if ( n > 4 && m > 7 ) { + } else if ( n > 5 && m > 7 ) { // large m, small n - sum uniform approx - pval = calculatePUniformApproximation(n,m,u); + // todo -- find the appropriate regimes where this approximation is actually better + // pval = calculatePUniformApproximation(n,m,u); + pval = calculatePNormalApproximation(n,m,u); } else if ( n > 8 || m > 8 ) { pval = calculatePFromTable(n,m,u); } else { - // small m [possibly small n] - full approx + // small m and n [possibly small n] - full approx pval = calculatePRecursively(n,m,u); } @@ -99,15 +101,14 @@ public class MannWhitneyU { public static double calculatePFromTable(int n, int m, long u) { // todo -- actually use a table for: - // todo - n small, m large // todo - n large, m small return calculatePUniformApproximation(n,m,u); } /** * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] - * @param n - The number of entries in the DOMINATED set - * @param m - The number of entries in the DOMINANT set + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set * @param u - the Mann-Whitney U value * @return p-value associated with the normal approximation */ @@ -118,8 +119,8 @@ public class MannWhitneyU { /** * Calculates the Z-score approximation of the u-statistic - * @param n - The number of entries in the DOMINATED set - * @param m - The number of entries in the DOMINANT set + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set * @param u - the Mann-Whitney U value * @return z-score associated with the normal approximation */ @@ -132,21 +133,25 @@ public class MannWhitneyU { /** * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate - * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF) - * @param n - - * @param m - - * @param u - - * @return + * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF) + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set + * @param u - mann-whitney u value + * @return p-value according to sum of uniform approx */ public static double calculatePUniformApproximation(int n, int m, long u) { long R = u + (n*(n+1))/2; double a = Math.sqrt(m*(n+m+1)); double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); - double z = b + R/a; - if ( z < 0 ) { return 0.0; } - else if ( z > n ) { return 1.0; } + double z = b + ((double)R)/a; + if ( z < 0 ) { return 1.0; } + else if ( z > n ) { return 0.0; } else { - return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + if ( z > ((double) n) /2 ) { + return 1.0-1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } else { + return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } } } @@ -167,11 +172,13 @@ public class MannWhitneyU { /** * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for - * later on simply by multiplying the p-value by 2 - * @param observed + * later on simply by multiplying the p-value by 2. + * + * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen + * @param observed - the observed data * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) */ - public static Pair calculateTwoSidedU(TreeSet> observed ) { + public static Pair calculateTwoSidedU(TreeSet> observed) { int set1SeenSoFar = 0; int set2SeenSoFar = 0; long uSet1DomSet2 = 0; @@ -201,36 +208,45 @@ public class MannWhitneyU { /** * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates - * the other U-set + * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1. * @param observed - the observed data points, tagged by each set * @param dominator - the set that is hypothesized to be stochastically dominating * @return the u-statistic associated with the hypothesis */ public static long calculateOneSidedU(TreeSet> observed,USet dominator) { - long domninatorBeforeOther = 0l; - int domSeenSoFar = 0; + long otherBeforeDominator = 0l; + int otherSeenSoFar = 0; for ( Pair dataPoint : observed ) { - if ( dataPoint.second == dominator ) { - ++domSeenSoFar; + if ( dataPoint.second != dominator ) { + ++otherSeenSoFar; } else { - domninatorBeforeOther += domSeenSoFar; + otherBeforeDominator += otherSeenSoFar; } } - return domninatorBeforeOther; + return otherBeforeDominator; } /** * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor * calls into that recursive calculation. - * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two) + * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) * @param m: number of set-two entries * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. */ public static double calculatePRecursively(int n, int m, long u) { - if ( m > 7 && n > 4 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); } + if ( m > 8 && n > 5 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); } + return cpr(n,m,u); + } + + /** + * Hook into CPR with sufficient warning (for testing purposes) + * @deprecated - for testing only (really) + */ + @Deprecated + public static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) { return cpr(n,m,u); } @@ -264,6 +280,15 @@ public class MannWhitneyU { return observations; } + /** + * hook into the set sizes, for testing purposes only + * @return size set 1, size set 2 + * @deprecated - only for testing + */ + public Pair getSetSizes() { + return new Pair(sizeSet1,sizeSet2); + } + /** * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java index a9483a291..74b81643a 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java @@ -113,8 +113,49 @@ public class RegionalAssociationUnitTest extends BaseTest { mwu.add(9,MannWhitneyU.USet.SET1); mwu.add(10,MannWhitneyU.USet.SET1); mwu.add(11,MannWhitneyU.USet.SET2); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l); + + MannWhitneyU mwu2 = new MannWhitneyU(); + for ( int dp : new int[]{2,4,5,6,8} ) { + mwu2.add(dp,MannWhitneyU.USet.SET1); + } + + for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { + mwu2.add(dp,MannWhitneyU.USet.SET2); + } + + // tests using the hypothesis that set 2 dominates set 1 (U value = 10) + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l); + Pair sizes = mwu2.getSetSizes(); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14); + Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); + + // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above + + logger.warn("Set 1"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); + logger.warn("Set 2"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); + + MannWhitneyU mwu3 = new MannWhitneyU(); + for ( int dp : new int[]{0,2,4} ) { + mwu3.add(dp,MannWhitneyU.USet.SET1); + } + for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { + mwu3.add(dp,MannWhitneyU.USet.SET2); + } + long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); + Pair nums = mwu3.getSetSizes(); + Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); }