Minor changes to MannWhitneyU:

- Comment fixes to better explain why two-sided test wants to use the LOWER (not higher) value for U
 - Much more direct testing of MWU functions
 - Uniform approximation was always using the < cumulant (sometimes the > cumulant should be used instead)
 - Uniform approximation currently not used (regime in which it was being used was not the right one -- not necessarily bad, but not an improvement over normal)
    + this particular approximation is for major imbalances of the form m >> n. Code may be altered in the future to use this method for this particular regime, if the method's not too slow.
 - Hook into one-sided test.

RegionalAssociationRecalibrator: NaNs were being caused by presence of Infinity and -Infinity values out of the walker. Currently I'm just re-setting them to arbitrary post-whitened values, but the walker will be changed to prevent output of these values, and the "fix" will undone.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5539 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-03-30 17:03:02 +00:00
parent fff11a3279
commit 328f89f66a
3 changed files with 102 additions and 41 deletions

View File

@ -207,6 +207,9 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
if ( Double.compare(v,Double.NaN) == 0 || Double.compare(v1,Double.NaN) == 0 ) {
return 0.0;
}
if ( Double.compare(v,Double.POSITIVE_INFINITY) == 0 ) { return 50.0; } // todo -- fixme
if ( Double.compare(v,Double.NEGATIVE_INFINITY) == 0 ) { return -50.0; } // todo -- fixme, really should be an exception
return v - v1;
}
};
@ -244,14 +247,6 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
public double[] whiten(double[] value) {
DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
double[] h = ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
for ( double d : h ) {
if ( d == Double.NaN ) {
logger.debug(vec.toString());
logger.debug(means.toString());
}
}
return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
}
}

View File

@ -70,8 +70,8 @@ public class MannWhitneyU {
/**
* Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
* @param n - The number of entries in the DOMINATED set
* @param m - The number of entries in the DOMINANT set
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @param twoSided - is the test twosided
* @return the (possibly approximate) p-value associated with the MWU test
@ -84,13 +84,15 @@ public class MannWhitneyU {
} else if ( n > 8 && m > 8 ) {
// large m and n - normal approx
pval = calculatePNormalApproximation(n,m,u);
} else if ( n > 4 && m > 7 ) {
} else if ( n > 5 && m > 7 ) {
// large m, small n - sum uniform approx
pval = calculatePUniformApproximation(n,m,u);
// todo -- find the appropriate regimes where this approximation is actually better
// pval = calculatePUniformApproximation(n,m,u);
pval = calculatePNormalApproximation(n,m,u);
} else if ( n > 8 || m > 8 ) {
pval = calculatePFromTable(n,m,u);
} else {
// small m [possibly small n] - full approx
// small m and n [possibly small n] - full approx
pval = calculatePRecursively(n,m,u);
}
@ -99,15 +101,14 @@ public class MannWhitneyU {
public static double calculatePFromTable(int n, int m, long u) {
// todo -- actually use a table for:
// todo - n small, m large
// todo - n large, m small
return calculatePUniformApproximation(n,m,u);
}
/**
* Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
* @param n - The number of entries in the DOMINATED set
* @param m - The number of entries in the DOMINANT set
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @return p-value associated with the normal approximation
*/
@ -118,8 +119,8 @@ public class MannWhitneyU {
/**
* Calculates the Z-score approximation of the u-statistic
* @param n - The number of entries in the DOMINATED set
* @param m - The number of entries in the DOMINANT set
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @return z-score associated with the normal approximation
*/
@ -132,21 +133,25 @@ public class MannWhitneyU {
/**
* Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
* @param n -
* @param m -
* @param u -
* @return
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF)
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - mann-whitney u value
* @return p-value according to sum of uniform approx
*/
public static double calculatePUniformApproximation(int n, int m, long u) {
long R = u + (n*(n+1))/2;
double a = Math.sqrt(m*(n+m+1));
double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
double z = b + R/a;
if ( z < 0 ) { return 0.0; }
else if ( z > n ) { return 1.0; }
double z = b + ((double)R)/a;
if ( z < 0 ) { return 1.0; }
else if ( z > n ) { return 0.0; }
else {
return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
if ( z > ((double) n) /2 ) {
return 1.0-1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
} else {
return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
}
}
}
@ -167,11 +172,13 @@ public class MannWhitneyU {
/**
* Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
* stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
* later on simply by multiplying the p-value by 2
* @param observed
* later on simply by multiplying the p-value by 2.
*
* Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen
* @param observed - the observed data
* @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
*/
public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed) {
int set1SeenSoFar = 0;
int set2SeenSoFar = 0;
long uSet1DomSet2 = 0;
@ -201,36 +208,45 @@ public class MannWhitneyU {
/**
* Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates
* the other U-set
* the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1.
* @param observed - the observed data points, tagged by each set
* @param dominator - the set that is hypothesized to be stochastically dominating
* @return the u-statistic associated with the hypothesis
*/
public static long calculateOneSidedU(TreeSet<Pair<Number,USet>> observed,USet dominator) {
long domninatorBeforeOther = 0l;
int domSeenSoFar = 0;
long otherBeforeDominator = 0l;
int otherSeenSoFar = 0;
for ( Pair<Number,USet> dataPoint : observed ) {
if ( dataPoint.second == dominator ) {
++domSeenSoFar;
if ( dataPoint.second != dominator ) {
++otherSeenSoFar;
} else {
domninatorBeforeOther += domSeenSoFar;
otherBeforeDominator += otherSeenSoFar;
}
}
return domninatorBeforeOther;
return otherBeforeDominator;
}
/**
* The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
* binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
* calls into that recursive calculation.
* @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
* @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
* @param m: number of set-two entries
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
* @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
*/
public static double calculatePRecursively(int n, int m, long u) {
if ( m > 7 && n > 4 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
if ( m > 8 && n > 5 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
return cpr(n,m,u);
}
/**
* Hook into CPR with sufficient warning (for testing purposes)
* @deprecated - for testing only (really)
*/
@Deprecated
public static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) {
return cpr(n,m,u);
}
@ -264,6 +280,15 @@ public class MannWhitneyU {
return observations;
}
/**
* hook into the set sizes, for testing purposes only
* @return size set 1, size set 2
* @deprecated - only for testing
*/
public Pair<Integer,Integer> getSetSizes() {
return new Pair<Integer,Integer>(sizeSet1,sizeSet2);
}
/**
* A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
* and to ensure that rank ties are broken at random.

View File

@ -113,8 +113,49 @@ public class RegionalAssociationUnitTest extends BaseTest {
mwu.add(9,MannWhitneyU.USet.SET1);
mwu.add(10,MannWhitneyU.USet.SET1);
mwu.add(11,MannWhitneyU.USet.SET2);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
MannWhitneyU mwu2 = new MannWhitneyU();
for ( int dp : new int[]{2,4,5,6,8} ) {
mwu2.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
mwu2.add(dp,MannWhitneyU.USet.SET2);
}
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
Pair<Integer,Integer> sizes = mwu2.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
// tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
logger.warn("Set 1");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
logger.warn("Set 2");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
MannWhitneyU mwu3 = new MannWhitneyU();
for ( int dp : new int[]{0,2,4} ) {
mwu3.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
mwu3.add(dp,MannWhitneyU.USet.SET2);
}
long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
Pair<Integer,Integer> nums = mwu3.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
}