Minor changes to MannWhitneyU:
- Comment fixes to better explain why two-sided test wants to use the LOWER (not higher) value for U
- Much more direct testing of MWU functions
- Uniform approximation was always using the < cumulant (sometimes the > cumulant should be used instead)
- Uniform approximation currently not used (regime in which it was being used was not the right one -- not necessarily bad, but not an improvement over normal)
+ this particular approximation is for major imbalances of the form m >> n. Code may be altered in the future to use this method for this particular regime, if the method's not too slow.
- Hook into one-sided test.
RegionalAssociationRecalibrator: NaNs were being caused by presence of Infinity and -Infinity values out of the walker. Currently I'm just re-setting them to arbitrary post-whitened values, but the walker will be changed to prevent output of these values, and the "fix" will undone.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5539 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fff11a3279
commit
328f89f66a
|
|
@ -207,6 +207,9 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
|||
if ( Double.compare(v,Double.NaN) == 0 || Double.compare(v1,Double.NaN) == 0 ) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if ( Double.compare(v,Double.POSITIVE_INFINITY) == 0 ) { return 50.0; } // todo -- fixme
|
||||
if ( Double.compare(v,Double.NEGATIVE_INFINITY) == 0 ) { return -50.0; } // todo -- fixme, really should be an exception
|
||||
return v - v1;
|
||||
}
|
||||
};
|
||||
|
|
@ -244,14 +247,6 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
|||
|
||||
public double[] whiten(double[] value) {
|
||||
DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
|
||||
double[] h = ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
|
||||
for ( double d : h ) {
|
||||
if ( d == Double.NaN ) {
|
||||
logger.debug(vec.toString());
|
||||
logger.debug(means.toString());
|
||||
}
|
||||
}
|
||||
|
||||
return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,8 +70,8 @@ public class MannWhitneyU {
|
|||
|
||||
/**
|
||||
* Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
|
||||
* @param n - The number of entries in the DOMINATED set
|
||||
* @param m - The number of entries in the DOMINANT set
|
||||
* @param n - The number of entries in the stochastically smaller (dominant) set
|
||||
* @param m - The number of entries in the stochastically larger (dominated) set
|
||||
* @param u - the Mann-Whitney U value
|
||||
* @param twoSided - is the test twosided
|
||||
* @return the (possibly approximate) p-value associated with the MWU test
|
||||
|
|
@ -84,13 +84,15 @@ public class MannWhitneyU {
|
|||
} else if ( n > 8 && m > 8 ) {
|
||||
// large m and n - normal approx
|
||||
pval = calculatePNormalApproximation(n,m,u);
|
||||
} else if ( n > 4 && m > 7 ) {
|
||||
} else if ( n > 5 && m > 7 ) {
|
||||
// large m, small n - sum uniform approx
|
||||
pval = calculatePUniformApproximation(n,m,u);
|
||||
// todo -- find the appropriate regimes where this approximation is actually better
|
||||
// pval = calculatePUniformApproximation(n,m,u);
|
||||
pval = calculatePNormalApproximation(n,m,u);
|
||||
} else if ( n > 8 || m > 8 ) {
|
||||
pval = calculatePFromTable(n,m,u);
|
||||
} else {
|
||||
// small m [possibly small n] - full approx
|
||||
// small m and n [possibly small n] - full approx
|
||||
pval = calculatePRecursively(n,m,u);
|
||||
}
|
||||
|
||||
|
|
@ -99,15 +101,14 @@ public class MannWhitneyU {
|
|||
|
||||
public static double calculatePFromTable(int n, int m, long u) {
|
||||
// todo -- actually use a table for:
|
||||
// todo - n small, m large
|
||||
// todo - n large, m small
|
||||
return calculatePUniformApproximation(n,m,u);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
|
||||
* @param n - The number of entries in the DOMINATED set
|
||||
* @param m - The number of entries in the DOMINANT set
|
||||
* @param n - The number of entries in the stochastically smaller (dominant) set
|
||||
* @param m - The number of entries in the stochastically larger (dominated) set
|
||||
* @param u - the Mann-Whitney U value
|
||||
* @return p-value associated with the normal approximation
|
||||
*/
|
||||
|
|
@ -118,8 +119,8 @@ public class MannWhitneyU {
|
|||
|
||||
/**
|
||||
* Calculates the Z-score approximation of the u-statistic
|
||||
* @param n - The number of entries in the DOMINATED set
|
||||
* @param m - The number of entries in the DOMINANT set
|
||||
* @param n - The number of entries in the stochastically smaller (dominant) set
|
||||
* @param m - The number of entries in the stochastically larger (dominated) set
|
||||
* @param u - the Mann-Whitney U value
|
||||
* @return z-score associated with the normal approximation
|
||||
*/
|
||||
|
|
@ -132,21 +133,25 @@ public class MannWhitneyU {
|
|||
|
||||
/**
|
||||
* Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
|
||||
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
|
||||
* @param n -
|
||||
* @param m -
|
||||
* @param u -
|
||||
* @return
|
||||
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF)
|
||||
* @param n - The number of entries in the stochastically smaller (dominant) set
|
||||
* @param m - The number of entries in the stochastically larger (dominated) set
|
||||
* @param u - mann-whitney u value
|
||||
* @return p-value according to sum of uniform approx
|
||||
*/
|
||||
public static double calculatePUniformApproximation(int n, int m, long u) {
|
||||
long R = u + (n*(n+1))/2;
|
||||
double a = Math.sqrt(m*(n+m+1));
|
||||
double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
|
||||
double z = b + R/a;
|
||||
if ( z < 0 ) { return 0.0; }
|
||||
else if ( z > n ) { return 1.0; }
|
||||
double z = b + ((double)R)/a;
|
||||
if ( z < 0 ) { return 1.0; }
|
||||
else if ( z > n ) { return 0.0; }
|
||||
else {
|
||||
return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
|
||||
if ( z > ((double) n) /2 ) {
|
||||
return 1.0-1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
|
||||
} else {
|
||||
return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -167,11 +172,13 @@ public class MannWhitneyU {
|
|||
/**
|
||||
* Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
|
||||
* stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
|
||||
* later on simply by multiplying the p-value by 2
|
||||
* @param observed
|
||||
* later on simply by multiplying the p-value by 2.
|
||||
*
|
||||
* Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen
|
||||
* @param observed - the observed data
|
||||
* @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
|
||||
*/
|
||||
public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
|
||||
public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed) {
|
||||
int set1SeenSoFar = 0;
|
||||
int set2SeenSoFar = 0;
|
||||
long uSet1DomSet2 = 0;
|
||||
|
|
@ -201,36 +208,45 @@ public class MannWhitneyU {
|
|||
|
||||
/**
|
||||
* Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates
|
||||
* the other U-set
|
||||
* the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1.
|
||||
* @param observed - the observed data points, tagged by each set
|
||||
* @param dominator - the set that is hypothesized to be stochastically dominating
|
||||
* @return the u-statistic associated with the hypothesis
|
||||
*/
|
||||
public static long calculateOneSidedU(TreeSet<Pair<Number,USet>> observed,USet dominator) {
|
||||
long domninatorBeforeOther = 0l;
|
||||
int domSeenSoFar = 0;
|
||||
long otherBeforeDominator = 0l;
|
||||
int otherSeenSoFar = 0;
|
||||
for ( Pair<Number,USet> dataPoint : observed ) {
|
||||
if ( dataPoint.second == dominator ) {
|
||||
++domSeenSoFar;
|
||||
if ( dataPoint.second != dominator ) {
|
||||
++otherSeenSoFar;
|
||||
} else {
|
||||
domninatorBeforeOther += domSeenSoFar;
|
||||
otherBeforeDominator += otherSeenSoFar;
|
||||
}
|
||||
}
|
||||
|
||||
return domninatorBeforeOther;
|
||||
return otherBeforeDominator;
|
||||
}
|
||||
|
||||
/**
|
||||
* The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
|
||||
* binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
|
||||
* calls into that recursive calculation.
|
||||
* @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
|
||||
* @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
|
||||
* @param m: number of set-two entries
|
||||
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
|
||||
* @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
|
||||
*/
|
||||
public static double calculatePRecursively(int n, int m, long u) {
|
||||
if ( m > 7 && n > 4 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
|
||||
if ( m > 8 && n > 5 ) { throw new StingException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
|
||||
return cpr(n,m,u);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hook into CPR with sufficient warning (for testing purposes)
|
||||
* @deprecated - for testing only (really)
|
||||
*/
|
||||
@Deprecated
|
||||
public static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) {
|
||||
return cpr(n,m,u);
|
||||
}
|
||||
|
||||
|
|
@ -264,6 +280,15 @@ public class MannWhitneyU {
|
|||
return observations;
|
||||
}
|
||||
|
||||
/**
|
||||
* hook into the set sizes, for testing purposes only
|
||||
* @return size set 1, size set 2
|
||||
* @deprecated - only for testing
|
||||
*/
|
||||
public Pair<Integer,Integer> getSetSizes() {
|
||||
return new Pair<Integer,Integer>(sizeSet1,sizeSet2);
|
||||
}
|
||||
|
||||
/**
|
||||
* A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
|
||||
* and to ensure that rank ties are broken at random.
|
||||
|
|
|
|||
|
|
@ -113,8 +113,49 @@ public class RegionalAssociationUnitTest extends BaseTest {
|
|||
mwu.add(9,MannWhitneyU.USet.SET1);
|
||||
mwu.add(10,MannWhitneyU.USet.SET1);
|
||||
mwu.add(11,MannWhitneyU.USet.SET2);
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l);
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l);
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
|
||||
|
||||
MannWhitneyU mwu2 = new MannWhitneyU();
|
||||
for ( int dp : new int[]{2,4,5,6,8} ) {
|
||||
mwu2.add(dp,MannWhitneyU.USet.SET1);
|
||||
}
|
||||
|
||||
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
|
||||
mwu2.add(dp,MannWhitneyU.USet.SET2);
|
||||
}
|
||||
|
||||
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
|
||||
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
|
||||
Pair<Integer,Integer> sizes = mwu2.getSetSizes();
|
||||
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
|
||||
Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
|
||||
|
||||
// tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
|
||||
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
|
||||
|
||||
logger.warn("Set 1");
|
||||
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
|
||||
logger.warn("Set 2");
|
||||
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
|
||||
|
||||
MannWhitneyU mwu3 = new MannWhitneyU();
|
||||
for ( int dp : new int[]{0,2,4} ) {
|
||||
mwu3.add(dp,MannWhitneyU.USet.SET1);
|
||||
}
|
||||
for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
|
||||
mwu3.add(dp,MannWhitneyU.USet.SET2);
|
||||
}
|
||||
long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
|
||||
Pair<Integer,Integer> nums = mwu3.getSetSizes();
|
||||
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
|
||||
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue