Fixed an edge case where an exception was thrown if either of the sets was empty for the MWU test. Also altered the output format so U itself is not printed (which though interesting, isn't so useful for recalibration), but rather a value I call V (really the deviation of U from its expectation).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5490 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-03-22 16:28:44 +00:00
parent 748787c509
commit 5a79f16ea4
3 changed files with 28 additions and 12 deletions

View File

@ -134,14 +134,16 @@ public class AssociationTestRunner {
}
public static String runU(UStatistic context) {
Pair<Long,Double> results = mannWhitneyUTest(context);
return String.format("U: %d\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second)));
// note: u statistic (U) is relatively useless for recalibrating outside of the context of m and n
// thus we report V = (U - (m*n+1)/2)/(n*m*(n+m+1)/12)
Pair<Double,Double> results = mannWhitneyUTest(context);
return String.format("V: %.2f\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second)));
}
public static Pair<Long,Double> mannWhitneyUTest(UStatistic context) {
public static Pair<Double,Double> mannWhitneyUTest(UStatistic context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Long,Double>(-1l,Double.NaN);
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
MannWhitneyU mwu = new MannWhitneyU();
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) {

View File

@ -45,13 +45,13 @@ public class MannWhitneyU {
* returns the u and p values.
* @Returns a pair holding the u and p-value.
*/
public Pair<Long,Double> runTwoSidedTest() {
public Pair<Double,Double> runTwoSidedTest() {
Pair<Long,USet> uPair = calculateTwoSidedU(observations);
long u = uPair.first;
int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2;
int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1;
double pval = calculateP(n,m,u,true);
return new Pair<Long,Double>(u,pval);
return new Pair<Double,Double>(getZApprox(n,m,u),pval);
}
/**
@ -65,7 +65,9 @@ public class MannWhitneyU {
*/
public static double calculateP(int n, int m, long u, boolean twoSided) {
double pval;
if ( n > 8 && m > 8 ) {
if ( m == 0 || n == 0 ) {
pval = 1.0;
} else if ( n > 8 && m > 8 ) {
// large m and n - normal approx
pval = calculatePNormalApproximation(n,m,u);
} else if ( n > 4 && m > 7 ) {
@ -96,10 +98,22 @@ public class MannWhitneyU {
* @return p-value associated with the normal approximation
*/
public static double calculatePNormalApproximation(int n,int m,long u) {
double z = getZApprox(n,m,u);
return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z);
}
/**
* Calculates the Z-score approximation of the u-statistic
* @param n - The number of entries in the DOMINATED set
* @param m - The number of entries in the DOMINANT set
* @param u - the Mann-Whitney U value
* @return z-score associated with the normal approximation
*/
private static double getZApprox(int n, int m, long u) {
double mean = ( ((long)m)*n+1.0)/2;
double var = (((long) n)*m*(n+m+1.0))/12;
double z = ( u - mean )/Math.sqrt(var);
return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z);
return z;
}
/**

View File

@ -76,22 +76,22 @@ public class RegionalAssociationUnitTest extends BaseTest {
UTest test1 = new UTest();
test1.setCaseData((Collection) Arrays.asList(2,4,5,6,8));
test1.setControlData((Collection) Arrays.asList(1,3,7,9,10,11,12,13));
Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test1).first,10l);
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test1).first,-1.537,1e-4);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.092292,5e-2); // z-approximation, off by about 0.05
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.044444,1e-3); // recursive calculation
UTest test2 = new UTest();
test2.setCaseData((Collection) Arrays.asList(1,7,8,9,10,11,15,18));
test2.setControlData((Collection) Arrays.asList(2,3,4,5,6,12,13,14,16,17));
Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test2).first,37l);
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test2).first,-0.3109831608,1e-10);
UTest test3 = new UTest();
test3.setCaseData((Collection)Arrays.asList(13,14,7,18,5,2,9,17,8,10,3,15,19,6,20,16,11,4,12,1));
test3.setControlData((Collection) Arrays.asList(29,21,14,10,12,11,28,19,18,13,7,27,20,5,17,16,9,23,22,26));
Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test3).first,93l);
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test3).first,-2.907884571802469,1e-14);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test3).second,2*0.00302,1e-3);
UTest test4 = new UTest();
test4.setCaseData((Collection) Arrays.asList(1,2,4,5,6,9));
test4.setControlData((Collection) Arrays.asList(3,8,11,12,13));
Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test4).first,5l);
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test4).first,-1.9170289512680814,1e-14);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test4).second,0.0303,1e-4);
}