Fixed an edge case where an exception was thrown if either of the sets was empty for the MWU test. Also altered the output format so U itself is not printed (which though interesting, isn't so useful for recalibration), but rather a value I call V (really the deviation of U from its expectation).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5490 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-22 16:28:44 +00:00 · 2011-03-22 16:28:44 +00:00 · 5a79f16ea4
parent 748787c509
commit 5a79f16ea4
3 changed files with 28 additions and 12 deletions
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java
@ -134,14 +134,16 @@ public class AssociationTestRunner {
    }

    public static String runU(UStatistic context) {
-        Pair<Long,Double> results = mannWhitneyUTest(context);
-        return String.format("U: %d\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second)));
+        // note: u statistic (U) is relatively useless for recalibrating outside of the context of m and n
+        // thus we report V = (U - (m*n+1)/2)/(n*m*(n+m+1)/12)
+        Pair<Double,Double> results = mannWhitneyUTest(context);
+        return String.format("V: %.2f\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second)));
    }

-    public static Pair<Long,Double> mannWhitneyUTest(UStatistic context) {
+    public static Pair<Double,Double> mannWhitneyUTest(UStatistic context) {
        Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
        if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
-            return new Pair<Long,Double>(-1l,Double.NaN);
+            return new Pair<Double,Double>(Double.NaN,Double.NaN);
        }
        MannWhitneyU mwu = new MannWhitneyU();
        for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) {
--- a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
+++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java
@ -45,13 +45,13 @@ public class MannWhitneyU {
     * returns the u and p values.
     * @Returns a pair holding the u and p-value.
     */
-    public Pair<Long,Double> runTwoSidedTest() {
+    public Pair<Double,Double> runTwoSidedTest() {
        Pair<Long,USet> uPair = calculateTwoSidedU(observations);
        long u = uPair.first;
        int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2;
        int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1;
        double pval = calculateP(n,m,u,true);
-        return new Pair<Long,Double>(u,pval);
+        return new Pair<Double,Double>(getZApprox(n,m,u),pval);
    }

    /**
@ -65,7 +65,9 @@ public class MannWhitneyU {
     */
    public static double calculateP(int n, int m, long u, boolean twoSided) {
        double pval;
-        if ( n > 8 && m > 8 ) {
+        if ( m == 0 || n == 0 ) {
+            pval = 1.0;
+        } else if ( n > 8 && m > 8 ) {
            // large m and n - normal approx
            pval = calculatePNormalApproximation(n,m,u);
        } else if ( n > 4 && m > 7 ) {
@ -96,10 +98,22 @@ public class MannWhitneyU {
     * @return p-value associated with the normal approximation
     */
    public static double calculatePNormalApproximation(int n,int m,long u) {
+        double z = getZApprox(n,m,u);
+        return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z);
+    }
+
+    /**
+     * Calculates the Z-score approximation of the u-statistic
+     * @param n - The number of entries in the DOMINATED set
+     * @param m - The number of entries in the DOMINANT set
+     * @param u - the Mann-Whitney U value
+     * @return z-score associated with the normal approximation
+     */
+    private static double getZApprox(int n, int m, long u) {
        double mean = ( ((long)m)*n+1.0)/2;
        double var = (((long) n)*m*(n+m+1.0))/12;
        double z = ( u - mean )/Math.sqrt(var);
-        return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z);
+        return z;
    }

    /**
--- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
+++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java
@ -76,22 +76,22 @@ public class RegionalAssociationUnitTest extends BaseTest {
        UTest test1 = new UTest();
        test1.setCaseData((Collection) Arrays.asList(2,4,5,6,8));
        test1.setControlData((Collection) Arrays.asList(1,3,7,9,10,11,12,13));
-        Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test1).first,10l);
+        Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test1).first,-1.537,1e-4);
        Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.092292,5e-2); // z-approximation, off by about 0.05
        Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.044444,1e-3); // recursive calculation
        UTest test2 = new UTest();
        test2.setCaseData((Collection) Arrays.asList(1,7,8,9,10,11,15,18));
        test2.setControlData((Collection) Arrays.asList(2,3,4,5,6,12,13,14,16,17));
-        Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test2).first,37l);
+        Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test2).first,-0.3109831608,1e-10);
        UTest test3 = new UTest();
        test3.setCaseData((Collection)Arrays.asList(13,14,7,18,5,2,9,17,8,10,3,15,19,6,20,16,11,4,12,1));
        test3.setControlData((Collection) Arrays.asList(29,21,14,10,12,11,28,19,18,13,7,27,20,5,17,16,9,23,22,26));
-        Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test3).first,93l);
+        Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test3).first,-2.907884571802469,1e-14);
        Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test3).second,2*0.00302,1e-3);
        UTest test4 = new UTest();
        test4.setCaseData((Collection) Arrays.asList(1,2,4,5,6,9));
        test4.setControlData((Collection) Arrays.asList(3,8,11,12,13));
-        Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test4).first,5l);
+        Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test4).first,-1.9170289512680814,1e-14);
        Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test4).second,0.0303,1e-4);

    }