From 5a79f16ea42ac4c023ddc032781d33f97dc17e4b Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 22 Mar 2011 16:28:44 +0000 Subject: [PATCH] Fixed an edge case where an exception was thrown if either of the sets was empty for the MWU test. Also altered the output format so U itself is not printed (which though interesting, isn't so useful for recalibration), but rather a value I call V (really the deviation of U from its expectation). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5490 348d0f76-0448-11de-a6fe-93d51630548a --- .../association/AssociationTestRunner.java | 10 +++++---- .../sting/utils/MannWhitneyU.java | 22 +++++++++++++++---- .../walkers/RegionalAssociationUnitTest.java | 8 +++---- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java index 50ea8b4e3..72e261a43 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java @@ -134,14 +134,16 @@ public class AssociationTestRunner { } public static String runU(UStatistic context) { - Pair results = mannWhitneyUTest(context); - return String.format("U: %d\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second))); + // note: u statistic (U) is relatively useless for recalibrating outside of the context of m and n + // thus we report V = (U - (m*n+1)/2)/(n*m*(n+m+1)/12) + Pair results = mannWhitneyUTest(context); + return String.format("V: %.2f\tP: %.2e\tQ: %d",results.first,results.second,(int)Math.floor(QualityUtils.phredScaleErrorRate(results.second))); } - public static Pair mannWhitneyUTest(UStatistic context) { + public static Pair mannWhitneyUTest(UStatistic context) { Map> caseControlVectors = context.getCaseControl(); if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) { - return new Pair(-1l,Double.NaN); + return new Pair(Double.NaN,Double.NaN); } MannWhitneyU mwu = new MannWhitneyU(); for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) { diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index 508cb8a83..d2575c6cc 100755 --- a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -45,13 +45,13 @@ public class MannWhitneyU { * returns the u and p values. * @Returns a pair holding the u and p-value. */ - public Pair runTwoSidedTest() { + public Pair runTwoSidedTest() { Pair uPair = calculateTwoSidedU(observations); long u = uPair.first; int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; double pval = calculateP(n,m,u,true); - return new Pair(u,pval); + return new Pair(getZApprox(n,m,u),pval); } /** @@ -65,7 +65,9 @@ public class MannWhitneyU { */ public static double calculateP(int n, int m, long u, boolean twoSided) { double pval; - if ( n > 8 && m > 8 ) { + if ( m == 0 || n == 0 ) { + pval = 1.0; + } else if ( n > 8 && m > 8 ) { // large m and n - normal approx pval = calculatePNormalApproximation(n,m,u); } else if ( n > 4 && m > 7 ) { @@ -96,10 +98,22 @@ public class MannWhitneyU { * @return p-value associated with the normal approximation */ public static double calculatePNormalApproximation(int n,int m,long u) { + double z = getZApprox(n,m,u); + return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z); + } + + /** + * Calculates the Z-score approximation of the u-statistic + * @param n - The number of entries in the DOMINATED set + * @param m - The number of entries in the DOMINANT set + * @param u - the Mann-Whitney U value + * @return z-score associated with the normal approximation + */ + private static double getZApprox(int n, int m, long u) { double mean = ( ((long)m)*n+1.0)/2; double var = (((long) n)*m*(n+m+1.0))/12; double z = ( u - mean )/Math.sqrt(var); - return z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z); + return z; } /** diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java index dd0401b7b..f9e452ae3 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java @@ -76,22 +76,22 @@ public class RegionalAssociationUnitTest extends BaseTest { UTest test1 = new UTest(); test1.setCaseData((Collection) Arrays.asList(2,4,5,6,8)); test1.setControlData((Collection) Arrays.asList(1,3,7,9,10,11,12,13)); - Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test1).first,10l); + Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test1).first,-1.537,1e-4); Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.092292,5e-2); // z-approximation, off by about 0.05 Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.044444,1e-3); // recursive calculation UTest test2 = new UTest(); test2.setCaseData((Collection) Arrays.asList(1,7,8,9,10,11,15,18)); test2.setControlData((Collection) Arrays.asList(2,3,4,5,6,12,13,14,16,17)); - Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test2).first,37l); + Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test2).first,-0.3109831608,1e-10); UTest test3 = new UTest(); test3.setCaseData((Collection)Arrays.asList(13,14,7,18,5,2,9,17,8,10,3,15,19,6,20,16,11,4,12,1)); test3.setControlData((Collection) Arrays.asList(29,21,14,10,12,11,28,19,18,13,7,27,20,5,17,16,9,23,22,26)); - Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test3).first,93l); + Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test3).first,-2.907884571802469,1e-14); Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test3).second,2*0.00302,1e-3); UTest test4 = new UTest(); test4.setCaseData((Collection) Arrays.asList(1,2,4,5,6,9)); test4.setControlData((Collection) Arrays.asList(3,8,11,12,13)); - Assert.assertEquals((long) AssociationTestRunner.mannWhitneyUTest(test4).first,5l); + Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test4).first,-1.9170289512680814,1e-14); Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test4).second,0.0303,1e-4); }