From f6dfdc7f3b702bf0bf8715a87de4dbac8be6c470 Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 29 Mar 2011 15:53:40 +0000 Subject: [PATCH] Single-tailed hypothesis testing in MWU git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5533 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/MannWhitneyU.java | 49 ++++++++++++++++++- .../walkers/RegionalAssociationUnitTest.java | 22 +++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index d2575c6cc..27ee25e5b 100755 --- a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -41,8 +41,22 @@ public class MannWhitneyU { } /** - * temporary method that will be generalized. Runs the standard two-sided test, - * returns the u and p values. + * Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically + * dominates the other set + * @param lessThanOther - either Set1 or Set2 + * @return - u-based z-approximation, and p-value associated with the test (p-value is exact for small n,m) + */ + public Pair runOneSidedTest(USet lessThanOther) { + long u = calculateOneSidedU(observations, lessThanOther); + int n = lessThanOther == USet.SET1 ? sizeSet1 : sizeSet2; + int m = lessThanOther == USet.SET1 ? sizeSet2 : sizeSet1; + double pval = calculateP(n,m,u,false); + return new Pair(getZApprox(n,m,u),pval); + } + + /** + * Runs the standard two-sided test, + * returns the u-based z-approximate and p values. * @Returns a pair holding the u and p-value. */ public Pair runTwoSidedTest() { @@ -185,6 +199,27 @@ public class MannWhitneyU { return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); } + /** + * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates + * the other U-set + * @param observed - the observed data points, tagged by each set + * @param dominator - the set that is hypothesized to be stochastically dominating + * @return the u-statistic associated with the hypothesis + */ + public static long calculateOneSidedU(TreeSet> observed,USet dominator) { + long domninatorBeforeOther = 0l; + int domSeenSoFar = 0; + for ( Pair dataPoint : observed ) { + if ( dataPoint.second == dominator ) { + ++domSeenSoFar; + } else { + domninatorBeforeOther += domSeenSoFar; + } + } + + return domninatorBeforeOther; + } + /** * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor @@ -219,6 +254,16 @@ public class MannWhitneyU { return (((double)n)/(n+m))*cpr(n-1,m,u-m) + (((double)m)/(n+m))*cpr(n,m-1,u); } + /** + * hook into the data tree, for testing purposes only + * @return observations + * @deprecated - only for testing + */ + @Deprecated + public TreeSet> getObservations() { + return observations; + } + /** * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java index f9e452ae3..a9483a291 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/RegionalAssociationUnitTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers; import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.UStatistic; import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.ZStatistic; +import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationTestRunner; @@ -96,6 +97,27 @@ public class RegionalAssociationUnitTest extends BaseTest { } + @Test + private void testMWU() { + logger.warn("Testing MWU"); + MannWhitneyU mwu = new MannWhitneyU(); + mwu.add(0, MannWhitneyU.USet.SET1); + mwu.add(1,MannWhitneyU.USet.SET2); + mwu.add(2,MannWhitneyU.USet.SET2); + mwu.add(3,MannWhitneyU.USet.SET2); + mwu.add(4,MannWhitneyU.USet.SET2); + mwu.add(5,MannWhitneyU.USet.SET2); + mwu.add(6,MannWhitneyU.USet.SET1); + mwu.add(7,MannWhitneyU.USet.SET1); + mwu.add(8,MannWhitneyU.USet.SET1); + mwu.add(9,MannWhitneyU.USet.SET1); + mwu.add(10,MannWhitneyU.USet.SET1); + mwu.add(11,MannWhitneyU.USet.SET2); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),11l); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),25l); + + } + private class TTest extends TStatistic { Map> toTest = new HashMap>(2);