package org.broadinstitute.sting.utils; import cern.jet.math.Arithmetic; import cern.jet.random.Normal; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import java.util.Comparator; import java.util.TreeSet; /** * Created by IntelliJ IDEA. * User: chartl */ public class MannWhitneyU { private TreeSet> observations; private int sizeSet1; private int sizeSet2; public MannWhitneyU() { observations = new TreeSet>(new DitheringComparator()); sizeSet1 = 0; sizeSet2 = 0; } /** * Add an observation into the observation tree * @param n: the observation (a number) * @param set: whether the observation comes from set 1 or set 2 */ public void add(Number n, USet set) { observations.add(new Pair(n,set)); if ( set == USet.SET1 ) { ++sizeSet1; } else { ++sizeSet2; } } /** * temporary method that will be generalized. Runs the standard two-sided test, * returns the u and p values. * @Returns a pair holding the u and p-value. */ public Pair runTwoSidedTest() { Pair uPair = calculateTwoSidedU(observations); int u = uPair.first; int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; double pval = calculateP(n,m,u,true); return new Pair(u,pval); } /** * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate * @param n - The number of entries in the DOMINATED set * @param m - The number of entries in the DOMINANT set * @param u - the Mann-Whitney U value * @param twoSided - is the test twosided * @return the (possibly approximate) p-value associated with the MWU test */ public static double calculateP(int n, int m, int u, boolean twoSided) { double pval; if ( n > 8 && m > 8 ) { pval = calculatePNormalApproximation(n,m,u); } else if ( n > 4 && m > 8 ) { pval = calculatePUniformApproximation(n,m,u); } else { pval = calculatePRecursively(n,m,u); } return twoSided ? 2*pval : pval; } /** * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] * @param n - The number of entries in the DOMINATED set * @param m - The number of entries in the DOMINANT set * @param u - the Mann-Whitney U value * @return p-value associated with the normal approximation */ public static double calculatePNormalApproximation(int n,int m,int u) { double mean = ((double) m*n+1)/2; Normal normal = new Normal( mean , ((double) n*m*(n+m+1))/12, null); return u < mean ? normal.cdf(u) : 1.0-normal.cdf(u); } /** * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF) * @param n - * @param m - * @param u - * @return */ public static double calculatePUniformApproximation(int n, int m, int u) { int R = u + (n*(n+1))/2; double a = Math.sqrt(m*(n+m+1)); double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); double z = b + R/a; if ( z < 0 ) { return 0.0; } else if ( z > n ) { return 1.0; } else { return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); } } /** * Helper function for the sum of n uniform random variables * @param z - value at which to compute the (un-normalized) cdf * @param m - a cutoff integer (defined by m <= z < m + 1) * @param n - the number of uniform random variables * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) * @return the (un-normalized) cdf for the sum of n random variables */ private static double uniformSumHelper(double z, int m, int n, int k) { if ( k > m ) { return 0; } int coef = (k % 2 == 0) ? 1 : -1; return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); } /** * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for * later on simply by multiplying the p-value by 2 * @param observed * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) */ public static Pair calculateTwoSidedU(TreeSet> observed ) { int set1SeenSoFar = 0; int set2SeenSoFar = 0; int uSet1DomSet2 = 0; int uSet2DomSet1 = 0; USet previous = null; for ( Pair dataPoint : observed ) { if ( previous != null && previous != dataPoint.second ) { if ( dataPoint.second == USet.SET1 ) { uSet2DomSet1 += set2SeenSoFar; } else { uSet1DomSet2 += set1SeenSoFar; } } if ( dataPoint.second == USet.SET1 ) { ++set1SeenSoFar; } else { ++set2SeenSoFar; } previous = dataPoint.second; } return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); } /** * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor * calls into that recursive calculation. * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two) * @param m: number of set-two entries * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. */ public static double calculatePRecursively(int n, int m, int u) { if ( m > 6 && n > 4 || m + n > 16 ) { throw new StingException("Please use the appropriate (normal or sum of uniform) approximation"); } return cpr(n,m,u); } /** * @doc: just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] * @n: number of set-1 entries * @m: number of set-2 entries * @u: number of times a set-2 entry as preceded a set-1 entry */ private static double cpr(int n, int m, int u) { if ( u < 0 || n == 0 && m == 0 ) { return 0.0; } if ( m == 0 || n == 0 ) { // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway return ( u == 0 ) ? 1.0 : 0.0; } return (n/(n+m))*cpr(n-1,m,u-m) + (m/(n+m))*cpr(n,m-1,u); } /** * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. */ private class DitheringComparator implements Comparator> { public DitheringComparator() {} public boolean equals(Object other) { return false; } public int compare(Pair left, Pair right) { double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); if ( comp > 0 ) { return 1; } if ( comp < 0 ) { return -1; } return MathUtils.rand.nextBoolean() ? -1 : 1; } } public enum USet { SET1, SET2 } }