Improved RankSum calculations and RankSum annotation. Much more meaningful

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2266 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-04 22:16:40 +00:00 · 2009-12-04 22:16:40 +00:00 · 07b88621c5
parent 0753315156
commit 07b88621c5
2 changed files with 158 additions and 23 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
@ -11,7 +11,7 @@ import java.util.ArrayList;


 public class RankSumTest implements VariantAnnotation {
-
+    private final static boolean DEBUG = false;
    private static final double minPValue = 1e-10;

    public String annotate(ReferenceContext ref, ReadBackedPileup pileup, Variation variation, List<Genotype> genotypes) {
@ -38,7 +38,16 @@ public class RankSumTest implements VariantAnnotation {
        for ( Integer qual : refQuals )
            wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2);

-        double pvalue = wilcoxon.getTwoTailedPValue();
+        // for R debugging
+        if ( DEBUG ) {
+            wilcoxon.DEBUG = DEBUG;
+            System.out.printf("%s%n", pileup.getLocation());
+            System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals));
+            System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals));
+        }
+
+        // we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases)
+        double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT);
        if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 )
            return null;

--- a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java
+++ b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java
@ -5,6 +5,56 @@ import cern.jet.random.Normal;
 import java.util.*;

 public class WilcoxonRankSum {
+    static final String headerString = ("nA nB .005 .01 .025 .05 .10 .20");
+    static final String header[] = headerString.split(" ");
+    static final double sigs[] = {-1, -1, .005, .01, .025, .05, .10, .20};
+
+    // Probabilities relate to the distribution of WA, the rank sum for group A when
+    // H0 : A = B is true. The tabulated value for the lower tail is the largest
+    // value of wA for which pr(WA ² wA) ² prob . The tabulated value for the
+    // upper tail is the smallest value of wfor which pr(W³ w) ² prob .
+
+    // I think this data is wrong -- we should really do the computation outselves and remember the results
+    static final int data[][] = {
+            {4,     4,      -1,     -1,     10,     11,     13,     14},
+            {4,     5,      -1,     10,     11,     12,     14,     15},
+            {4,     6,      10,     11,     12,     13,     15,     17},
+            {4,     7,      10,     11,     13,     14,     16,     18},
+            {4,     8,      11,     12,     14,     15,     17,     20},
+            {4,     9,      11,     13,     14,     16,     19,     21},
+            {4,     10,     12,     13,     15,     17,     20,     23},
+            {4,     11,     12,     14,     16,     18,     21,     24},
+            {4,     12,     13,     15,     17,     19,     22,     26},
+            {5,     5,      5,      15,     16,     17,     19,     20},
+            {5,     6,      16,     17,     18,     20,     22,     24},
+            {5,     7,      16,     18,     20,     21,     23,     26},
+            {5,     8,      17,     19,     21,     23,     25,     28},
+            {5,     9,      18,     20,     22,     24,     27,     30},
+            {5,     10,     19,     21,     23,     26,     28,     32},
+            {5,     11,     20,     22,     24,     27,     30,     34},
+            {5,     12,     21,     23,     26,     28,     32,     36},
+            {6,     6,      23,     24,     26,     28,     30,     33},
+            {6,     7,      24,     25,     27,     29,     32,     35},
+            {6,     8,      25,     27,     29,     31,     34,     37},
+            {6,     9,      26,     28,     31,     33,     36,     40},
+            {6,     10,     27,     29,     32,     35,     38,     42},
+            {6,     11,     28,     30,     34,     37,     40,     44},
+            {6,     12,     30,     32,     35,     38,     42,     47},
+            {7,     7,      32,     34,     36,     39,     41,     45},
+            {7,     8,      34,     35,     38,     41,     44,     48},
+            {7,     9,      35,     37,     40,     43,     46,     50},
+            {7,     10,     37,     39,     42,     45,     49,     53},
+            {7,     11,     38,     40,     44,     47,     51,     56},
+            {7,     12,     40,     42,     46,     49,     54,     59},
+            {8,     8,      43,     45,     49,     51,     55,     59},
+            {8,     9,      45,     47,     51,     54,     58,     62},
+            {8,     10,     47,     49,     53,     56,     60,     65},
+            {8,     11,     49,     51,     55,     59,     63,     69},
+            {8,     12,     51,     53,     58,     62,     66,     72},
+            {9,     9,      56,     59,     62,     66,     70,     75},
+            {9,     10,     58,     61,     65,     69,     73,     78},
+            {9,     11,     61,     63,     68,     72,     76,     82},
+            {9,     12,     63,     66,     71,     75,     80,     86}};

    // *****************************************************************************************//
    // The following 4 variables were copied from Tim Fennell's RankSumTest.java code in Picard //
@ -12,18 +62,21 @@ public class WilcoxonRankSum {

    // Constructs a normal distribution; actual values of mean and SD don't matter since it's
    // just used to convert a z-score into a cumulative probability
-    private static final double NORMAL_MEAN = 100;
-    private static final double NORMAL_SD   = 15;
+    public boolean DEBUG = false;
+
+    private static final double NORMAL_MEAN = 0;
+    private static final double NORMAL_SD   = 1;
    private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null);

    // The minimum length for both data series (individually) in order to use a normal distribution
    // to calculate the Z-score and the p-value. If either series is shorter than this value then
    // we don't attempt to assign a p-value
-    private static final int minimumNormalN = 10;
+    private static final int minimumNormalN = 5;

    // *****************************************************************************************//

    public enum WILCOXON_SET { SET1, SET2 }
+    public enum WILCOXON_H0 { SET1_NE_SET2, SET1_LT_SET2, SET1_GT_SET2, SMALLER_SET_LT }

    // random number generator for dithering
    private static final long RANDOM_SEED = 1252863494;
@ -42,7 +95,7 @@ public class WilcoxonRankSum {

    // calculate normal approximation of the p-value
    // returns -1 when unable to calculate it (too few data points)
-    public double getTwoTailedPValue() {
+    public double getPValue(WILCOXON_H0 h0) {
        if ( observations.size() == 0 )
            return -1.0;

@ -63,34 +116,107 @@ public class WilcoxonRankSum {
        }
        int n2 = observations.size() - n1;

-        // if we don't have enough data points, quit
-        if ( n1 < minimumNormalN || n2 < minimumNormalN )
-            return -1.0;
-                
+        // todo -- these are actually integers
        // we want the smaller of U1 and U2
        double U1 = sum - (n1 * (n1 + 1.0) / 2.0);
-        double U2 = (n1 * n2) - U1;                                                                
-        double U = Math.min(U1, U2);
+        double U2 = (n1 * n2) - U1;

+        double pvalue;
+        // if we don't have enough data points, quit
+        if ( n1 < minimumNormalN || n2 < minimumNormalN ) {
+            pvalue = exactCalculation(h0, n1, n2, U1, U2);
+        } else {
+            pvalue = normalApproximation(h0, n1, n2, U1, U2);
+        }
+
+        if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) {
+            //for (int i = 0; i < observations.size(); i++)
+            //    System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? "Alt" : "Ref"));
+            //System.out.printf("n1 %d n2 %d U1 %f U2 %f pValue %f QPValue %f%n", n1, n2, U1, U2, pvalue, QualityUtils.phredScaleErrorRate(pvalue));
+        }
+
+        return pvalue;
+    }
+
+    private double exactCalculation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) {
+        double U = 0;
+        switch (h0) {
+//            case SET1_NE_SET2: // two-tailed
+//                double U = Math.min(U1, U2);
+//                z = (U - MuU) / stdevU;
+//                pvalue = 2.0 * NORMAL.cdf(z);
+//                break;
+//            case SET1_LT_SET2: // one-tailed, SET1 < SET2
+//                z = (U1 - MuU) / stdevU;
+//                pvalue = NORMAL.cdf(z);
+//                break;
+//            case SET1_GT_SET2: // one-tailed, SET1 < SET2
+//                z = (U2 - MuU) / stdevU;
+//                pvalue = NORMAL.cdf(z);
+//                break;
+            case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set
+                U = n1 < n2 ? U1 : U2;
+                break;
+            default:
+                throw new StingException("Unexpected WILCOXON H0: " + h0);
+        }
+
+        // data is nA nB then
+        double pvalue = 1;  // if we don't find anything, the pvalue is 1 [not significant]
+        for ( int nAi = 0; nAi < data.length; nAi++ ) {
+            if ( data[nAi][0] == n1 && data[nAi][1] == n2 ) {
+                for ( int j = 2; j < data[nAi].length; j++ ) {
+                    if ( U < data[nAi][j] ) {
+                        // we are significant
+                        pvalue = sigs[j];
+                        //System.out.printf("Found %d %d %f a match at %d %d with %d %f%n", n1, n2, U, nAi, j, data[nAi][j], pvalue );
+                        break;
+                    }
+                }
+            }
+        }
+
+        return pvalue;
+    }
+
+    private double normalApproximation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) {
        // calculate the normal approximation
        double MuU = n1 * n2 / 2.0;
        double stdevU = Math.sqrt(n1 * n2 * (n1 + n2 + 1.0) / 12.0);
-        double z = (U - MuU) / stdevU;

-        // compute p-value.  Taken from Tim Fennell's RankSumTest.java code in Picard
-        double pvalue = 2.0 * NORMAL.cdf(NORMAL_MEAN + z * NORMAL_SD);
-
-        // for (int i = 0; i < observations.size(); i++)
-        //      System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? 1 : 2));
-        // System.out.println("U1=" + U1);
-        // System.out.println("U2=" + U2);
-        // System.out.println("U=" + U);
-        // System.out.println("Zscore=" + z);
-        // System.out.println("Pvalue=" + pvalue);
+        double z, pvalue;
+        switch (h0) {
+            case SET1_NE_SET2: // two-tailed
+                double U = Math.min(U1, U2);
+                z = (U - MuU) / stdevU;
+                pvalue = 2.0 * NORMAL.cdf(z);
+                break;
+            case SET1_LT_SET2: // one-tailed, SET1 < SET2
+                z = (U1 - MuU) / stdevU;
+                pvalue = NORMAL.cdf(z);
+                break;
+            case SET1_GT_SET2: // one-tailed, SET1 < SET2
+                z = (U2 - MuU) / stdevU;
+                pvalue = NORMAL.cdf(z);
+                break;
+            case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set
+                double smallerU = n1 < n2 ? U1 : U2;
+                z = (smallerU - MuU) / stdevU;
+                pvalue = NORMAL.cdf(z);
+                break;
+            default:
+                throw new StingException("Unexpected WILCOXON H0: " + h0);
+        }

        return pvalue;
    }

+    private void printValues() {
+        for ( Pair<Double, WILCOXON_SET> obs : observations ) {
+            System.out.printf("%s%n", obs);
+        }
+    }
+
    private void dither() {
        for ( Pair<Double, WILCOXON_SET> observation : observations ) {
            // generate a random number between 0 and 10,000