diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 5adfba700..2e98f8ce5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -11,7 +11,7 @@ import java.util.ArrayList; public class RankSumTest implements VariantAnnotation { - + private final static boolean DEBUG = false; private static final double minPValue = 1e-10; public String annotate(ReferenceContext ref, ReadBackedPileup pileup, Variation variation, List genotypes) { @@ -38,7 +38,16 @@ public class RankSumTest implements VariantAnnotation { for ( Integer qual : refQuals ) wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2); - double pvalue = wilcoxon.getTwoTailedPValue(); + // for R debugging + if ( DEBUG ) { + wilcoxon.DEBUG = DEBUG; + System.out.printf("%s%n", pileup.getLocation()); + System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals)); + System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals)); + } + + // we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases) + double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT); if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) return null; diff --git a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java index 0b5430057..ca70b0671 100755 --- a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java +++ b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java @@ -5,6 +5,56 @@ import cern.jet.random.Normal; import java.util.*; public class WilcoxonRankSum { + static final String headerString = ("nA nB .005 .01 .025 .05 .10 .20"); + static final String header[] = headerString.split(" "); + static final double sigs[] = {-1, -1, .005, .01, .025, .05, .10, .20}; 
+ + // Probabilities relate to the distribution of WA, the rank sum for group A when + // H0 : A = B is true. The tabulated value for the lower tail is the largest + // value of wA for which pr(WA <= wA) <= prob . The tabulated value for the + // upper tail is the smallest value of w for which pr(W >= w) <= prob . + + // I think this data is wrong -- we should really do the computation ourselves and remember the results + static final int data[][] = { + {4, 4, -1, -1, 10, 11, 13, 14}, + {4, 5, -1, 10, 11, 12, 14, 15}, + {4, 6, 10, 11, 12, 13, 15, 17}, + {4, 7, 10, 11, 13, 14, 16, 18}, + {4, 8, 11, 12, 14, 15, 17, 20}, + {4, 9, 11, 13, 14, 16, 19, 21}, + {4, 10, 12, 13, 15, 17, 20, 23}, + {4, 11, 12, 14, 16, 18, 21, 24}, + {4, 12, 13, 15, 17, 19, 22, 26}, + {5, 5, 5, 15, 16, 17, 19, 20}, + {5, 6, 16, 17, 18, 20, 22, 24}, + {5, 7, 16, 18, 20, 21, 23, 26}, + {5, 8, 17, 19, 21, 23, 25, 28}, + {5, 9, 18, 20, 22, 24, 27, 30}, + {5, 10, 19, 21, 23, 26, 28, 32}, + {5, 11, 20, 22, 24, 27, 30, 34}, + {5, 12, 21, 23, 26, 28, 32, 36}, + {6, 6, 23, 24, 26, 28, 30, 33}, + {6, 7, 24, 25, 27, 29, 32, 35}, + {6, 8, 25, 27, 29, 31, 34, 37}, + {6, 9, 26, 28, 31, 33, 36, 40}, + {6, 10, 27, 29, 32, 35, 38, 42}, + {6, 11, 28, 30, 34, 37, 40, 44}, + {6, 12, 30, 32, 35, 38, 42, 47}, + {7, 7, 32, 34, 36, 39, 41, 45}, + {7, 8, 34, 35, 38, 41, 44, 48}, + {7, 9, 35, 37, 40, 43, 46, 50}, + {7, 10, 37, 39, 42, 45, 49, 53}, + {7, 11, 38, 40, 44, 47, 51, 56}, + {7, 12, 40, 42, 46, 49, 54, 59}, + {8, 8, 43, 45, 49, 51, 55, 59}, + {8, 9, 45, 47, 51, 54, 58, 62}, + {8, 10, 47, 49, 53, 56, 60, 65}, + {8, 11, 49, 51, 55, 59, 63, 69}, + {8, 12, 51, 53, 58, 62, 66, 72}, + {9, 9, 56, 59, 62, 66, 70, 75}, + {9, 10, 58, 61, 65, 69, 73, 78}, + {9, 11, 61, 63, 68, 72, 76, 82}, + {9, 12, 63, 66, 71, 75, 80, 86}}; // *****************************************************************************************// // The following 4 variables were copied from Tim Fennell's RankSumTest.java code in Picard // @@ -12,18 
+62,21 @@ public class WilcoxonRankSum { // Constructs a normal distribution; actual values of mean and SD don't matter since it's // just used to convert a z-score into a cumulative probability - private static final double NORMAL_MEAN = 100; - private static final double NORMAL_SD = 15; + public boolean DEBUG = false; + + private static final double NORMAL_MEAN = 0; + private static final double NORMAL_SD = 1; private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null); // The minimum length for both data series (individually) in order to use a normal distribution // to calculate the Z-score and the p-value. If either series is shorter than this value then // we don't attempt to assign a p-value - private static final int minimumNormalN = 10; + private static final int minimumNormalN = 5; // *****************************************************************************************// public enum WILCOXON_SET { SET1, SET2 } + public enum WILCOXON_H0 { SET1_NE_SET2, SET1_LT_SET2, SET1_GT_SET2, SMALLER_SET_LT } // random number generator for dithering private static final long RANDOM_SEED = 1252863494; @@ -42,7 +95,7 @@ public class WilcoxonRankSum { // calculate normal approximation of the p-value // returns -1 when unable to calculate it (too few data points) - public double getTwoTailedPValue() { + public double getPValue(WILCOXON_H0 h0) { if ( observations.size() == 0 ) return -1.0; @@ -63,34 +116,107 @@ public class WilcoxonRankSum { } int n2 = observations.size() - n1; - // if we don't have enough data points, quit - if ( n1 < minimumNormalN || n2 < minimumNormalN ) - return -1.0; - + // todo -- these are actually integers // we want the smaller of U1 and U2 double U1 = sum - (n1 * (n1 + 1.0) / 2.0); - double U2 = (n1 * n2) - U1; - double U = Math.min(U1, U2); + double U2 = (n1 * n2) - U1; + double pvalue; + // if we don't have enough data points, quit + if ( n1 < minimumNormalN || n2 < minimumNormalN ) { + pvalue = exactCalculation(h0, n1, n2, 
U1, U2); + } else { + pvalue = normalApproximation(h0, n1, n2, U1, U2); + } + + if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) { + //for (int i = 0; i < observations.size(); i++) + // System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? "Alt" : "Ref")); + //System.out.printf("n1 %d n2 %d U1 %f U2 %f pValue %f QPValue %f%n", n1, n2, U1, U2, pvalue, QualityUtils.phredScaleErrorRate(pvalue)); + } + + return pvalue; + } + + private double exactCalculation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) { + double U = 0; + switch (h0) { +// case SET1_NE_SET2: // two-tailed +// double U = Math.min(U1, U2); +// z = (U - MuU) / stdevU; +// pvalue = 2.0 * NORMAL.cdf(z); +// break; +// case SET1_LT_SET2: // one-tailed, SET1 < SET2 +// z = (U1 - MuU) / stdevU; +// pvalue = NORMAL.cdf(z); +// break; +// case SET1_GT_SET2: // one-tailed, SET1 > SET2 +// z = (U2 - MuU) / stdevU; +// pvalue = NORMAL.cdf(z); +// break; + case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set + U = n1 < n2 ? U1 : U2; + break; + default: + throw new StingException("Unexpected WILCOXON H0: " + h0); + } + + // data rows are: nA, nB, then the critical values at each significance level + double pvalue = 1; // if we don't find anything, the pvalue is 1 [not significant] + for ( int nAi = 0; nAi < data.length; nAi++ ) { + if ( data[nAi][0] == n1 && data[nAi][1] == n2 ) { + for ( int j = 2; j < data[nAi].length; j++ ) { + if ( U < data[nAi][j] ) { + // we are significant + pvalue = sigs[j]; + //System.out.printf("Found %d %d %f a match at %d %d with %d %f%n", n1, n2, U, nAi, j, data[nAi][j], pvalue ); + break; + } + } + } + } + + return pvalue; + } + + private double normalApproximation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) { // calculate the normal approximation double MuU = n1 * n2 / 2.0; double stdevU = Math.sqrt(n1 * n2 * (n1 + n2 + 1.0) / 12.0); - double z = (U - MuU) / stdevU; - // compute p-value. 
Taken from Tim Fennell's RankSumTest.java code in Picard - double pvalue = 2.0 * NORMAL.cdf(NORMAL_MEAN + z * NORMAL_SD); - - // for (int i = 0; i < observations.size(); i++) - // System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? 1 : 2)); - // System.out.println("U1=" + U1); - // System.out.println("U2=" + U2); - // System.out.println("U=" + U); - // System.out.println("Zscore=" + z); - // System.out.println("Pvalue=" + pvalue); + double z, pvalue; + switch (h0) { + case SET1_NE_SET2: // two-tailed + double U = Math.min(U1, U2); + z = (U - MuU) / stdevU; + pvalue = 2.0 * NORMAL.cdf(z); + break; + case SET1_LT_SET2: // one-tailed, SET1 < SET2 + z = (U1 - MuU) / stdevU; + pvalue = NORMAL.cdf(z); + break; + case SET1_GT_SET2: // one-tailed, SET1 > SET2 + z = (U2 - MuU) / stdevU; + pvalue = NORMAL.cdf(z); + break; + case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set + double smallerU = n1 < n2 ? U1 : U2; + z = (smallerU - MuU) / stdevU; + pvalue = NORMAL.cdf(z); + break; + default: + throw new StingException("Unexpected WILCOXON H0: " + h0); + } return pvalue; } + private void printValues() { + for ( Pair obs : observations ) { + System.out.printf("%s%n", obs); + } + } + private void dither() { + for ( Pair observation : observations ) { + // generate a random number between 0 and 10,000