Improved RankSum calculations and RankSum annotation. Much more meaningful

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2266 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-04 22:16:40 +00:00
parent 0753315156
commit 07b88621c5
2 changed files with 158 additions and 23 deletions

View File

@ -11,7 +11,7 @@ import java.util.ArrayList;
public class RankSumTest implements VariantAnnotation {
private final static boolean DEBUG = false;
private static final double minPValue = 1e-10;
public String annotate(ReferenceContext ref, ReadBackedPileup pileup, Variation variation, List<Genotype> genotypes) {
@ -38,7 +38,16 @@ public class RankSumTest implements VariantAnnotation {
for ( Integer qual : refQuals )
wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2);
double pvalue = wilcoxon.getTwoTailedPValue();
// for R debugging
if ( DEBUG ) {
wilcoxon.DEBUG = DEBUG;
System.out.printf("%s%n", pileup.getLocation());
System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals));
System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals));
}
// we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases)
double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT);
if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 )
return null;

View File

@ -5,6 +5,56 @@ import cern.jet.random.Normal;
import java.util.*;
public class WilcoxonRankSum {
static final String headerString = ("nA nB .005 .01 .025 .05 .10 .20");
static final String header[] = headerString.split(" ");
static final double sigs[] = {-1, -1, .005, .01, .025, .05, .10, .20};
// Probabilities relate to the distribution of WA, the rank sum for group A when
// H0 : A = B is true. The tabulated value for the lower tail is the largest
// value of wA for which pr(WA ² wA) ² prob . The tabulated value for the
// upper tail is the smallest value of wfor which pr(W³ w) ² prob .
// I think this data is wrong -- we should really do the computation outselves and remember the results
static final int data[][] = {
{4, 4, -1, -1, 10, 11, 13, 14},
{4, 5, -1, 10, 11, 12, 14, 15},
{4, 6, 10, 11, 12, 13, 15, 17},
{4, 7, 10, 11, 13, 14, 16, 18},
{4, 8, 11, 12, 14, 15, 17, 20},
{4, 9, 11, 13, 14, 16, 19, 21},
{4, 10, 12, 13, 15, 17, 20, 23},
{4, 11, 12, 14, 16, 18, 21, 24},
{4, 12, 13, 15, 17, 19, 22, 26},
{5, 5, 5, 15, 16, 17, 19, 20},
{5, 6, 16, 17, 18, 20, 22, 24},
{5, 7, 16, 18, 20, 21, 23, 26},
{5, 8, 17, 19, 21, 23, 25, 28},
{5, 9, 18, 20, 22, 24, 27, 30},
{5, 10, 19, 21, 23, 26, 28, 32},
{5, 11, 20, 22, 24, 27, 30, 34},
{5, 12, 21, 23, 26, 28, 32, 36},
{6, 6, 23, 24, 26, 28, 30, 33},
{6, 7, 24, 25, 27, 29, 32, 35},
{6, 8, 25, 27, 29, 31, 34, 37},
{6, 9, 26, 28, 31, 33, 36, 40},
{6, 10, 27, 29, 32, 35, 38, 42},
{6, 11, 28, 30, 34, 37, 40, 44},
{6, 12, 30, 32, 35, 38, 42, 47},
{7, 7, 32, 34, 36, 39, 41, 45},
{7, 8, 34, 35, 38, 41, 44, 48},
{7, 9, 35, 37, 40, 43, 46, 50},
{7, 10, 37, 39, 42, 45, 49, 53},
{7, 11, 38, 40, 44, 47, 51, 56},
{7, 12, 40, 42, 46, 49, 54, 59},
{8, 8, 43, 45, 49, 51, 55, 59},
{8, 9, 45, 47, 51, 54, 58, 62},
{8, 10, 47, 49, 53, 56, 60, 65},
{8, 11, 49, 51, 55, 59, 63, 69},
{8, 12, 51, 53, 58, 62, 66, 72},
{9, 9, 56, 59, 62, 66, 70, 75},
{9, 10, 58, 61, 65, 69, 73, 78},
{9, 11, 61, 63, 68, 72, 76, 82},
{9, 12, 63, 66, 71, 75, 80, 86}};
// *****************************************************************************************//
// The following 4 variables were copied from Tim Fennell's RankSumTest.java code in Picard //
@ -12,18 +62,21 @@ public class WilcoxonRankSum {
// Constructs a normal distribution; actual values of mean and SD don't matter since it's
// just used to convert a z-score into a cumulative probability
private static final double NORMAL_MEAN = 100;
private static final double NORMAL_SD = 15;
public boolean DEBUG = false;
private static final double NORMAL_MEAN = 0;
private static final double NORMAL_SD = 1;
private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null);
// The minimum length for both data series (individually) in order to use a normal distribution
// to calculate the Z-score and the p-value. If either series is shorter than this value then
// we don't attempt to assign a p-value
private static final int minimumNormalN = 10;
private static final int minimumNormalN = 5;
// *****************************************************************************************//
public enum WILCOXON_SET { SET1, SET2 }
public enum WILCOXON_H0 { SET1_NE_SET2, SET1_LT_SET2, SET1_GT_SET2, SMALLER_SET_LT }
// random number generator for dithering
private static final long RANDOM_SEED = 1252863494;
@ -42,7 +95,7 @@ public class WilcoxonRankSum {
// calculate normal approximation of the p-value
// returns -1 when unable to calculate it (too few data points)
public double getTwoTailedPValue() {
public double getPValue(WILCOXON_H0 h0) {
if ( observations.size() == 0 )
return -1.0;
@ -63,34 +116,107 @@ public class WilcoxonRankSum {
}
int n2 = observations.size() - n1;
// if we don't have enough data points, quit
if ( n1 < minimumNormalN || n2 < minimumNormalN )
return -1.0;
// todo -- these are actually integers
// we want the smaller of U1 and U2
double U1 = sum - (n1 * (n1 + 1.0) / 2.0);
double U2 = (n1 * n2) - U1;
double U = Math.min(U1, U2);
double U2 = (n1 * n2) - U1;
double pvalue;
// if we don't have enough data points, quit
if ( n1 < minimumNormalN || n2 < minimumNormalN ) {
pvalue = exactCalculation(h0, n1, n2, U1, U2);
} else {
pvalue = normalApproximation(h0, n1, n2, U1, U2);
}
if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) {
//for (int i = 0; i < observations.size(); i++)
// System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? "Alt" : "Ref"));
//System.out.printf("n1 %d n2 %d U1 %f U2 %f pValue %f QPValue %f%n", n1, n2, U1, U2, pvalue, QualityUtils.phredScaleErrorRate(pvalue));
}
return pvalue;
}
private double exactCalculation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) {
double U = 0;
switch (h0) {
// case SET1_NE_SET2: // two-tailed
// double U = Math.min(U1, U2);
// z = (U - MuU) / stdevU;
// pvalue = 2.0 * NORMAL.cdf(z);
// break;
// case SET1_LT_SET2: // one-tailed, SET1 < SET2
// z = (U1 - MuU) / stdevU;
// pvalue = NORMAL.cdf(z);
// break;
// case SET1_GT_SET2: // one-tailed, SET1 < SET2
// z = (U2 - MuU) / stdevU;
// pvalue = NORMAL.cdf(z);
// break;
case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set
U = n1 < n2 ? U1 : U2;
break;
default:
throw new StingException("Unexpected WILCOXON H0: " + h0);
}
// data is nA nB then
double pvalue = 1; // if we don't find anything, the pvalue is 1 [not significant]
for ( int nAi = 0; nAi < data.length; nAi++ ) {
if ( data[nAi][0] == n1 && data[nAi][1] == n2 ) {
for ( int j = 2; j < data[nAi].length; j++ ) {
if ( U < data[nAi][j] ) {
// we are significant
pvalue = sigs[j];
//System.out.printf("Found %d %d %f a match at %d %d with %d %f%n", n1, n2, U, nAi, j, data[nAi][j], pvalue );
break;
}
}
}
}
return pvalue;
}
private double normalApproximation(WILCOXON_H0 h0, int n1, int n2, double U1, double U2) {
// calculate the normal approximation
double MuU = n1 * n2 / 2.0;
double stdevU = Math.sqrt(n1 * n2 * (n1 + n2 + 1.0) / 12.0);
double z = (U - MuU) / stdevU;
// compute p-value. Taken from Tim Fennell's RankSumTest.java code in Picard
double pvalue = 2.0 * NORMAL.cdf(NORMAL_MEAN + z * NORMAL_SD);
// for (int i = 0; i < observations.size(); i++)
// System.out.println(observations.get(i).first + " -> set" + (observations.get(i).second == WILCOXON_SET.SET1 ? 1 : 2));
// System.out.println("U1=" + U1);
// System.out.println("U2=" + U2);
// System.out.println("U=" + U);
// System.out.println("Zscore=" + z);
// System.out.println("Pvalue=" + pvalue);
double z, pvalue;
switch (h0) {
case SET1_NE_SET2: // two-tailed
double U = Math.min(U1, U2);
z = (U - MuU) / stdevU;
pvalue = 2.0 * NORMAL.cdf(z);
break;
case SET1_LT_SET2: // one-tailed, SET1 < SET2
z = (U1 - MuU) / stdevU;
pvalue = NORMAL.cdf(z);
break;
case SET1_GT_SET2: // one-tailed, SET1 < SET2
z = (U2 - MuU) / stdevU;
pvalue = NORMAL.cdf(z);
break;
case SMALLER_SET_LT: // one-tailed that the smaller set is < the bigger set
double smallerU = n1 < n2 ? U1 : U2;
z = (smallerU - MuU) / stdevU;
pvalue = NORMAL.cdf(z);
break;
default:
throw new StingException("Unexpected WILCOXON H0: " + h0);
}
return pvalue;
}
private void printValues() {
for ( Pair<Double, WILCOXON_SET> obs : observations ) {
System.out.printf("%s%n", obs);
}
}
private void dither() {
for ( Pair<Double, WILCOXON_SET> observation : observations ) {
// generate a random number between 0 and 10,000