An unfinished implementation of the Wilcoxon rank sum test and a variant annotation that uses it. I need to merge and update this code with Tim's implementation somehow - but that won't happen until later this week, so I'm committing this before I accidentally blow it away.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2193 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-01 04:56:17 +00:00
parent 00f15ea909
commit 7c6c490652
2 changed files with 219 additions and 0 deletions

View File

@ -0,0 +1,75 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.List;
import java.util.ArrayList;
public class RankSumTest implements VariantAnnotation {
public Pair<String, String> annotate(ReferenceContext ref, ReadBackedPileup pileup, Variation variation, List<Genotype> genotypes) {
if ( genotypes.size() == 0 )
return null;
// this test doesn't make sense for homs
Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes);
if ( genotype == null )
return null;
ArrayList<Integer> refQuals = new ArrayList<Integer>();
ArrayList<Integer> altQuals = new ArrayList<Integer>();
if ( genotype instanceof ReadBacked && ((ReadBacked)genotype).getPileup() != null )
fillQualsFromGenotypes(ref.getBase(), variation.getAlternativeBaseForSNP(), genotypes, refQuals, altQuals);
else
fillQualsFromPileup(ref.getBase(), variation.getAlternativeBaseForSNP(), pileup, refQuals, altQuals);
WilcoxonRankSum wilcoxon = new WilcoxonRankSum();
for ( Integer qual : altQuals )
wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET1);
for ( Integer qual : refQuals )
wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2);
double pvalue = wilcoxon.getTwoTailedPValue();
if ( MathUtils.compareDoubles(pvalue, 0.0) == 0 )
return null;
return new Pair<String, String>("RankSum", String.format("%.1f", -10.0 * Math.log10(pvalue)));
}
private void fillQualsFromPileup(char ref, char alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals) {
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() )
continue;
char base = (char)p.getBase();
if ( base == ref )
refQuals.add((int)p.getQual());
else if ( base == alt )
altQuals.add((int)p.getQual());
}
}
private void fillQualsFromGenotypes(char ref, char alt, List<Genotype> genotypes, List<Integer> refQuals, List<Integer> altQuals) {
// accumulate quals
for ( Genotype g : genotypes ) {
if ( !(g instanceof ReadBacked) )
continue;
ReadBackedPileup pileup = ((ReadBacked)g).getPileup();
if ( pileup == null )
continue;
fillQualsFromPileup(ref, alt, pileup, refQuals, altQuals);
}
}
public boolean useZeroQualityReads() { return false; }
}

View File

@ -0,0 +1,144 @@
package org.broadinstitute.sting.utils;
import java.util.*;
public class WilcoxonRankSum {
private LinkedList<Pair<Double, WILCOXON_SET>> observations = new LinkedList<Pair<Double, WILCOXON_SET>>();
public enum WILCOXON_SET { SET1, SET2 }
public WilcoxonRankSum() {}
public void addObservation(Double observation, WILCOXON_SET set) {
observations.add(new Pair<Double, WILCOXON_SET>(observation, set));
}
public double getTwoTailedPValue() {
if ( observations.size() == 0 )
return 0.0;
////////
// Remove me
////////
observations.clear();
for (int i=0; i < 50; i++)
addObservation(1.0, WILCOXON_SET.SET2);
for (int i=0; i < 50; i++)
addObservation(3.0, WILCOXON_SET.SET1);
observations.clear();
observations.add(new Pair<Double, WILCOXON_SET>(1.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(1.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(1.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(1.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(8.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(8.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(8.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(8.0, WILCOXON_SET.SET1));
observations.clear();
observations.add(new Pair<Double, WILCOXON_SET>(2.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(4.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(6.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(8.0, WILCOXON_SET.SET1));
observations.add(new Pair<Double, WILCOXON_SET>(1.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(2.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(3.0, WILCOXON_SET.SET2));
observations.add(new Pair<Double, WILCOXON_SET>(4.0, WILCOXON_SET.SET2));
////////
// sort
Collections.sort(observations, new PairComparator());
// rank
double[] ranks = calculateRanks(observations);
// for (int i = 0; i < ranks.length; i++)
// System.out.println(observations.get(i).first + " -> " + ranks[i]);
// sum
double sum = 0.0;
int n1 = 0;
for (int i = 0; i < ranks.length; i++) {
if ( observations.get(i).second == WILCOXON_SET.SET1 ) {
sum += ranks[i];
n1++;
}
}
int n2 = ranks.length - n1;
// we want the smaller of U1 and U2
double U1 = sum - (n1 * (n1 + 1.0) / 2.0);
double U2 = (n1 * n2) - U1;
double U = Math.min(U1, U2);
// calculate the normal approximation
double MuU = n1 * n2 / 2.0;
double stdevU = Math.sqrt(n1 * n2 * (n1 + n2 + 1.0) / 12.0);
double z = (U - MuU) / stdevU;
// System.out.println("U1=" + U1);
// System.out.println("U2=" + U2);
// System.out.println("U=" + U);
// System.out.println("z=" + z);
// compute p-value
// double pvalue = ndtr(z); // normal distribution function
double pvalue = 0.0;
return pvalue;
}
private static double[] calculateRanks(List<Pair<Double, WILCOXON_SET>> observations) {
int length = observations.size();
double[] ranks = new double[length];
int currentIndex = 1;
Double currentValue = observations.get(0).first;
int startIndex = 0;
int endIndex = 0;
while ( currentIndex < length ) {
// if two observations have the same value, they'll need to be ranked together
if ( observations.get(currentIndex).first.equals(currentValue) ) {
endIndex++;
} else {
setRanks(ranks, startIndex, endIndex);
// increment the holders
startIndex = endIndex = currentIndex;
currentValue = observations.get(currentIndex).first;
}
currentIndex++;
}
if ( startIndex < length )
setRanks(ranks, startIndex, endIndex);
return ranks;
}
private static void setRanks(double[] ranks, int startIndex, int endIndex) {
// the rank is the average rank of all equal observations
double rankValue = 0.0;
for (int i = startIndex; i <= endIndex; i++)
rankValue += (i+1);
rankValue /= (endIndex - startIndex + 1);
// set the value
for (int i = startIndex; i <= endIndex; i++)
ranks[i] = rankValue;
}
private class PairComparator implements Comparator<Pair<Double, WILCOXON_SET>>{
public int compare(Pair<Double, WILCOXON_SET> p1, Pair<Double, WILCOXON_SET> p2) {
return (p1.first < p2.first ? -1 : (p1.first == p2.first ? 0 : 1));
}
}
}