From 7c6c4906522c18284e37d21661bb3935b62d9068 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 1 Dec 2009 04:56:17 +0000 Subject: [PATCH] An unfinished implementation of the Wilcoxon rank sum test and a variant annotation that uses it. I need to merge and update this code with Tim's implementation somehow - but that won't happen until later this week, so I'm committing this before I accidentally blow it away. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2193 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/annotator/RankSumTest.java | 75 +++++++++ .../sting/utils/WilcoxonRankSum.java | 144 ++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java create mode 100755 java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java new file mode 100755 index 000000000..ccc6618a1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -0,0 +1,75 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.genotype.*; + +import java.util.List; +import java.util.ArrayList; + + +public class RankSumTest implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, Variation variation, List genotypes) { + + if ( genotypes.size() == 0 ) + return null; + + // this test doesn't make sense for homs + Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes); + if ( genotype == null ) + return null; + + ArrayList refQuals = new ArrayList(); + ArrayList altQuals = new ArrayList(); + + if ( genotype instanceof ReadBacked && ((ReadBacked)genotype).getPileup() != null ) + fillQualsFromGenotypes(ref.getBase(), variation.getAlternativeBaseForSNP(), genotypes, refQuals, altQuals); + else + fillQualsFromPileup(ref.getBase(), variation.getAlternativeBaseForSNP(), pileup, refQuals, altQuals); + + WilcoxonRankSum wilcoxon = new WilcoxonRankSum(); + for ( Integer qual : altQuals ) + wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET1); + for ( Integer qual : refQuals ) + wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2); + + double pvalue = wilcoxon.getTwoTailedPValue(); + if ( MathUtils.compareDoubles(pvalue, 0.0) == 0 ) + return null; + + return new Pair("RankSum", String.format("%.1f", -10.0 * Math.log10(pvalue))); + } + + private void fillQualsFromPileup(char ref, char alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + for ( PileupElement p : pileup ) { + // ignore deletions + if ( p.isDeletion() ) + continue; + + char base = (char)p.getBase(); + if ( base == ref ) + refQuals.add((int)p.getQual()); + else if ( base == alt ) + altQuals.add((int)p.getQual()); + } + } + + private void fillQualsFromGenotypes(char ref, char alt, List genotypes, List refQuals, List altQuals) { + // accumulate quals + for ( Genotype g : genotypes ) { + if ( !(g instanceof ReadBacked) ) + continue; + + ReadBackedPileup pileup = ((ReadBacked)g).getPileup(); + if ( pileup == null ) + continue; + + fillQualsFromPileup(ref, alt, pileup, refQuals, altQuals); + } + } + + public boolean useZeroQualityReads() { return false; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java new file mode 100755 index 000000000..2e2b9dcba --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java @@ -0,0 +1,144 @@ +package org.broadinstitute.sting.utils; + +import java.util.*; + +public class WilcoxonRankSum { + + private LinkedList> observations = new LinkedList>(); + + public enum WILCOXON_SET { SET1, SET2 } + + public WilcoxonRankSum() {} + + public void addObservation(Double observation, WILCOXON_SET set) { + observations.add(new Pair(observation, set)); + } + + public double getTwoTailedPValue() { + if ( observations.size() == 0 ) + return 0.0; + +//////// +// Remove me +//////// + observations.clear(); + + for (int i=0; i < 50; i++) + addObservation(1.0, WILCOXON_SET.SET2); + for (int i=0; i < 50; i++) + addObservation(3.0, WILCOXON_SET.SET1); + + observations.clear(); + + observations.add(new Pair(1.0, WILCOXON_SET.SET2)); + observations.add(new Pair(1.0, WILCOXON_SET.SET2)); + observations.add(new Pair(1.0, WILCOXON_SET.SET2)); + observations.add(new Pair(1.0, WILCOXON_SET.SET2)); + + observations.add(new Pair(8.0, WILCOXON_SET.SET1)); + observations.add(new Pair(8.0, WILCOXON_SET.SET1)); + observations.add(new Pair(8.0, WILCOXON_SET.SET1)); + observations.add(new Pair(8.0, WILCOXON_SET.SET1)); + + observations.clear(); + + observations.add(new Pair(2.0, WILCOXON_SET.SET1)); + observations.add(new Pair(4.0, WILCOXON_SET.SET1)); + observations.add(new Pair(6.0, WILCOXON_SET.SET1)); + observations.add(new Pair(8.0, WILCOXON_SET.SET1)); + + observations.add(new Pair(1.0, WILCOXON_SET.SET2)); + observations.add(new Pair(2.0, WILCOXON_SET.SET2)); + observations.add(new Pair(3.0, WILCOXON_SET.SET2)); + observations.add(new Pair(4.0, WILCOXON_SET.SET2)); +//////// + + + // sort + Collections.sort(observations, new PairComparator()); + + // rank + double[] ranks = calculateRanks(observations); + // for (int i = 0; i < ranks.length; i++) + // System.out.println(observations.get(i).first + " -> " + ranks[i]); + + // sum + double sum = 0.0; + int n1 = 0; + for (int i = 0; i < ranks.length; i++) { + if ( observations.get(i).second == WILCOXON_SET.SET1 ) { + sum += ranks[i]; + n1++; + } + } + int n2 = ranks.length - n1; + + // we want the smaller of U1 and U2 + double U1 = sum - (n1 * (n1 + 1.0) / 2.0); + double U2 = (n1 * n2) - U1; + double U = Math.min(U1, U2); + + // calculate the normal approximation + double MuU = n1 * n2 / 2.0; + double stdevU = Math.sqrt(n1 * n2 * (n1 + n2 + 1.0) / 12.0); + double z = (U - MuU) / stdevU; + + // System.out.println("U1=" + U1); + // System.out.println("U2=" + U2); + // System.out.println("U=" + U); + // System.out.println("z=" + z); + + // compute p-value + // double pvalue = ndtr(z); // normal distribution function + double pvalue = 0.0; + + return pvalue; + } + + private static double[] calculateRanks(List> observations) { + int length = observations.size(); + double[] ranks = new double[length]; + + int currentIndex = 1; + Double currentValue = observations.get(0).first; + int startIndex = 0; + int endIndex = 0; + + while ( currentIndex < length ) { + // if two observations have the same value, they'll need to be ranked together + if ( observations.get(currentIndex).first.equals(currentValue) ) { + endIndex++; + } else { + setRanks(ranks, startIndex, endIndex); + + // increment the holders + startIndex = endIndex = currentIndex; + currentValue = observations.get(currentIndex).first; + } + currentIndex++; + } + if ( startIndex < length ) + setRanks(ranks, startIndex, endIndex); + + return ranks; + } + + private static void setRanks(double[] ranks, int startIndex, int endIndex) { + + // the rank is the average rank of all equal observations + double rankValue = 0.0; + for (int i = startIndex; i <= endIndex; i++) + rankValue += (i+1); + rankValue /= (endIndex - startIndex + 1); + + // set the value + for (int i = startIndex; i <= endIndex; i++) + ranks[i] = rankValue; + } + + private class PairComparator implements Comparator>{ + public int compare(Pair p1, Pair p2) { + return (p1.first < p2.first ? -1 : (p1.first == p2.first ? 0 : 1)); + } + } +} \ No newline at end of file