Structural refactoring in advance of dichotomization statistics; generalization of statistical test infrastructure.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5579 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-04-05 18:52:32 +00:00
parent 77ca4eef31
commit 79b5fa6cc5
25 changed files with 318 additions and 233 deletions

View File

@ -1,15 +1,13 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import cern.jet.random.Normal;
import cern.jet.random.StudentT;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.*;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
@ -23,122 +21,34 @@ import org.broadinstitute.sting.utils.collections.Pair;
public class AssociationTestRunner {
final static int MAX_Q_VALUE = Integer.MAX_VALUE;
// todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance
static Normal standardNormal = new Normal(0.0,1.0,null);
public static int pToQ(double p) {
return Math.min((int) Math.floor(QualityUtils.phredScaleErrorRate(p)),MAX_Q_VALUE);
}
public static Pair<Double,Pair<Double,Integer>> getTestValues(AssociationContext context) {
if ( context instanceof TStatistic) {
Pair<Double,Double> t = testStudentT((TStatistic) context);
return new Pair<Double,Pair<Double,Integer>> (t.first,new Pair<Double,Integer>(t.second,pToQ(t.second)));
}
if ( context instanceof ZStatistic) {
Pair<Double,Double> z = testZ((ZStatistic) context);
return new Pair<Double,Pair<Double,Integer>> (z.first,new Pair<Double,Integer>(z.second,pToQ(z.second)));
}
if ( context instanceof UStatistic ) {
Pair<Double,Double> u = mannWhitneyUTest((UStatistic) context);
return new Pair<Double,Pair<Double,Integer>> ( u.first, new Pair<Double,Integer>(u.second,pToQ(u.second)));
if ( context instanceof StatisticalTest ) {
Pair<Double,Double> statAndP = ((StatisticalTest) context).getStatisticAndPValue();
return new Pair<Double,Pair<Double,Integer>>(statAndP.first,
new Pair<Double,Integer>(statAndP.second,pToQ(statAndP.second)));
}
return null;
}
public static String runTests(AssociationContext context) {
List<String> results = new ArrayList<String>();
if ( context instanceof TStatistic) {
results.add(runStudentT((TStatistic) context));
if ( context instanceof StatisticalTest ) {
Pair<Double,Pair<Double,Integer>> results = getTestValues(context);
return String.format("%s: %.2f\tP: %.2e\tQ: %d",
((StatisticalTest) context).getStatisticName() ,
results.first,results.second.first,results.second.second);
}
if ( context instanceof ZStatistic) {
results.add(runZ((ZStatistic) context));
}
if ( context instanceof UStatistic ) {
results.add(runU((UStatistic) context));
}
StringBuffer buf = new StringBuffer();
if ( results.size() > 0 ) {
buf.append(results.remove(0));
for ( String s : results ) {
buf.append('\t');
buf.append(s);
}
}
return buf.toString();
return null;
}
public static String runStudentT(TStatistic context) {
Pair<Double,Double> stats = testStudentT(context);
double t = stats.first;
double p = stats.second;
return String.format("T: %.2f\tP: %.2e\tQ: %d",t,p,pToQ(p));
}
public static Pair<Double,Double> testStudentT(TStatistic context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1);
double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl);
StudentT studentT = new StudentT(df_num/df_denom,null);
double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t));
return new Pair<Double,Double>(t,p);
}
public static String runZ(ZStatistic context) {
Pair<Double,Double> stats = testZ(context);
double z = stats.first;
double p = stats.second;
return String.format("Z: %.2f\tP: %.2e\tQ: %d",z,p,pToQ(p));
}
public static Pair<Double,Double> testZ(ZStatistic context) {
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = context.getCaseControl();
if ( caseControlCounts == null || caseControlCounts.get(CaseControl.Cohort.CASE) == null || caseControlCounts.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue());
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
double z = (pCase-pControl)/se;
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
return new Pair<Double,Double>(z,p);
}
public static String runU(UStatistic context) {
// note: u statistic (U) is relatively useless for recalibrating outside of the context of m and n
// thus we report V = (U - (m*n+1)/2)/(n*m*(n+m+1)/12)
Pair<Double,Double> results = mannWhitneyUTest(context);
return String.format("V: %.2f\tP: %.2e\tQ: %d",results.first,results.second,pToQ(results.second));
}
public static Pair<Double,Double> mannWhitneyUTest(UStatistic context) {
@Deprecated // this is used for testing only at the moment
public static Pair<Double,Double> mannWhitneyUTest(ValueTest context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
@ -152,8 +62,4 @@ public class AssociationTestRunner {
}
return mwu.runTwoSidedTest();
}
public static String runFisherExact(AssociationContext context) {
return "Test not yet implemented";
}
}

View File

@ -13,7 +13,7 @@ import java.util.Collection;
* Time: 1:52 PM
* To change this template use File | Settings | File Templates.
*/
public class BaseQualityScore extends TStatistic {
public class BaseQualityScore extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> baseQuals = new ArrayList<Integer>(rbp.size());

View File

@ -10,7 +10,7 @@ import java.util.List;
/**
* @author chartl
*/
public class InsertSizeDistribution extends UStatistic {
public class InsertSizeDistribution extends ValueTest {
public boolean usePreviouslySeenReads() { return false; }

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
* Time: 1:29 PM
* To change this template use File | Settings | File Templates.
*/
public class MappingQuality0 extends ZStatistic {
public class MappingQuality0 extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int total = 0;

View File

@ -13,7 +13,7 @@ import java.util.Collection;
* Time: 1:43 PM
* To change this template use File | Settings | File Templates.
*/
public class MateMappingQuality extends UStatistic {
public class MateMappingQuality extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> mateMapQ = new ArrayList<Integer>(rbp.size());

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
* Time: 3:00 PM
* To change this template use File | Settings | File Templates.
*/
public class MateOtherContig extends ZStatistic {
public class MateOtherContig extends ProportionTest {
public boolean usePreviouslySeenReads() { return false; }

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
* Time: 1:47 PM
* To change this template use File | Settings | File Templates.
*/
public class MateSameStrand extends ZStatistic {
public class MateSameStrand extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int numPairs = 0;

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
* Time: 2:09 PM
* To change this template use File | Settings | File Templates.
*/
public class MateUnmapped extends ZStatistic {
public class MateUnmapped extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup pileup) {
int numMatedReads = 0;

View File

@ -17,7 +17,7 @@ import java.util.*;
* Time: 2:32 PM
* To change this template use File | Settings | File Templates.
*/
public class MismatchRate extends UStatistic {
public class MismatchRate extends ValueTest {
private Map<Sample,Object> sampleStats = new HashMap<Sample,Object>();
private int currentRefBase = 0;

View File

@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
* Time: 1:50 PM
* To change this template use File | Settings | File Templates.
*/
public class ProperPairs extends ZStatistic {
public class ProperPairs extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int numReads = 0;

View File

@ -0,0 +1,71 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import cern.jet.random.Normal;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizable;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 12:58:38 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class ProportionTest extends CaseControl<Pair<Number,Number>> implements Dichotomizable, StatisticalTest {
public static final Normal standardNormal = new Normal(0.0,1.0,null);
public abstract Pair<Number,Number> map(ReadBackedPileup rbp );
public Pair<Number,Number> add(Pair<Number,Number> left, Pair<Number,Number> right) {
return new Pair<Number,Number>(left.first.doubleValue()+right.first.doubleValue(),
left.second.doubleValue()+right.second.doubleValue());
}
public Pair<Collection<Number>,Collection<Number>> getDichotomizedData() {
Collection<Number> caseProps = new ArrayList<Number>();
Collection<Number> controlProps = new ArrayList<Number>();
for (Map<Sample,Pair<Number,Number>> sampleNumberMap : window ) {
for ( Map.Entry<Sample,Pair<Number,Number>> samplePairEntry : sampleNumberMap.entrySet() ) {
Pair<Number,Number> val = samplePairEntry.getValue();
if ( samplePairEntry.getKey().getProperty("cohort").equals("case")) {
caseProps.add(val.first.doubleValue()/val.second.doubleValue());
} else if ( samplePairEntry.getKey().getProperty("cohort").equals("control") ) {
controlProps.add(val.first.doubleValue()/val.second.doubleValue());
}
}
}
return new Pair<Collection<Number>,Collection<Number>>(caseProps,controlProps);
}
public Pair<Double,Double> getStatisticAndPValue() {
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = getCaseControl();
if ( caseControlCounts == null || caseControlCounts.get(CaseControl.Cohort.CASE) == null || caseControlCounts.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue());
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
double z = (pCase-pControl)/se;
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
return new Pair<Double,Double>(z,p);
}
public String getStatisticName() { return "Z"; }
}

View File

@ -15,7 +15,7 @@ import java.util.Collection;
* Time: 1:31 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadClipping extends TStatistic {
public class ReadClipping extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> clipping = new ArrayList<Integer>(rbp.size());

View File

@ -15,7 +15,7 @@ import java.util.Collection;
* Time: 1:54 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadIndels extends UStatistic {
public class ReadIndels extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> indelElements = new ArrayList<Integer>(rbp.size());

View File

@ -13,7 +13,7 @@ import java.util.Collection;
* Time: 1:41 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadMappingQuality extends TStatistic {
public class ReadMappingQuality extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> mapQuals = new ArrayList<Integer>(rbp.size());

View File

@ -15,7 +15,7 @@ import java.util.Map;
* Time: 1:17 PM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceMismatches extends ZStatistic {
public class ReferenceMismatches extends ProportionTest {
final static int[] BASE_INDEX = {0,1,2,3};

View File

@ -12,7 +12,7 @@ import java.util.*;
/**
* @author chartl
*/
public class SampleDepth extends UStatistic {
public class SampleDepth extends ValueTest {
public Map<Sample,Object> sampleStats = null;

View File

@ -1,36 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 12:58:16 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class TStatistic extends CaseControl<Collection<Number>> {
public abstract Collection<Number> map(ReadBackedPileup rbp );
public Collection<Number> add(Collection<Number> left, Collection<Number> right) {
if ( left instanceof ArrayList) {
((ArrayList) left).addAll(right);
return left;
} else if ( left instanceof Set ) {
((Set) left).addAll(right);
return left;
} else {
List<Number> newList = new ArrayList<Number>(left.size()+right.size());
newList.addAll(left);
newList.addAll(right);
return newList;
}
}
}

View File

@ -1,36 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/2/11
* Time: 1:53 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class UStatistic extends CaseControl<Collection<Number>> {
public abstract Collection<Number> map(ReadBackedPileup rbp );
public Collection<Number> add(Collection<Number> left, Collection<Number> right) {
if ( left instanceof ArrayList ) {
((ArrayList) left).addAll(right);
return left;
} else if ( left instanceof Set) {
((Set) left).addAll(right);
return left;
} else {
List<Number> newList = new ArrayList<Number>(left.size()+right.size());
newList.addAll(left);
newList.addAll(right);
return newList;
}
}
}

View File

@ -0,0 +1,103 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import cern.jet.random.StudentT;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizable;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/2/11
* Time: 1:53 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class ValueTest extends CaseControl<Collection<Number>> implements StatisticalTest, Dichotomizable {
public abstract Collection<Number> map(ReadBackedPileup rbp );
public boolean useTStatistic() { return false; } // default to using the U statistic
public Collection<Number> add(Collection<Number> left, Collection<Number> right) {
if ( left instanceof ArrayList ) {
((ArrayList) left).addAll(right);
return left;
} else if ( left instanceof Set) {
((Set) left).addAll(right);
return left;
} else {
List<Number> newList = new ArrayList<Number>(left.size()+right.size());
newList.addAll(left);
newList.addAll(right);
return newList;
}
}
public Pair<Collection<Number>,Collection<Number>> getDichotomizedData() {
Collection<Number> caseMeans = new ArrayList<Number>();
Collection<Number> controlMeans = new ArrayList<Number>();
for ( Map<Sample,Collection<Number>> sampleMap : window ) {
for ( Map.Entry<Sample,Collection<Number>> sampleEntry : sampleMap.entrySet() ) {
if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) {
caseMeans.add(MathUtils.average(sampleEntry.getValue()));
} else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) {
controlMeans.add(MathUtils.average(sampleEntry.getValue()));
}
}
}
return new Pair<Collection<Number>,Collection<Number>>(caseMeans,controlMeans);
}
public Pair<Double,Double> getUStatisticAndPValue() {
MannWhitneyU mwu = new MannWhitneyU();
for ( Map.Entry<Cohort,Collection<Number>> cohortEntry : getCaseControl().entrySet() ) {
if ( cohortEntry.getKey().equals(Cohort.CASE) ) {
for ( Number n : cohortEntry.getValue() ) {
mwu.add(n,MannWhitneyU.USet.SET1);
}
} else if ( cohortEntry.getKey().equals(Cohort.CONTROL)) {
for ( Number n : cohortEntry.getValue() ) {
mwu.add(n,MannWhitneyU.USet.SET2);
}
}
}
return mwu.runTwoSidedTest();
}
public Pair<Double,Double> getStatisticAndPValue() { return useTStatistic() ? getTStatisticAndPValue() : getUStatisticAndPValue(); }
public String getStatisticName() { return useTStatistic() ? "T" : "V"; }
public Pair<Double,Double> getTStatisticAndPValue() {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1);
double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl);
StudentT studentT = new StudentT(df_num/df_denom,null);
double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t));
return new Pair<Double,Double>(t,p);
}
}

View File

@ -1,21 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 12:58:38 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class ZStatistic extends CaseControl<Pair<Number,Number>> {
public abstract Pair<Number,Number> map(ReadBackedPileup rbp );
public Pair<Number,Number> add(Pair<Number,Number> left, Pair<Number,Number> right) {
return new Pair<Number,Number>(left.first.doubleValue()+right.first.doubleValue(),
left.second.doubleValue()+right.second.doubleValue());
}
}

View File

@ -0,0 +1,16 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 4/5/11
* Time: 11:35 AM
* To change this template use File | Settings | File Templates.
*/
public interface Dichotomizable {
public abstract Pair<Collection<Number>,Collection<Number>> getDichotomizedData();
}

View File

@ -0,0 +1,56 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 4/4/11
* Time: 7:18 PM
* To change this template use File | Settings | File Templates.
*/
public class Dichotomizer1D {
private Dichotomizer1D() { } // no instantiation
public abstract class Transform {
public abstract double transform(double n);
public Collection<Number> apply(Collection<Number> collection) {
ArrayList<Number> tData = new ArrayList<Number>(collection.size());
for ( Number n : collection ) {
tData.add(transform(n.doubleValue()));
}
return tData;
}
}
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo) {
double meanOne = MathUtils.sum(setOne)/setOne.size();
double meanTwo = MathUtils.sum(setTwo)/setTwo.size();
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne));
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo));
return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo);
}
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo, Transform transform) {
return simpleGaussianDichotomy(transform.apply(setOne),transform.apply(setTwo));
}
public static double simpleGaussianDichotomy(Dichotomizable dichotomizable) {
Pair<Collection<Number>,Collection<Number>> dichotomizedData = dichotomizable.getDichotomizedData();
return simpleGaussianDichotomy(dichotomizedData.first,dichotomizedData.second);
}
public static double simpleGaussianDichotomy(Dichotomizable dichotomizable, Transform transform) {
Pair<Collection<Number>,Collection<Number>> dichotomizedData = dichotomizable.getDichotomizedData();
return simpleGaussianDichotomy(dichotomizedData.first,dichotomizedData.second, transform);
}
}

View File

@ -0,0 +1,15 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.collections.Pair;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 4/5/11
* Time: 12:26 PM
* To change this template use File | Settings | File Templates.
*/
public interface StatisticalTest {
public abstract Pair<Double,Double> getStatisticAndPValue();
public abstract String getStatisticName();
}

View File

@ -40,6 +40,17 @@ public class MannWhitneyU {
}
}
public Pair<Long,Long> getR1R2() {
long u1 = calculateOneSidedU(observations,MannWhitneyU.USet.SET1);
long n1 = sizeSet1*(sizeSet1+1)/2;
long r1 = u1 + n1;
long n2 = sizeSet2*(sizeSet2+1)/2;
long u2 = n1*n2-u1;
long r2 = u2 + n2;
return new Pair<Long,Long>(r1,r2);
}
/**
* Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically
* dominates the other set

View File

@ -1,18 +1,15 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.UStatistic;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ZStatistic;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ProportionTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ValueTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationTestRunner;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.TStatistic;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import java.util.Arrays;
import java.util.Collection;
@ -36,20 +33,20 @@ public class RegionalAssociationUnitTest extends BaseTest {
TTest test1 = new TTest();
test1.setCaseData((Collection) Arrays.asList(1,1,2,3,4));
test1.setControlData((Collection) Arrays.asList(10, 10, 20, 30, 40));
Assert.assertEquals(AssociationTestRunner.testStudentT(test1).second,0.1702,1e-2);
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).second.first,0.1702,1e-2);
TTest test2 = new TTest();
test2.setCaseData((Collection) Arrays.asList(5, 6, 5, 2, 3, 8, 7, 12, 10, 6, 4, 2, 8, 7, 3));
test2.setControlData((Collection) Arrays.asList(1, 6, 7, 2, 3, 3, 4, 1, 2, 5, 7, 3, 10, 3, 3, 2, 3));
Assert.assertEquals(AssociationTestRunner.testStudentT(test2).second, 0.5805, 1e-2);
Assert.assertEquals(AssociationTestRunner.getTestValues(test2).second.first, 0.5805, 1e-2);
TTest test3 = new TTest();
test3.setCaseData((Collection) Arrays.asList(94,25,68,4,27,51,9,10,91,61,61,37,39,44,36,27,86,33,3,38,5,6,28,93,30,56,81,8,40,44));
test3.setControlData((Collection) Arrays.asList(6,64,96,85,20,74,93,18,31,20,88,38,80,50,33,81,35,8,2,69,49,6,26,74,79,63,63,96,45,18));
Assert.assertEquals(AssociationTestRunner.testStudentT(test3).second,0.8229,1e-4);
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).second.first,0.8229,1e-4);
TTest test4 = new TTest();
test4.setCaseData((Collection) Arrays.asList(14,8,8,17,8,12,10,10,13,9,13,9,9,12,12,11,16,12,13,16,10,13,11,16,13,16,11,13,9,16,16,14,9,14,17,10,15,15,9,15,17,15,17,12,10,13,11,14,8,14));
test4.setControlData((Collection) Arrays.asList(7,1,4,2,3,7,8,5,5,4,10,6,4,9,2,9,9,3,3,10,1,8,9,5,3,7,2,7,10,9,4,9,2,10,10,3,2,3,4,4,5,10,9,4,3,5,6,10,5,10));
Assert.assertEquals(AssociationTestRunner.testStudentT(test4).second,0.1006,1e-4);
Assert.assertEquals(AssociationTestRunner.testStudentT(test4).first,1.657989,1e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test4).second.first,0.1006,1e-4);
Assert.assertEquals(AssociationTestRunner.getTestValues(test4).first,1.657989,1e-6);
}
@Test
@ -58,17 +55,17 @@ public class RegionalAssociationUnitTest extends BaseTest {
ZTest test1 = new ZTest();
test1.setCaseData(new Pair<Number,Number>(100,500));
test1.setControlData(new Pair<Number,Number>(55,300));
Assert.assertEquals(AssociationTestRunner.testZ(test1).first,0.57742362050306,2e-6);
Assert.assertEquals(AssociationTestRunner.testZ(test1).second,0.56367,2e-5);
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).first,0.57742362050306,2e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).second.first,0.56367,2e-5);
ZTest test2 = new ZTest();
test1.setCaseData(new Pair<Number, Number>(1020, 1800));
test1.setControlData(new Pair<Number, Number>(680, 1670));
Assert.assertEquals(AssociationTestRunner.testZ(test1).first,9.3898178216531,2e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).first,9.3898178216531,2e-6);
ZTest test3 = new ZTest();
test3.setCaseData(new Pair<Number,Number>(20,60));
test3.setControlData(new Pair<Number,Number>(30,80));
Assert.assertEquals(AssociationTestRunner.testZ(test3).first,-0.50917511840392,2e-6);
Assert.assertEquals(AssociationTestRunner.testZ(test3).second,0.610643593,2e-4);
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).first,-0.50917511840392,2e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).second.first,0.610643593,2e-4);
}
@Test
@ -160,7 +157,7 @@ public class RegionalAssociationUnitTest extends BaseTest {
}
private class TTest extends TStatistic {
private class TTest extends ValueTest {
Map<Cohort,Collection<Number>> toTest = new HashMap<Cohort,Collection<Number>>(2);
@Override
@ -178,11 +175,14 @@ public class RegionalAssociationUnitTest extends BaseTest {
public Collection<Number> map(ReadBackedPileup rbp) { return null; }
public int getWindowSize() { return 1; }
public int slideByValue() { return 1; }
@Override
public boolean useTStatistic() { return true; }
public boolean usePreviouslySeenReads() { return false; }
}
private class ZTest extends ZStatistic {
private class ZTest extends ProportionTest {
Map<Cohort,Pair<Number,Number>> toTest = new HashMap<Cohort,Pair<Number,Number>>(2);
@Override
@ -204,7 +204,7 @@ public class RegionalAssociationUnitTest extends BaseTest {
public boolean usePreviouslySeenReads() { return true; }
}
private class UTest extends UStatistic {
private class UTest extends ValueTest {
TTest test = new TTest();
public boolean usePreviouslySeenReads() { return false; }
public int getWindowSize() { return 1; }