Removing the old association walker, switching test to just validate that MannWhitneyU is doing the right thing. Unit tests still pass.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5690 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-04-26 18:05:19 +00:00
parent b915520653
commit bc3fd70b0a
34 changed files with 91 additions and 2294 deletions

View File

@ -1,89 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/2/11
* Time: 11:58 AM
* To change this template use File | Settings | File Templates.
*/
public abstract class AssociationContext<X,Y> {
protected List<Map<Sample,Y>> window;
private int size;
private int slide;
protected double zVal;
public AssociationContext() {
}
public AssociationContext(final RegionalAssociationWalker parent) {
this.init(parent);
}
// specifies whether to use previously seen reads
public abstract boolean usePreviouslySeenReads();
// specifies the map from a sample's pileup to the data we want to test
public abstract Object map(ReadBackedPileup rbp);
// specifies how to take the per-sample data and reduce them into testable pairs
public abstract Map<?,X> reduce(List<Map<Sample,Y>> win);
// do we filter the current location (e.g. omit from window)
public boolean filter(MapExtender m) { return true; }
// a basic initialization of the context (give the walker for access to object?)
public void init(RegionalAssociationWalker walker) {
size = walker.windowSize;
slide = walker.slideBy;
window = new ArrayList<Map<Sample,Y>>(size);
zVal = walker.zVal;
}
public Map<Sample,Object> mapLocus(MapExtender extender) {
Map<Sample,ReadBackedPileup> pileups;
if ( ! usePreviouslySeenReads() ) {
pileups = extender.getReadFilteredPileup();
} else {
pileups = extender.getFullPileup();
}
Map<Sample,Object> maps = new HashMap<Sample,Object>(pileups.size());
for ( Map.Entry<Sample,ReadBackedPileup> samPileup : pileups.entrySet() ) {
maps.put(samPileup.getKey(),map(samPileup.getValue()));
}
return maps;
}
public void addData(Map<Sample,Y> sampleData) {
window.add(sampleData);
}
public void flush() {
window = new ArrayList<Map<Sample,Y>>(size);
}
public boolean isFull() {
return window.size() >= size;
}
public void slide() {
ArrayList<Map<Sample,Y>> newWindow = new ArrayList<Map<Sample,Y>>((window.subList(slide,window.size())));
newWindow.ensureCapacity(size);
window = newWindow;
}
public int getWindowSize() {
return window.size();
}
public boolean testOnFlush() { return true; }
public boolean canFlush() { return true; }
}

View File

@ -1,93 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import java.util.Collection;
import java.util.Map;
import cern.jet.random.Normal;
import cern.jet.random.StudentT;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.*;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizable;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizer1D;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 11:48:26 AM
* To change this template use File | Settings | File Templates.
*/
public class AssociationTestRunner {
final static int MAX_Q_VALUE = Integer.MAX_VALUE;
// todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance
final static Dichotomizer1D.Transform LOG_TRANSFORM = (new Dichotomizer1D()).new Transform() {
private double EPSILON = 1e-20;
@Override
public double transform(double d) {
return Math.log(d+EPSILON);
}
};
final static Dichotomizer1D.Transform STD_ASIN = (new Dichotomizer1D()).new Transform() {
@Override
public double transform(double d) {
return Math.asin(Math.sqrt(d));
}
};
public static int pToQ(double p) {
return Math.min((int) Math.floor(QualityUtils.phredScaleErrorRate(p)),MAX_Q_VALUE);
}
public static Pair<Pair<Double,Double>,Pair<Double,Integer>> getTestValues(AssociationContext context) {
if ( context instanceof StatisticalTest ) {
Pair<Double,Double> statAndP = ((StatisticalTest) context).getStatisticAndPValue();
Double delta = ((StatisticalTest) context).getZConfDelta();
return new Pair<Pair<Double,Double>,Pair<Double,Integer>>(
new Pair<Double,Double>(statAndP.first,delta),
new Pair<Double,Integer>(statAndP.second,pToQ(statAndP.second)));
}
return null;
}
public static String runTests(AssociationContext context) {
if ( context instanceof StatisticalTest ) {
Pair<Pair<Double,Double>,Pair<Double,Integer>> results = getTestValues(context);
return String.format("%s: %.2f\tD: %.2f\tP: %.2e\tQ: %d",
((StatisticalTest) context).getStatisticName() ,
results.first.first,results.first.second,results.second.first,results.second.second);
}
return null;
}
@Deprecated // this is used for testing only at the moment
public static Pair<Double,Double> mannWhitneyUTest(ValueTest context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
MannWhitneyU mwu = new MannWhitneyU();
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) {
mwu.add(n, MannWhitneyU.USet.SET1);
}
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CONTROL) ) {
mwu.add(n,MannWhitneyU.USet.SET2);
}
return mwu.runTwoSidedTest();
}
public static Pair<Double,Double> getDichotomizedValues(AssociationContext context) {
if ( context instanceof Dichotomizable ) {
double raw = Dichotomizer1D.simpleGaussianDichotomy(((Dichotomizable)context));
double log = Dichotomizer1D.simpleGaussianDichotomy(((Dichotomizable)context),LOG_TRANSFORM);
return new Pair<Double,Double>(raw,log);
}
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
}

View File

@ -1,96 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import java.util.*;
/**
* @Author chartl
* @Date 2011-02-23
* Holds multiple map contexts for use in the regional association walker
*/
public class MapExtender {
static AlignmentContextUtils.ReadOrientation TYPE = AlignmentContextUtils.ReadOrientation.COMPLETE;
// hold on to these -- atoms may want access to the tracker or other context types
private MapHolder previous = null;
private MapHolder current = null;
private Map<Sample,ReadBackedPileup> fullPileup = null;
private Map<Sample,ReadBackedPileup> readFilteredPileup = null;
private Map<String,Sample> idToSampleObject = null;
public MapExtender(Set<Sample> samples) {
idToSampleObject = new HashMap<String,Sample>(samples.size());
for ( Sample s : samples ) {
idToSampleObject.put(s.getId(),s);
}
}
public void set(MapHolder holder) {
previous = current;
current = holder;
Map<Sample,ReadBackedPileup> prevPileup = fullPileup;
fullPileup = new HashMap<Sample,ReadBackedPileup>();
readFilteredPileup = new HashMap<Sample,ReadBackedPileup>();
if ( current != null ) {
for ( Map.Entry<Sample,AlignmentContext> sac : current.getContext(idToSampleObject).entrySet() ) {
AlignmentContext context = AlignmentContextUtils.stratify(sac.getValue(), TYPE);
if ( context.hasBasePileup() ) {
fullPileup.put(sac.getKey(),context.getBasePileup());
} else if ( context.hasExtendedEventPileup() ) {
fullPileup.put(sac.getKey(),context.getExtendedEventPileup());
}
if ( prevPileup != null ) {
List<PileupElement> filtElems = new ArrayList<PileupElement>(fullPileup.get(sac.getKey()).size());
Set<SAMRecord> seenReads = prevPileup.containsKey(sac.getKey()) ? new HashSet<SAMRecord>(prevPileup.get(sac.getKey()).getReads()) : new HashSet<SAMRecord>(0);
for ( PileupElement e : fullPileup.get(sac.getKey()) ) {
if ( ! seenReads.contains(e.getRead()) ) {
filtElems.add(e);
}
}
readFilteredPileup.put(sac.getKey(),new ReadBackedPileupImpl(current.getRef().getLocus(),filtElems));
} else {
readFilteredPileup = fullPileup;
}
}
}
}
public Map<Sample,ReadBackedPileup> getFullPileup() { return fullPileup; }
public Map<Sample,ReadBackedPileup> getReadFilteredPileup(){ return readFilteredPileup; }
public Map<Sample,AlignmentContext> getPreviousContext() {
return previous != null ? previous.getContext(idToSampleObject) : null;
}
public ReferenceContext getPreviousRef() {
return previous != null ? previous.getRef() : null;
}
public RefMetaDataTracker getPreviousTracker() {
return previous != null ? previous.getTracker() : null;
}
public Map<Sample,AlignmentContext> getContext() {
return current != null ? current.getContext(idToSampleObject) : null;
}
public ReferenceContext getReferenceContext() {
return current != null ? current.getRef() : null;
}
public RefMetaDataTracker getTracker() {
return current != null ? current.getTracker() : null;
}
}

View File

@ -1,38 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import java.util.HashMap;
import java.util.Map;
public class MapHolder {
private RefMetaDataTracker tracker;
private ReferenceContext ref;
private Map<String, AlignmentContext> alignments;
public MapHolder(RefMetaDataTracker t, ReferenceContext r, AlignmentContext a) {
tracker = t;
ref = r;
alignments = AlignmentContextUtils.splitContextBySampleName(a);
}
public Map<Sample, AlignmentContext> getContext(Map<String,Sample> stringSampleMap) {
Map<Sample,AlignmentContext> mappedContexts = new HashMap<Sample,AlignmentContext>(alignments.size());
for ( Map.Entry<String,AlignmentContext> entry : alignments.entrySet() ) {
mappedContexts.put(stringSampleMap.get(entry.getKey()),entry.getValue());
}
return mappedContexts;
}
public ReferenceContext getRef() {
return ref;
}
public RefMetaDataTracker getTracker() {
return tracker;
}
}

View File

@ -1,128 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationContext;
import org.broadinstitute.sting.oneoffprojects.walkers.association.MapHolder;
import org.broadinstitute.sting.oneoffprojects.walkers.association.RegionalAssociationHandler;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 4/12/11
* Time: 5:21 PM
* To change this template use File | Settings | File Templates.
*/
public class RAWSampleStats extends LocusWalker<MapHolder,RegionalAssociationHandler> {
@Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false)
public String[] associationsToUse = null;
@Argument(doc="Set the window size for associations to this value",shortName="w",fullName="window",required=false)
public int windowSize = 50;
@Argument(doc="Set the window sliding value for associations to this value",shortName="s",fullName="slide",required=false)
public int slideBy = 10;
@Output
PrintStream out;
public RegionalAssociationHandler reduceInit() {
return new RegionalAssociationHandler(getAssociations(),getToolkit().getSAMFileSamples(),false);
}
public MapHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return new MapHolder(tracker,ref,context);
}
public RegionalAssociationHandler reduce(MapHolder holder, RegionalAssociationHandler handler) {
handler.updateExtender(holder);
StringBuffer contextBuf = new StringBuffer();
contextBuf.append(holder.getRef().getLocus().toString());
for ( AssociationContext context : handler.getAssociations() ) {
contextBuf.append("\t");
contextBuf.append(context.getClass().getSimpleName());
StringBuffer caseBuf = new StringBuffer();
StringBuffer controlBuf = new StringBuffer();
boolean first = true;
for ( Map.Entry<Sample,AlignmentContext> entry : handler.getExtender().getContext().entrySet() ) {
StringBuffer toAppend = entry.getKey().getProperty("cohort").equals("case") ? caseBuf : controlBuf;
toAppend.append(entry.getKey().getId());
toAppend.append("=");
toAppend.append(handleMap(context.map(entry.getValue().getBasePileup())));
toAppend.append(";");
}
caseBuf.deleteCharAt(caseBuf.length()-1);
controlBuf.deleteCharAt(controlBuf.length()-1);
contextBuf.append("\t");
contextBuf.append("Case:\t");
contextBuf.append(caseBuf);
contextBuf.append("\tControl:\t");
contextBuf.append(controlBuf);
}
out.printf("%s%n",contextBuf.toString());
return handler;
}
public String handleMap(Object o) {
if ( o instanceof Pair) {
Pair<Number,Number> k = (Pair<Number,Number>) o;
return String.format("%.2f",k.first.doubleValue()/k.second.doubleValue());
}
if ( o instanceof Collection ) {
Collection<Number> k = (Collection<Number>) o;
return String.format("%.2f", MathUtils.average(k,true));
}
return "NA";
}
public Set<AssociationContext> getAssociations() {
List<Class<? extends AssociationContext>> contexts = new PluginManager<AssociationContext>(AssociationContext.class).getPlugins();
if ( associationsToUse.length > 0 && associationsToUse[0].equals("ALL") ) {
HashSet<AssociationContext> allAssoc = new HashSet<AssociationContext>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
AssociationContext context;
try {
context = clazz.getConstructor(new Class[] {}).newInstance(new Object[] {});
} catch (Exception e ) {
throw new StingException("The class "+clazz.getSimpleName()+" could not be instantiated",e);
}
allAssoc.add(context);
}
return allAssoc;
}
Map<String,Class<? extends AssociationContext>> classNameToClass = new HashMap<String,Class<? extends AssociationContext>>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
classNameToClass.put(clazz.getSimpleName(),clazz);
}
Set<AssociationContext> validAssociations = new HashSet<AssociationContext>();
for ( String s : associationsToUse ) {
AssociationContext context;
try {
context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[] {});
} catch ( Exception e ) {
throw new StingException("The class "+s+" could not be instantiated.",e);
}
validAssociations.add(context);
}
return validAssociations;
}
}

View File

@ -1,108 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.*;
/**
* @Author chartl
* @Date 2011-02-23
* A general context for windowed association
*/
public class RegionalAssociationHandler {
private MapExtender maps;
// todo -- the correct way to do this is via the PluginManager (a la VariantEval) but this is a QND implementation
private Set<AssociationContext> associations;
private boolean bedGraphFormat;
public RegionalAssociationHandler(Set<AssociationContext> contexts, Set<Sample> samples, boolean bedGraph) {
maps = new MapExtender(samples);
associations = contexts;
bedGraphFormat = bedGraph;
}
public void updateExtender(MapHolder mapHolder) {
maps.set(mapHolder);
}
/**
* Once the instances are collected; run map-reduce
* @map: MapExtender --> A child of AssociationContextAtom
* @implementation: Indirect construction via newinstance
* @reduce: (List<AssociationContext>,AssociationContext) --> List<AssociationContextAtom>
* @implementation: just append
*/
public Map<AssociationContext,String> runMapReduce() {
Map<AssociationContext,String> testResults = new HashMap<AssociationContext,String>();
if ( maps.getPreviousRef() != null && maps.getPreviousRef().getLocus().compareTo(getLocation()) != -1 ) {
testResults.putAll(flush());
}
for ( AssociationContext w : associations ) {
if ( w.filter(maps) ) {
w.addData(w.mapLocus(maps));
}
if ( w.isFull() ) {
testResults.put(w,runTest(w));
w.slide();
}
}
return testResults;
}
/**
* Switches what formatting to use based on wiggle or standard
* implicit param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q)
* @return - test results in proper format
*/
public String runTest(AssociationContext context) {
// todo -- maybe the tdf should be the whole window rather than just the most recent loc?
String outVal;
if ( bedGraphFormat ) {
Pair<Pair<Double,Double>,Pair<Double,Integer>> statVals = AssociationTestRunner.getTestValues(context);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
outVal = String.format("%.2f\t%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first.first,statVals.first.second,
statVals.second.first,statVals.second.second,simpleDichotVals.first,simpleDichotVals.second);
} else {
outVal = AssociationTestRunner.runTests(context);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
outVal += String.format("\tDi: %.2f\tLogDi: %.2f",simpleDichotVals.first,simpleDichotVals.second);
}
return String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(),
maps.getReferenceContext().getLocus().getStart()-context.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal);
}
public GenomeLoc getLocation() {
return maps.getReferenceContext().getLocus();
}
/**
* Flushes context on a jump between intervals. Can not return null.
* @return on-flush tests
*/
public Map<AssociationContext,String> flush() {
Map<AssociationContext,String> flushTests = new HashMap<AssociationContext,String>();
for ( AssociationContext ac : associations ) {
if ( ac.canFlush() ) {
if ( ac.testOnFlush() ) {
flushTests.put(ac,runTest(ac));
}
ac.flush();
}
}
return flushTests;
}
public Set<AssociationContext> getAssociations() {
return associations;
}
public MapExtender getExtender() {
return maps;
}
}

View File

@ -1,56 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.broadinstitute.sting.gatk.walkers.Multiplexer;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 12:09 PM
* To change this template use File | Settings | File Templates.
*/
public class RegionalAssociationMultiplexer implements Multiplexer<Class<? extends AssociationContext>> {
Set<Class<? extends AssociationContext>> contexts = null;
boolean bedGraphOut = false;
public RegionalAssociationMultiplexer(String[] toUse, boolean wiggleOutput) {
super();
contexts = getAssociations(toUse);
bedGraphOut = wiggleOutput;
}
public Collection<Class<? extends AssociationContext>> multiplex() {
return contexts;
}
public String transformArgument(final Class<? extends AssociationContext> context, String arg) {
if ( bedGraphOut ) {
return String.format("%s.%s.bedgraph",arg,context.getSimpleName());
}
return String.format("%s.%s.tdf", arg, context.getSimpleName());
}
private Set<Class<? extends AssociationContext>> getAssociations(String[] associationsToUse) {
List<Class<? extends AssociationContext>> contexts = new PluginManager<AssociationContext>(AssociationContext.class).getPlugins();
if ( associationsToUse.length > 0 && associationsToUse[0].equals("ALL") ) {
return new HashSet<Class<? extends AssociationContext>>((Collection)contexts);
}
Map<String,Class<? extends AssociationContext>> classNameToClass = new HashMap<String,Class<? extends AssociationContext>>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
classNameToClass.put(clazz.getSimpleName(),clazz);
}
Set<Class<? extends AssociationContext>> validAssociations = new HashSet<Class<? extends AssociationContext>>();
for ( String s : associationsToUse ) {
validAssociations.add(classNameToClass.get(s));
}
return validAssociations;
}
}

View File

@ -1,270 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import cern.colt.function.DoubleDoubleFunction;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.impl.DenseDoubleMatrix1D;
import cern.colt.matrix.impl.DenseDoubleMatrix2D;
import cern.colt.matrix.linalg.Algebra;
import cern.colt.matrix.linalg.EigenvalueDecomposition;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.table.BedTableCodec;
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/25/11
* Time: 11:05 AM
* To change this template use File | Settings | File Templates.
*/
public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociationRecalibrator.LocHashingPair,Set<RegionalAssociationRecalibrator.LocHashingPair>> implements TreeReducible<Set<RegionalAssociationRecalibrator.LocHashingPair>> {
@Argument(shortName="w",fullName="whiteningFile",doc="File containing the mean, max vectors and covariance matrix",required=false)
File whitenFile = null;
@Argument(shortName="p",fullName="normParameter",doc="Exponent for vector norm; p > 4 will induce a p=infinity approximation",required=false)
double pNorm = 1.0;
@Output
public PrintStream out;
private DataWhitener whitener;
private double[] dataHolder;
private int boundTables = 0;
public void initialize() {
for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) {
logger.debug(source.getType().getSimpleName());
if ( source.getType().equals(BedTableCodec.class)) {
++boundTables;
if ( ! source.getFile().getName().endsWith(".bedgraph") ) {
throw new UserException("Regional association requires bedgraph files. The file "+source.getFile().getAbsolutePath()+" does not have the proper extension.");
}
}
}
if ( whitenFile != null ) {
whitener = getWhitenerFromFile(whitenFile);
}
}
public Set<LocHashingPair> reduceInit() { return new TreeSet<LocHashingPair>(); }
public LocHashingPair map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext ali ) {
if ( tracker == null ) { return null; }
ArrayList<TableFeature> features = new ArrayList<TableFeature>(boundTables);
for ( GATKFeature feature : tracker.getAllRods()) {
Object uo = feature.getUnderlyingObject();
if ( uo instanceof TableFeature && feature.getLocation().getStart() == ref.getLocus().getStart()) {
features.add((TableFeature)uo);
}
}
if ( features.size() == 0 ) { return null; }
if ( features.size() != boundTables ) {
throw new UserException("Features do not line up at position "+ref.getLocus().toString());
}
if ( dataHolder == null ) {
dataHolder = new double[features.size()];
}
int idx = 0;
for ( TableFeature f : features ) {
dataHolder[idx] = Double.parseDouble(f.getValue(3));
idx++;
}
double normVal;
if ( whitener != null ) {
normVal = calculateNorm(whitener.whiten(dataHolder),pNorm);
} else {
normVal = calculateNorm(dataHolder,pNorm);
}
return new LocHashingPair(features.get(0).getLocation(),normVal);
}
public Set<LocHashingPair> reduce(LocHashingPair map, Set<LocHashingPair> red) {
if ( map != null ) { red.add(map); }
return red;
}
public Set<LocHashingPair> treeReduce(Set<LocHashingPair> left, Set<LocHashingPair> right) {
if ( left == null ) {
return right;
} else if ( right == null ) {
return left;
} else {
left.addAll(right);
return left;
}
}
public void onTraversalDone(Set<LocHashingPair> normByLoc) {
double[] normRanks = new double[(normByLoc.size())];
int offset = 0;
for ( LocHashingPair lhp : normByLoc ) {
normRanks[offset]=(lhp.second);
offset++;
}
Arrays.sort(normRanks);
for ( LocHashingPair lhp : normByLoc ) {
int rank = Arrays.binarySearch(normRanks,lhp.second);
// note - equal values will always be assigned the same rank -- however
// it is proper, no need for random assignment.
double prob = (1.0 + normRanks.length - rank)/(1.0+normRanks.length);
int qual = AssociationTestRunner.pToQ(prob);
out.printf("%s\t%d\t%d\t%d\t%.2e\t%d\t%.2e%n",lhp.getFirst().getContig(),lhp.getFirst().getStart(),lhp.getFirst().getStop(),
qual,prob,rank,lhp.getSecond());
}
}
private DataWhitener getWhitenerFromFile(File file){
XReadLines lineReader;
try {
lineReader = new XReadLines(file);
} catch (FileNotFoundException e) {
throw new UserException("The provided whiten file could not be found",e);
}
// first line is the mean vector
String[] meanLine = lineReader.next().split("\t");
if ( meanLine.length != boundTables ) {
String msg = String.format("The number of bound bedgraphs does not match the size of the mean value in the whitening file. bound: %d mean : %d",boundTables,meanLine.length);
throw new UserException(msg);
}
double[] mean = new double[meanLine.length];
for ( int i = 0 ; i < meanLine.length; i ++ ) {
mean[i] = Double.parseDouble(meanLine[i]);
}
// skip
lineReader.next();
// now the max
String[] maxLine = lineReader.next().split("\t");
double[] max = new double[maxLine.length];
for ( int i = 0; i < maxLine.length ; i++ ) {
max[i] = Double.parseDouble(maxLine[i]);
}
// skip
lineReader.next();
// now the covariance matrix
double[][] cov = new double[max.length][max.length];
for ( int i = 0; i < max.length; i ++ ) {
String[] row = lineReader.next().split("\t");
for ( int j = 0; j < max.length; j ++ ) {
cov[i][j] = Double.parseDouble(row[j]);
}
}
DataWhitener dataWhitener = new DataWhitener();
dataWhitener.setCov(cov);
dataWhitener.setMaxs(max);
dataWhitener.setMeans(mean);
return dataWhitener;
}
private static double calculateNorm(double[] vec, double p) {
double sumExp = 0.0;
for ( double v : vec ) {
sumExp += Math.pow(Math.abs(v),p);
}
return Math.pow(sumExp,1.0/p);
}
class DataWhitener {
private final Algebra ALGEBRA = new Algebra();
private final DoubleDoubleFunction SUBTRACT = new DoubleDoubleFunction() {
// note: want vectors that have NaN to have norm calculated within the
// n-k dimensional subspace. This is equivalent to setting the NaNs to
// a post-whitening value of zero.
@Override
public double apply(double v, double v1) {
if ( Double.compare(v,Double.NaN) == 0 || Double.compare(v1,Double.NaN) == 0 ) {
return 0.0;
}
if ( Double.compare(v,Double.POSITIVE_INFINITY) == 0 ) { return 50.0; } // todo -- fixme
if ( Double.compare(v,Double.NEGATIVE_INFINITY) == 0 ) { return -50.0; } // todo -- fixme, really should be an exception
return v - v1;
}
};
private DenseDoubleMatrix1D means;
private DenseDoubleMatrix1D maxAbs;
private DoubleMatrix2D transform;
public DataWhitener() {
}
public void setMeans(double[] inMeans) {
means = new DenseDoubleMatrix1D(inMeans);
}
public void setMaxs(double[] maxs) {
maxAbs = new DenseDoubleMatrix1D(maxs);
}
public void setCov(double[][] cov) {
EigenvalueDecomposition decomp = new EigenvalueDecomposition(new DenseDoubleMatrix2D(cov));
DoubleMatrix2D diag = decomp.getD();
for ( int i = 0; i < diag.rows(); i ++ ) {
// do not artificially inflate signal that is not there
if ( diag.get(i,i) < 1.0 ) {
diag.set(i,i, 1.0);
} else {
diag.set(i,i, Math.pow(diag.get(i,i),-0.5));
}
}
transform = ALGEBRA.mult(diag,ALGEBRA.transpose(decomp.getV()));
logger.debug("TRANSFORM:");
logger.debug(transform);
}
public double[] whiten(double[] value) {
DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
}
}
class LocHashingPair extends Pair<GenomeLoc,Double> implements Comparable {
public LocHashingPair(GenomeLoc g, Double d) {
super(g,d);
}
public int hashCode() { return first.hashCode(); }
public int compareTo(Object o) {
if ( o instanceof LocHashingPair ) {
return this.first.compareTo(((LocHashingPair) o).first);
} else {
return Integer.MIN_VALUE;
}
}
}
}

View File

@ -1,156 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Downsample;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Multiplex;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.wiggle.WiggleWriter;
import java.io.PrintStream;
import java.lang.reflect.Modifier;
import java.util.*;
/**
* @Author chartl
* @Date 2011-02-23
* Generalized framework for regional (windowed) associations
* -- todos --
* : todo --
* Cap statistics output (sometimes see Infinity or -Infinity) [fixed cap, or by k*max_seen_so_far]
*/
@Downsample(by= DownsampleType.NONE)
public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAssociationHandler> implements TreeReducible<RegionalAssociationHandler> {
@Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false)
public String[] associationsToUse = null;
@Argument(doc="Change output file to bedgraph format (s p q, not STAT: s P: p Q: q",shortName="bg",fullName="bedgraph",required=false)
public boolean bedGraph = false;
@Argument(doc="Set the window size for associations to this value",shortName="w",fullName="window",required=false)
public int windowSize = 50;
@Argument(doc="Set the window sliding value for associations to this value",shortName="s",fullName="slide",required=false)
public int slideBy = 10;
@Argument(doc="Set the exercise-wide constant Z-value for delta-measure",shortName="z",fullName="zValue",required=false)
public double zVal = 6.0;
// for now apply this to t-tests too -- though df means the percentile is not constant, most
// dfs are large, so it doesn't really vary all that much
@Output
@Multiplex(value=RegionalAssociationMultiplexer.class,arguments={"associationsToUse","bedGraph"})
Map<AssociationContext,PrintStream> out;
public void initialize() {
if ( windowSize < 1 ) { throw new UserException("Window size cannot be less than one."); }
for ( Sample s : getSamples() ) {
if ( s.getProperty("cohort") == null ) {
throw new UserException("Sample "+s.getId()+" does not have a cohort property associated with it. "+
"Please ensure that metadata is bound with -SM and that sample "+s.getId()+" has the cohort property assigned.");
}
}
Set<AssociationContext> validAssociations = getAssociations();
if ( bedGraph ) {
writeBedGraphHeaders(validAssociations);
}
}
public RegionalAssociationHandler reduceInit() {
Set<AssociationContext> validAssociations = getAssociations();
RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples(),bedGraph);
return wac;
}
public MapHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return new MapHolder(tracker,ref,context);
}
public RegionalAssociationHandler reduce(MapHolder map, RegionalAssociationHandler rac) {
rac.updateExtender(map);
Map<AssociationContext,String> testResults;
try {
testResults = rac.runMapReduce();
} catch (Exception e) {
throw new StingException("Error in map reduce",e);
}
if ( testResults.size() > 0 ) {
for ( Map.Entry<AssociationContext,String> result : testResults.entrySet() ) {
out.get(result.getKey().getClass()).printf("%s%n",result.getValue());
}
}
return rac;
}
public Set<AssociationContext> getAssociations() {
List<Class<? extends AssociationContext>> contexts = new PluginManager<AssociationContext>(AssociationContext.class).getPlugins();
if ( associationsToUse.length > 0 && associationsToUse[0].equals("ALL") ) {
HashSet<AssociationContext> allAssoc = new HashSet<AssociationContext>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
AssociationContext context;
try {
context = clazz.getConstructor(new Class[] {}).newInstance(new Object[] {});
context.init(this);
} catch (Exception e ) {
throw new StingException("The class "+clazz.getSimpleName()+" could not be instantiated",e);
}
allAssoc.add(context);
}
return allAssoc;
}
Map<String,Class<? extends AssociationContext>> classNameToClass = new HashMap<String,Class<? extends AssociationContext>>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
classNameToClass.put(clazz.getSimpleName(),clazz);
}
Set<AssociationContext> validAssociations = new HashSet<AssociationContext>();
for ( String s : associationsToUse ) {
AssociationContext context;
try {
context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[] {});
context.init(this);
} catch ( Exception e ) {
throw new StingException("The class "+s+" could not be instantiated.",e);
}
validAssociations.add(context);
}
return validAssociations;
}
public RegionalAssociationHandler treeReduce(RegionalAssociationHandler left, RegionalAssociationHandler right) {
// for now be dumb; in future fix the fact that left-most intervals of a 16kb shard won't see the context from
// the right-most locus of the previous shard
return right;
}
public void onTraversalDone(RegionalAssociationHandler rac) {
// do nothing
}
public Set<Sample> getSamples() {
return getToolkit().getSAMFileSamples();
}
public void writeBedGraphHeaders(Set<AssociationContext> cons) {
for ( AssociationContext con : cons ) {
String header = String.format("track type=bedGraph name=%s description=stat,p,q,dichot,logdichot",con.getClass().getSimpleName());
out.get(con.getClass()).printf("%s%n",header);
}
}
}

View File

@ -1,29 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:52 PM
* To change this template use File | Settings | File Templates.
*/
public class BaseQualityScore extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> baseQuals = new ArrayList<Integer>(rbp.size());
for (PileupElement e : rbp ) {
baseQuals.add((int)e.getQual());
}
return (Collection) baseQuals;
}
public boolean usePreviouslySeenReads() { return true; }
public boolean useTStatistic() { return true; }
}

View File

@ -1,62 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationContext;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* @author chartl
*/
public abstract class CaseControl<X> extends AssociationContext<X,X> {
public Map<Cohort,X> getCaseControl() {
return reduce(window);
}
public Map<Cohort,X> reduce(List<Map<Sample,X>> window) {
X accumCase = null;
X accumControl = null;
for ( Map<Sample,X> sampleXMap : window ) {
for ( Map.Entry<Sample,X> entry : sampleXMap.entrySet() ) {
boolean isCase;
try {
isCase = entry.getKey().getProperty("cohort").equals("case");
} catch (NullPointerException e) {
throw new UserException("Sample "+entry.getKey().getId()+" does not have a cohort assigned");
}
if ( entry.getKey().getProperty("cohort").equals("case") ) {
accumCase = accum(accumCase, entry.getValue());
} else if ( entry.getKey().getProperty("cohort").equals("control") ) {
accumControl = accum(accumControl,entry.getValue());
}
}
}
Map<Cohort,X> cohortMap = new HashMap<Cohort,X>(2);
cohortMap.put(Cohort.CASE,accumCase);
cohortMap.put(Cohort.CONTROL,accumControl);
return cohortMap;
}
protected X accum(X left, X right) {
if ( left == null ) {
return right;
}
if ( right == null ) {
return left;
}
return add(left,right);
}
public abstract X map(ReadBackedPileup rbp);
public abstract X add(X left, X right);
public enum Cohort { CASE,CONTROL }
}

View File

@ -1,14 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 12:58:56 PM
* To change this template use File | Settings | File Templates.
*/
public interface FisherExact {
public abstract int[][] getCounts();
public abstract String[] getRowNames();
public abstract String[] getColNames();
}

View File

@ -1,29 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* @author chartl
*/
public class InsertSizeDistribution extends ValueTest {
private final int MAPQ_THRESHOLD = 5;
public boolean usePreviouslySeenReads() { return false; }
public Collection<Number> map(ReadBackedPileup pileup) {
List<Integer> insertSizes = new ArrayList<Integer>(pileup.size());
for ( PileupElement e : pileup ) {
if ( e.getMappingQual() >= MAPQ_THRESHOLD ) {
insertSizes.add(Math.abs(e.getRead().getInferredInsertSize()));
}
}
return (Collection) insertSizes;
}
}

View File

@ -1,30 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:29 PM
* To change this template use File | Settings | File Templates.
*/
public class MappingQuality0 extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int total = 0;
int mq0 = 0;
for ( PileupElement e : rbp ) {
++total;
if (e.getMappingQual() == 0)
++mq0;
}
return new Pair<Number,Number>(mq0,total);
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,30 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:43 PM
* To change this template use File | Settings | File Templates.
*/
public class MateMappingQuality extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> mateMapQ = new ArrayList<Integer>(rbp.size());
for ( PileupElement e : rbp ) {
if ( e.getRead().getReadPairedFlag() && e.getRead().getAttribute("MQ") != null) {
mateMapQ.add((Integer) e.getRead().getAttribute("MQ"));
}
}
return (Collection) mateMapQ;
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,35 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/1/11
* Time: 3:00 PM
* To change this template use File | Settings | File Templates.
*/
public class MateOtherContig extends ProportionTest {
private final int MAPQ_THRESHOLD = 5;
public boolean usePreviouslySeenReads() { return false; }
public Pair<Number,Number> map(ReadBackedPileup pileup) {
int tot = 0;
int otherCon = 0;
for ( PileupElement e : pileup ) {
if ( e.getRead().getReadPairedFlag() && e.getRead().getMappingQuality() >= MAPQ_THRESHOLD ) {
++tot;
if ( ! e.getRead().getMateReferenceIndex().equals(e.getRead().getReferenceIndex()) ) {
++otherCon;
}
}
}
return new Pair<Number,Number>(otherCon,tot);
}
}

View File

@ -1,33 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:47 PM
* To change this template use File | Settings | File Templates.
*/
public class MateSameStrand extends ProportionTest {
private final int MAPQ_THRESHOLD = 5;
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int numPairs = 0;
int mateSameStrand = 0;
for (PileupElement e : rbp ) {
if ( e.getRead().getReadPairedFlag() && e.getMappingQual() >= MAPQ_THRESHOLD) {
++numPairs;
if ( e.getRead().getMateNegativeStrandFlag() == e.getRead().getReadNegativeStrandFlag() ) {
++mateSameStrand;
}
}
}
return new Pair<Number,Number>(mateSameStrand,numPairs);
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,33 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/2/11
* Time: 2:09 PM
* To change this template use File | Settings | File Templates.
*/
public class MateUnmapped extends ProportionTest {
private final int MAPQ_THRESHOLD = 5;
public Pair<Number,Number> map(ReadBackedPileup pileup) {
int numMatedReads = 0;
int numPairUnmapped = 0;
for (PileupElement e : pileup ) {
if (e.getRead().getReadPairedFlag() && e.getMappingQual() >= MAPQ_THRESHOLD ) {
++numMatedReads;
if ( e.getRead().getMateUnmappedFlag() ) {
++numPairUnmapped;
}
}
}
return new Pair<Number,Number>(numPairUnmapped,numMatedReads);
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,90 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.MapExtender;
import org.broadinstitute.sting.oneoffprojects.walkers.association.RegionalAssociationWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/8/11
* Time: 2:32 PM
* To change this template use File | Settings | File Templates.
*/
public class MismatchRate extends ValueTest {
private Map<String,Object> sampleStats = new HashMap<String,Object>();
private int currentRefBase = 0;
public void init(RegionalAssociationWalker walker) {
super.init(walker);
Set<Sample> samples = walker.getSamples();
for ( Sample s : samples ) {
if ( s.hasProperty("mismatch_rate.mean") && s.hasProperty("mismatch_rate.std") ) {
double mn = (Double) s.getProperty("mismatch_rate.mean");
double std = (Double) s.getProperty("mismatch_rate.std");
sampleStats.put(s.getId(),new Pair<Double,Double>(mn,std));
} else {
sampleStats.put(s.getId(),new MathUtils.RunningAverage());
}
}
}
@Override
public Map<Sample,Object> mapLocus(MapExtender extender) {
currentRefBase = BaseUtils.simpleBaseToBaseIndex(extender.getReferenceContext().getBase());
Map<Sample,ReadBackedPileup> pileups = extender.getReadFilteredPileup();
Map<Sample,Object> maps = new HashMap<Sample,Object>(pileups.size());
for ( Map.Entry<Sample,ReadBackedPileup> samPileup : pileups.entrySet() ) {
maps.put(samPileup.getKey(),map(samPileup.getKey(),samPileup.getValue()));
}
return maps;
}
public Collection<Number> map(Sample sample, ReadBackedPileup pileup) {
Object stats = sampleStats.get(sample.getId());
double mn;
double std;
double mmr = calcMMR(pileup);
if ( stats instanceof Pair ) {
mn = ((Pair<Double,Double>)stats).first;
std = ((Pair<Double,Double>)stats).second;
} else {
MathUtils.RunningAverage ra = (MathUtils.RunningAverage) stats;
mn = ra.mean();
std = ra.stddev();
if ( std <= 0.0 ) {
std = 1.0;
}
ra.add(mmr);
}
return Arrays.asList((Number) ((mmr - mn) / std));
}
public double calcMMR(ReadBackedPileup rbp) {
int[] counts = rbp.getBaseCounts();
int total = 0;
int nonref = 0;
for ( int base : new int[]{0,1,2,3} ) {
total += counts[base];
if ( base != currentRefBase ) {
nonref += counts[base];
}
}
return ((double)nonref)/total;
}
// note: this is to satisfy the interface, and is never called due to override
public Collection<Number> map(ReadBackedPileup pileup) { return null; }
public boolean usePreviouslySeenReads() { return true; }
}

View File

@ -1,33 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:50 PM
* To change this template use File | Settings | File Templates.
*/
public class ProperPairs extends ProportionTest {
private final int MAPQ_THRESHOLD = 5;
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int numReads = 0;
int numPropPair = 0;
for (PileupElement e : rbp ) {
if ( e.getRead().getReadPairedFlag() && e.getMappingQual() >= MAPQ_THRESHOLD ) {
++numReads;
if ( e.getRead().getReadPairedFlag() && e.getRead().getProperPairFlag() ) {
++numPropPair;
}
}
}
return new Pair<Number,Number>(numPropPair,numReads);
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,91 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import cern.jet.random.Normal;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizable;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Feb 24, 2011
* Time: 12:58:38 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class ProportionTest extends CaseControl<Pair<Number,Number>> implements Dichotomizable, StatisticalTest {
public static final Normal standardNormal = new Normal(0.0,1.0,null);
public abstract Pair<Number,Number> map(ReadBackedPileup rbp );
public Pair<Number,Number> add(Pair<Number,Number> left, Pair<Number,Number> right) {
return new Pair<Number,Number>(left.first.doubleValue()+right.first.doubleValue(),
left.second.doubleValue()+right.second.doubleValue());
}
public Pair<Collection<Number>,Collection<Number>> getDichotomizedData() {
Collection<Number> caseProps = new ArrayList<Number>();
Collection<Number> controlProps = new ArrayList<Number>();
for (Map<Sample,Pair<Number,Number>> sampleNumberMap : window ) {
for ( Map.Entry<Sample,Pair<Number,Number>> samplePairEntry : sampleNumberMap.entrySet() ) {
Pair<Number,Number> val = samplePairEntry.getValue();
if ( samplePairEntry.getKey().getProperty("cohort").equals("case")) {
caseProps.add(val.first.doubleValue()/val.second.doubleValue());
} else if ( samplePairEntry.getKey().getProperty("cohort").equals("control") ) {
controlProps.add(val.first.doubleValue()/val.second.doubleValue());
}
}
}
return new Pair<Collection<Number>,Collection<Number>>(caseProps,controlProps);
}
public Pair<Double,Double> getStatisticAndPValue() {
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = getCaseControl();
if ( caseControlCounts == null || caseControlCounts.get(CaseControl.Cohort.CASE) == null || caseControlCounts.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue());
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
double z = (pCase-pControl)/se;
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
return new Pair<Double,Double>(z,p);
}
// todo -- this is a temporary method, it needs to be merged in with others if it proves useful
public Double getZConfDelta() {
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = getCaseControl();
if ( caseControlCounts == null || caseControlCounts.get(CaseControl.Cohort.CASE) == null || caseControlCounts.get(CaseControl.Cohort.CONTROL) == null ) {
return Double.NaN;
}
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue());
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
double z = (pCase-pControl)/se;
return ( z < 0 ? -1.0*se*(z-zVal) : se*(z-zVal));
}
public String getStatisticName() { return "Z"; }
}

View File

@ -1,36 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:31 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadClipping extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> clipping = new ArrayList<Integer>(rbp.size());
for ( PileupElement e : rbp ) {
int clips = 0;
for( CigarElement c : e.getRead().getCigar().getCigarElements() ) {
if ( c.getOperator() == CigarOperator.SOFT_CLIP || c.getOperator() == CigarOperator.HARD_CLIP ) {
clips += c.getLength();
}
}
clipping.add(clips);
}
return (Collection) clipping;
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,36 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:54 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadIndels extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> indelElements = new ArrayList<Integer>(rbp.size());
for (PileupElement e : rbp ) {
int indelOps = 0;
for ( CigarElement c : e.getRead().getCigar().getCigarElements()) {
if ( c.getOperator() == CigarOperator.DELETION || c.getOperator() == CigarOperator.INSERTION )
++indelOps;
}
indelElements.add(indelOps);
}
return (Collection) indelElements;
}
public boolean usePreviouslySeenReads() { return false; }
public boolean useTStatistic() { return true; }
}

View File

@ -1,28 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/6/11
* Time: 1:41 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadMappingQuality extends ValueTest {
public Collection<Number> map(ReadBackedPileup rbp) {
ArrayList<Integer> mapQuals = new ArrayList<Integer>(rbp.size());
for ( PileupElement e : rbp ) {
mapQuals.add(e.getMappingQual());
}
return (Collection) mapQuals;
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,44 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import ch.qos.logback.classic.filter.ThresholdFilter;
import org.broadinstitute.sting.oneoffprojects.walkers.association.RegionalAssociationWalker;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 4/9/11
* Time: 3:31 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadsAberrantInsertSize extends ProportionTest {
private int HI_THRESHOLD;
private int LO_THRESHOLD;
@Override
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int total = 0;
int wonky = 0;
for (PileupElement e : rbp ) {
if ( e.getRead().getReadPairedFlag() ) {
++total;
if ( e.getRead().getInferredInsertSize() > HI_THRESHOLD || e.getRead().getInferredInsertSize() < LO_THRESHOLD ) {
++wonky;
}
}
}
return new Pair<Number,Number>(wonky,total);
}
public void init(RegionalAssociationWalker parent) {
super.init(parent);
HI_THRESHOLD = 200;
LO_THRESHOLD = 50;
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,35 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 4/15/11
* Time: 12:54 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadsWithIndels extends ProportionTest {
public Pair<Number,Number> map(ReadBackedPileup pileup) {
int numReads = 0;
int numWithIndels = 0;
for ( PileupElement e : pileup ) {
++numReads;
for ( CigarElement element : e.getRead().getCigar().getCigarElements() ) {
if ( element.getOperator().equals(CigarOperator.DELETION) || element.getOperator().equals(CigarOperator.INSERTION) ) {
++numWithIndels;
break;
}
}
}
return new Pair<Number,Number>(numWithIndels,numReads);
}
public boolean usePreviouslySeenReads() { return false; }
}

View File

@ -1,45 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.MapExtender;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/8/11
* Time: 1:17 PM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceMismatches extends ProportionTest {
final static int[] BASE_INDEX = {0,1,2,3};
int currentRefBase = 0;
@Override
public Map<Sample,Object> mapLocus(MapExtender extender) {
currentRefBase = BaseUtils.simpleBaseToBaseIndex(extender.getReferenceContext().getBase());
return super.mapLocus(extender);
}
public Pair<Number,Number> map(ReadBackedPileup rbp) {
int[] counts = rbp.getBaseCounts();
int total = 0;
int nonref = 0;
for ( int base : BASE_INDEX ) {
total += counts[base];
if ( base != currentRefBase ) {
nonref += counts[base];
}
}
return new Pair<Number,Number>(nonref,total);
}
public boolean usePreviouslySeenReads() { return true; }
}

View File

@ -1,73 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.MapExtender;
import org.broadinstitute.sting.oneoffprojects.walkers.association.RegionalAssociationWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* @author chartl
*/
public class SampleDepth extends ValueTest {
public Map<String,Object> sampleStats = null;
// either: the user associates data with the sample (doc.mean, doc.std)
// >OR we calculate the values on-the-fly (e.g. running mean/stdev)
public void init(RegionalAssociationWalker walker) {
super.init(walker);
sampleStats = new HashMap<String,Object>();
Set<Sample> samples = walker.getSamples();
for ( Sample s : samples ) {
if ( s.hasProperty("doc.mean") && s.hasProperty("doc.std") ) {
double mn = (Double) s.getProperty("doc.mean");
double std = (Double) s.getProperty("doc.std");
sampleStats.put(s.getId(),new Pair<Double,Double>(mn,std));
} else {
sampleStats.put(s.getId(),new MathUtils.RunningAverage());
}
}
}
@Override
public Map<Sample,Object> mapLocus(MapExtender extender) {
Map<Sample,ReadBackedPileup> pileups = extender.getReadFilteredPileup();
Map<Sample,Object> maps = new HashMap<Sample,Object>(pileups.size());
for ( Map.Entry<Sample,ReadBackedPileup> samPileup : pileups.entrySet() ) {
maps.put(samPileup.getKey(),map(samPileup.getKey(),samPileup.getValue()));
}
return maps;
}
public Collection<Number> map(Sample sample, ReadBackedPileup pileup) {
Object stats = sampleStats.get(sample.getId());
double mn;
double std;
if ( stats instanceof Pair ) {
mn = ((Pair<Double,Double>)stats).first;
std = ((Pair<Double,Double>)stats).second;
} else {
MathUtils.RunningAverage ra = (MathUtils.RunningAverage) stats;
mn = ra.mean();
std = ra.stddev();
if ( std <= 0.0 ) {
std = 1.0;
}
ra.add(pileup.size());
}
return Arrays.asList((Number)((pileup.size()-mn)/std));
}
// note: this is to satisfy the interface, and is never called due to override
public Collection<Number> map(ReadBackedPileup pileup) { return null; }
public boolean usePreviouslySeenReads() { return true; }
}

View File

@ -1,121 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol;
import cern.jet.random.StudentT;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.Dichotomizable;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.StatisticalTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/2/11
* Time: 1:53 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class ValueTest extends CaseControl<Collection<Number>> implements StatisticalTest, Dichotomizable {
public abstract Collection<Number> map(ReadBackedPileup rbp );
public boolean useTStatistic() { return false; } // default to using the U statistic
public Collection<Number> add(Collection<Number> left, Collection<Number> right) {
if ( left instanceof ArrayList ) {
((ArrayList) left).addAll(right);
return left;
} else if ( left instanceof Set) {
((Set) left).addAll(right);
return left;
} else {
List<Number> newList = new ArrayList<Number>(left.size()+right.size());
newList.addAll(left);
newList.addAll(right);
return newList;
}
}
public Pair<Collection<Number>,Collection<Number>> getDichotomizedData() {
Collection<Number> caseMeans = new ArrayList<Number>();
Collection<Number> controlMeans = new ArrayList<Number>();
for ( Map<Sample,Collection<Number>> sampleMap : window ) {
for ( Map.Entry<Sample,Collection<Number>> sampleEntry : sampleMap.entrySet() ) {
if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) {
caseMeans.add(MathUtils.average(sampleEntry.getValue(),true));
} else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) {
controlMeans.add(MathUtils.average(sampleEntry.getValue(),true));
}
}
}
return new Pair<Collection<Number>,Collection<Number>>(caseMeans,controlMeans);
}
public Pair<Double,Double> getUStatisticAndPValue() {
MannWhitneyU mwu = new MannWhitneyU();
for ( Map.Entry<Cohort,Collection<Number>> cohortEntry : getCaseControl().entrySet() ) {
if ( cohortEntry.getKey().equals(Cohort.CASE) && cohortEntry.getValue() != null ) {
for ( Number n : cohortEntry.getValue() ) {
mwu.add(n,MannWhitneyU.USet.SET1);
}
} else if ( cohortEntry.getKey().equals(Cohort.CONTROL) && cohortEntry.getValue() != null) {
for ( Number n : cohortEntry.getValue() ) {
mwu.add(n,MannWhitneyU.USet.SET2);
}
}
}
return mwu.runTwoSidedTest();
}
public Pair<Double,Double> getStatisticAndPValue() { return useTStatistic() ? getTStatisticAndPValue() : getUStatisticAndPValue(); }
public String getStatisticName() { return useTStatistic() ? "T" : "V"; }
public Pair<Double,Double> getTStatisticAndPValue() {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true);
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true);
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1);
double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl);
StudentT studentT = new StudentT(df_num/df_denom,null);
double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t));
return new Pair<Double,Double>(t,p);
}
public Double getZConfDelta() {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = getCaseControl();
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return Double.NaN;
}
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true);
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true);
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double dnom = Math.sqrt(varCase/nCase+varControl/nControl);
double t = (meanCase-meanControl)/dnom;
return ( t < 0 ? -1.0*dnom*(t-zVal) : dnom*(t-zVal));
}
}

View File

@ -1,16 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 4/5/11
* Time: 11:35 AM
* To change this template use File | Settings | File Templates.
*/
public interface Dichotomizable {
public abstract Pair<Collection<Number>,Collection<Number>> getDichotomizedData();
}

View File

@ -1,79 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.ArrayList;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 4/4/11
* Time: 7:18 PM
* To change this template use File | Settings | File Templates.
*/
public class Dichotomizer1D {
public Dichotomizer1D() { } // instantiation required for access to transform
public abstract class Transform {
public abstract double transform(double n);
public Collection<Number> apply(Collection<Number> collection) {
ArrayList<Number> tData = new ArrayList<Number>(collection.size());
for ( Number n : collection ) {
tData.add(transform(n.doubleValue()));
}
return tData;
}
public Transform() {
}
}
/**
* Tries to measure effect size by the separation of setOne and setTwo clusters, with a window spread factor of 3
* @param setOne - collection of data from set one
* @param setTwo - colleciton of data from set two
* @return - the so-called "Z"-factor (effect size/spread)
*/
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo) {
double meanOne = MathUtils.average(setOne,true);
double meanTwo = MathUtils.average(setTwo,true);
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne,true));
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo,true));
/*
System.out.print("setOne: ");
for ( Number n : setOne ) {
System.out.printf(",%.2f",n.doubleValue());
}
System.out.print("\tsetTwo: ");
for ( Number n : setTwo ) {
System.out.printf(",%.2f",n.doubleValue());
}
System.out.printf("\tmn1: %.2f mn2: %.2f var1: %.2f var2: %.2f%n",meanOne,meanTwo,stdOne,stdTwo);
*/
return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo);
}
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo, Transform transform) {
return simpleGaussianDichotomy(transform.apply(setOne),transform.apply(setTwo));
}
public static double simpleGaussianDichotomy(Dichotomizable dichotomizable) {
Pair<Collection<Number>,Collection<Number>> dichotomizedData = dichotomizable.getDichotomizedData();
return simpleGaussianDichotomy(dichotomizedData.first,dichotomizedData.second);
}
public static double simpleGaussianDichotomy(Dichotomizable dichotomizable, Transform transform) {
Pair<Collection<Number>,Collection<Number>> dichotomizedData = dichotomizable.getDichotomizedData();
return simpleGaussianDichotomy(dichotomizedData.first,dichotomizedData.second, transform);
}
public static Pair<Double,Double> twoMeansDichotomy() { return null; }
}

View File

@ -1,16 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association.statistics;
import org.broadinstitute.sting.utils.collections.Pair;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 4/5/11
* Time: 12:26 PM
* To change this template use File | Settings | File Templates.
*/
public interface StatisticalTest {
public abstract Pair<Double,Double> getStatisticAndPValue();
public abstract String getStatisticName();
public abstract Double getZConfDelta();
}

View File

@ -0,0 +1,91 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ProportionTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ValueTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationTestRunner;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 3/5/11
* Time: 2:06 PM
* To change this template use File | Settings | File Templates.
*/
public class RFAUnitTest extends BaseTest {
@BeforeClass
public void init() { }
@Test
private void testMWU() {
logger.warn("Testing MWU");
MannWhitneyU mwu = new MannWhitneyU();
mwu.add(0, MannWhitneyU.USet.SET1);
mwu.add(1,MannWhitneyU.USet.SET2);
mwu.add(2,MannWhitneyU.USet.SET2);
mwu.add(3,MannWhitneyU.USet.SET2);
mwu.add(4,MannWhitneyU.USet.SET2);
mwu.add(5,MannWhitneyU.USet.SET2);
mwu.add(6,MannWhitneyU.USet.SET1);
mwu.add(7,MannWhitneyU.USet.SET1);
mwu.add(8,MannWhitneyU.USet.SET1);
mwu.add(9,MannWhitneyU.USet.SET1);
mwu.add(10,MannWhitneyU.USet.SET1);
mwu.add(11,MannWhitneyU.USet.SET2);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
MannWhitneyU mwu2 = new MannWhitneyU();
for ( int dp : new int[]{2,4,5,6,8} ) {
mwu2.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
mwu2.add(dp,MannWhitneyU.USet.SET2);
}
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
Pair<Integer,Integer> sizes = mwu2.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
// tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
logger.warn("Set 1");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
logger.warn("Set 2");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
MannWhitneyU mwu3 = new MannWhitneyU();
for ( int dp : new int[]{0,2,4} ) {
mwu3.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
mwu3.add(dp,MannWhitneyU.USet.SET2);
}
long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
Pair<Integer,Integer> nums = mwu3.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
}
}

View File

@ -1,222 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ProportionTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.modules.casecontrol.ValueTest;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.oneoffprojects.walkers.association.AssociationTestRunner;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 3/5/11
* Time: 2:06 PM
* To change this template use File | Settings | File Templates.
*/
public class RegionalAssociationUnitTest extends BaseTest {
@BeforeClass
public void init() { }
@Test
private void testTStatistics() {
logger.warn("Testing T statistics");
TTest test1 = new TTest();
test1.setCaseData((Collection) Arrays.asList(1,1,2,3,4));
test1.setControlData((Collection) Arrays.asList(10, 10, 20, 30, 40));
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).second.first,0.02697891,1e-8);
TTest test2 = new TTest();
test2.setCaseData((Collection) Arrays.asList(5, 6, 5, 2, 3, 8, 7, 12, 10, 6, 4, 2, 8, 7, 3));
test2.setControlData((Collection) Arrays.asList(1, 6, 7, 2, 3, 3, 4, 1, 2, 5, 7, 3, 10, 3, 3, 2, 3));
Assert.assertEquals(AssociationTestRunner.getTestValues(test2).second.first, 0.04100948, 1e-8);
TTest test3 = new TTest();
test3.setCaseData((Collection) Arrays.asList(94,25,68,4,27,51,9,10,91,61,61,37,39,44,36,27,86,33,3,38,5,6,28,93,30,56,81,8,40,44));
test3.setControlData((Collection) Arrays.asList(6,64,96,85,20,74,93,18,31,20,88,38,80,50,33,81,35,8,2,69,49,6,26,74,79,63,63,96,45,18));
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).second.first,0.2309763,1e-7);
TTest test4 = new TTest();
test4.setCaseData((Collection) Arrays.asList(14,8,8,17,8,12,10,10,13,9,13,9,9,12,12,11,16,12,13,16,10,13,11,16,13,16,11,13,9,16,16,14,9,14,17,10,15,15,9,15,17,15,17,12,10,13,11,14,8,14));
test4.setControlData((Collection) Arrays.asList(7,1,4,2,3,7,8,5,5,4,10,6,4,9,2,9,9,3,3,10,1,8,9,5,3,7,2,7,10,9,4,9,2,10,10,3,2,3,4,4,5,10,9,4,3,5,6,10,5,10));
Assert.assertEquals(AssociationTestRunner.getTestValues(test4).second.first,4.291725e-20,1e-19);
Assert.assertEquals(AssociationTestRunner.getTestValues(test4).first.first,11.60592,1e-5);
}
@Test
private void testZStatistics() {
logger.warn("Testing Z statistics");
ZTest test1 = new ZTest();
test1.setCaseData(new Pair<Number,Number>(100,500));
test1.setControlData(new Pair<Number,Number>(55,300));
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).first.first,0.57742362050306,2e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).second.first,0.56367,2e-5);
ZTest test2 = new ZTest();
test1.setCaseData(new Pair<Number, Number>(1020, 1800));
test1.setControlData(new Pair<Number, Number>(680, 1670));
Assert.assertEquals(AssociationTestRunner.getTestValues(test1).first.first,9.3898178216531,2e-6);
ZTest test3 = new ZTest();
test3.setCaseData(new Pair<Number,Number>(20,60));
test3.setControlData(new Pair<Number,Number>(30,80));
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).first.first,-0.50917511840392,2e-6);
Assert.assertEquals(AssociationTestRunner.getTestValues(test3).second.first,0.610643593,2e-4);
}
@Test (enabled = false)
private void testUStatistic() {
logger.warn("Testing U statistics");
UTest test1 = new UTest();
test1.setCaseData((Collection) Arrays.asList(2,4,5,6,8));
test1.setControlData((Collection) Arrays.asList(1,3,7,9,10,11,12,13));
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test1).first,-1.537,1e-4);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.092292,5e-2); // z-approximation, off by about 0.05
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test1).second,0.044444,1e-3); // recursive calculation
UTest test2 = new UTest();
test2.setCaseData((Collection) Arrays.asList(1,7,8,9,10,11,15,18));
test2.setControlData((Collection) Arrays.asList(2,3,4,5,6,12,13,14,16,17));
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test2).first,-0.3109831608,1e-10);
UTest test3 = new UTest();
test3.setCaseData((Collection)Arrays.asList(13,14,7,18,5,2,9,17,8,10,3,15,19,6,20,16,11,4,12,1));
test3.setControlData((Collection) Arrays.asList(29,21,14,10,12,11,28,19,18,13,7,27,20,5,17,16,9,23,22,26));
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test3).first,-2.907884571802469,1e-14);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test3).second,2*0.00302,1e-3);
UTest test4 = new UTest();
test4.setCaseData((Collection) Arrays.asList(1,2,4,5,6,9));
test4.setControlData((Collection) Arrays.asList(3,8,11,12,13));
Assert.assertEquals((double) AssociationTestRunner.mannWhitneyUTest(test4).first,-1.9170289512680814,1e-14);
Assert.assertEquals(AssociationTestRunner.mannWhitneyUTest(test4).second,0.0303,1e-4);
}
@Test
private void testMWU() {
logger.warn("Testing MWU");
MannWhitneyU mwu = new MannWhitneyU();
mwu.add(0, MannWhitneyU.USet.SET1);
mwu.add(1,MannWhitneyU.USet.SET2);
mwu.add(2,MannWhitneyU.USet.SET2);
mwu.add(3,MannWhitneyU.USet.SET2);
mwu.add(4,MannWhitneyU.USet.SET2);
mwu.add(5,MannWhitneyU.USet.SET2);
mwu.add(6,MannWhitneyU.USet.SET1);
mwu.add(7,MannWhitneyU.USet.SET1);
mwu.add(8,MannWhitneyU.USet.SET1);
mwu.add(9,MannWhitneyU.USet.SET1);
mwu.add(10,MannWhitneyU.USet.SET1);
mwu.add(11,MannWhitneyU.USet.SET2);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11l);
MannWhitneyU mwu2 = new MannWhitneyU();
for ( int dp : new int[]{2,4,5,6,8} ) {
mwu2.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
mwu2.add(dp,MannWhitneyU.USet.SET2);
}
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10l);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30l);
Pair<Integer,Integer> sizes = mwu2.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10l),0.4180519701814064,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10l),0.021756021756021756,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10l),0.06214143703127617,1e-14);
Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
// tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30l),0.08216463976903321,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30l),0.0023473625009328147,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30l),0.021756021756021756,1e-14); // note -- exactly same value as above
logger.warn("Set 1");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
logger.warn("Set 2");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
MannWhitneyU mwu3 = new MannWhitneyU();
for ( int dp : new int[]{0,2,4} ) {
mwu3.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
mwu3.add(dp,MannWhitneyU.USet.SET2);
}
long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
Pair<Integer,Integer> nums = mwu3.getSetSizes();
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u),3.665689149560116E-4,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u),0.0032240865760884696,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
}
private class TTest extends ValueTest {
Map<Cohort,Collection<Number>> toTest = new HashMap<Cohort,Collection<Number>>(2);
@Override
public Map<Cohort,Collection<Number>> getCaseControl() {
return toTest;
}
public void setCaseData(Collection<Number> data) {
toTest.put(Cohort.CASE,data);
}
public void setControlData(Collection<Number> data) {
toTest.put(Cohort.CONTROL,data);
}
public Collection<Number> map(ReadBackedPileup rbp) { return null; }
public int getWindowSize() { return 1; }
@Override
public boolean useTStatistic() { return true; }
public boolean usePreviouslySeenReads() { return false; }
}
private class ZTest extends ProportionTest {
Map<Cohort,Pair<Number,Number>> toTest = new HashMap<Cohort,Pair<Number,Number>>(2);
@Override
public Map<Cohort,Pair<Number,Number>> getCaseControl() {
return toTest;
}
public void setCaseData(Pair<Number,Number> data) {
toTest.put(Cohort.CASE,data);
}
public void setControlData(Pair<Number,Number> data) {
toTest.put(Cohort.CONTROL,data);
}
public Pair<Number,Number> map(ReadBackedPileup p) { return null; }
public int getWindowSize() { return 1; }
public int slideByValue() { return 1; }
public boolean usePreviouslySeenReads() { return true; }
}
private class UTest extends ValueTest {
TTest test = new TTest();
public boolean usePreviouslySeenReads() { return false; }
public int getWindowSize() { return 1; }
public int slideByValue() { return 1; }
public Collection<Number> map(ReadBackedPileup p ){ return null; }
@Override
public Map<Cohort,Collection<Number>> getCaseControl() {
return test.getCaseControl();
}
public void setCaseData(Collection<Number> data) { test.setCaseData(data);}
public void setControlData(Collection<Number> data) { test.setControlData(data); }
}
}