Miscellaneous changes:
- RefMetaDataTracker: grabbing variant contexts given a prefix (not sure where else this was implemented, if someone can show me I'll remove it)
- VCFUtils: grabbing VCF headers given a prefix
- MathUtils: Useful functions for calculating statistics on collections of Numbers
- VariantAnnotator: Made isUniqueHeaderLine a public static method -- maybe this should go into a different class. Not sure.
- Associations: PluginManager now used to propagate classes, implementations for Z,T,U tests, slight alteration to format to make the objects stored
in the window optionally different from those returned by whatever statistic is run across the window
Added:
- MannWhitneyU. Started to fix up WilcoxonRankSum but there are comments in there questioning the validity of some of the code, and I'm sure that
it's actually doing a U test. This implementation includes the direct calculation of p-values for small sample sizes, and a uniform approximation
for when one of the sample sets is small, and the other large. Unit tests to follow.
- BootstrapCallsMerger: takes n VCFs which have been called on the same samples; merges them together while averaging the annotations
- BootstrapCalls.q: qscript for testing the effectiveness of bootstrap low-pass calling on the exome
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5372 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8ae42b70ac
commit
9ca1dd5d62
|
|
@ -276,6 +276,19 @@ public class RefMetaDataTracker {
|
|||
return contexts;
|
||||
}
|
||||
|
||||
public Collection<VariantContext> getVariantContextsByPrefix(ReferenceContext ref, Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
|
||||
|
||||
for ( String name : names ) {
|
||||
RODRecordList rodList = getTrackDataByName(name,false); // require that the name is an exact match
|
||||
|
||||
if ( rodList != null )
|
||||
addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
|
||||
}
|
||||
|
||||
return contexts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
|
||||
* see getVariantContexts for more information.
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) {
|
||||
public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) {
|
||||
if ( !(line instanceof VCFCompoundHeaderLine) )
|
||||
return true;
|
||||
|
||||
|
|
|
|||
|
|
@ -15,12 +15,12 @@ import java.util.Map;
|
|||
* Time: 11:58 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class AssociationContext<X> {
|
||||
public abstract class AssociationContext<X,Y> {
|
||||
|
||||
protected List<Map<Sample,Object>> window;
|
||||
protected List<Map<Sample,Y>> window;
|
||||
|
||||
public AssociationContext() {
|
||||
window = new ArrayList<Map<Sample,Object>>(getWindowSize());
|
||||
window = new ArrayList<Map<Sample,Y>>(getWindowSize());
|
||||
}
|
||||
|
||||
// specifies size of window
|
||||
|
|
@ -36,7 +36,7 @@ public abstract class AssociationContext<X> {
|
|||
public abstract Object map(ReadBackedPileup rbp);
|
||||
|
||||
// specifies how to take the per-sample data and reduce them into testable pairs
|
||||
public abstract Map<?,X> reduce(List<Map<Sample,X>> win);
|
||||
public abstract Map<?,X> reduce(List<Map<Sample,Y>> win);
|
||||
|
||||
// do we filter the current location (e.g. omit from window)
|
||||
public boolean filter(MapExtender m) { return true; }
|
||||
|
|
@ -60,7 +60,7 @@ public abstract class AssociationContext<X> {
|
|||
}
|
||||
|
||||
|
||||
public void addData(Map<Sample,Object> sampleData) {
|
||||
public void addData(Map<Sample,Y> sampleData) {
|
||||
window.add(sampleData);
|
||||
}
|
||||
|
||||
|
|
@ -70,7 +70,7 @@ public abstract class AssociationContext<X> {
|
|||
}
|
||||
|
||||
public void slide() {
|
||||
ArrayList<Map<Sample,Object>> newWindow = new ArrayList<Map<Sample,Object>>((window.subList(slideByValue(),window.size())));
|
||||
ArrayList<Map<Sample,Y>> newWindow = new ArrayList<Map<Sample,Y>>((window.subList(slideByValue(),window.size())));
|
||||
newWindow.ensureCapacity(getWindowSize());
|
||||
window = newWindow;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.association;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import cern.jet.random.Normal;
|
||||
import cern.jet.random.StudentT;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.*;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.WilcoxonRankSum;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -28,26 +33,58 @@ public class AssociationTestRunner {
|
|||
results.add(runZ((ZStatistic) context));
|
||||
}
|
||||
|
||||
if ( context instanceof FisherExact ) {
|
||||
results.add(runFisherExact(context));
|
||||
if ( context instanceof UStatistic ) {
|
||||
results.add(runU((UStatistic) context));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
public static String runStudentT(TStatistic context) { /*
|
||||
StudentT studentT = new StudentT(dfnum/dfdenom,null);
|
||||
public static String runStudentT(TStatistic context) {
|
||||
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
|
||||
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
|
||||
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
|
||||
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
|
||||
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
|
||||
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
|
||||
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
|
||||
|
||||
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
|
||||
double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1);
|
||||
double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl);
|
||||
|
||||
StudentT studentT = new StudentT(df_num/df_denom,null);
|
||||
double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t));
|
||||
return String.format("T: %.2f\tP: %.2e",t,p);*/
|
||||
return null;
|
||||
return String.format("T: %.2f\tP: %.2e",t,p);
|
||||
}
|
||||
|
||||
public static String runZ(ZStatistic context) { /*
|
||||
double z = num/Math.sqrt(se);
|
||||
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
|
||||
return String.format("Z: %.2f\tP: %.2e",z,p);*/
|
||||
public static String runZ(ZStatistic context) {
|
||||
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = context.getCaseControl();
|
||||
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
|
||||
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
|
||||
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
|
||||
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
|
||||
|
||||
return null;
|
||||
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
|
||||
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue());
|
||||
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
|
||||
|
||||
double z = (pCase-pControl)/se;
|
||||
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
|
||||
return String.format("Z: %.2f\tP: %.2e",z,p);
|
||||
}
|
||||
|
||||
public static String runU(UStatistic context) {
|
||||
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
|
||||
MannWhitneyU mwu = new MannWhitneyU();
|
||||
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) {
|
||||
mwu.add(n, MannWhitneyU.USet.SET1);
|
||||
}
|
||||
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CONTROL) ) {
|
||||
mwu.add(n,MannWhitneyU.USet.SET2);
|
||||
}
|
||||
Pair<Integer,Double> results = mwu.runTwoSidedTest();
|
||||
return String.format("U: %d\tP: %.2e",results.first,results.second);
|
||||
}
|
||||
|
||||
public static String runFisherExact(AssociationContext context) {
|
||||
|
|
|
|||
|
|
@ -9,13 +9,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @Author chartl
|
||||
|
|
@ -23,7 +23,7 @@ import java.util.Set;
|
|||
* Generalized framework for regional (windowed) associations
|
||||
*/
|
||||
public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAssociationHandler> implements TreeReducible<RegionalAssociationHandler> {
|
||||
@Argument(doc="foo",shortName="AT",fullName="associationType",required=false)
|
||||
@Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false)
|
||||
public String[] associationsToUse = null;
|
||||
|
||||
@Output
|
||||
|
|
@ -57,17 +57,23 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
|
|||
}
|
||||
return rac;
|
||||
}
|
||||
|
||||
private AssociationContext stringToAssociationContext(String s) {
|
||||
|
||||
throw new UserException(String.format("AssociationContextOld type %s not found.",s));
|
||||
}
|
||||
|
||||
private Set<AssociationContext> getAssociations() {
|
||||
// todo -- this should use the package handler like variant eval
|
||||
List<Class<? extends AssociationContext>> contexts = new PluginManager<AssociationContext>(AssociationContext.class).getPlugins();
|
||||
Map<String,Class<? extends AssociationContext>> classNameToClass = new HashMap<String,Class<? extends AssociationContext>>(contexts.size());
|
||||
for ( Class<? extends AssociationContext> clazz : contexts ) {
|
||||
if (! Modifier.isAbstract(clazz.getModifiers())) {
|
||||
classNameToClass.put(clazz.getSimpleName(),clazz);
|
||||
}
|
||||
}
|
||||
|
||||
Set<AssociationContext> validAssociations = new HashSet<AssociationContext>();
|
||||
for ( String s : associationsToUse ) {
|
||||
AssociationContext context = stringToAssociationContext(s);
|
||||
AssociationContext context;
|
||||
try {
|
||||
context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[]{});
|
||||
} catch ( Exception e ) {
|
||||
throw new StingException("The class "+s+" could not be instantiated.",e);
|
||||
}
|
||||
context.init(this);
|
||||
validAssociations.add(context);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,11 @@ import java.util.Map;
|
|||
* Created by IntelliJ IDEA.
|
||||
* @author chartl
|
||||
*/
|
||||
public abstract class CaseControl<X> extends AssociationContext<X> {
|
||||
public abstract class CaseControl<X> extends AssociationContext<X,X> {
|
||||
|
||||
public Map<Cohort,X> getCaseControl() {
|
||||
return reduce(window);
|
||||
}
|
||||
|
||||
public Map<Cohort,X> reduce(List<Map<Sample,X>> window) {
|
||||
X accumCase = null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,180 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
|
||||
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import javax.management.NotificationBroadcaster;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @doc Merges N callsets that have been made on the same set of samples, and averages specific annotations.
|
||||
* @Author chartl
|
||||
*
|
||||
*/
|
||||
public class BootstrapCallsMerger extends RodWalker<BootstrapCallsMerger.VCHolder,VCFWriter> implements TreeReducible<VCFWriter>{
|
||||
|
||||
@Output
|
||||
VCFWriter vcfWriter = null;
|
||||
|
||||
// todo -- remove this, can be done just by looking at the type and iterating over ANNOTS_TO_AVG
|
||||
// todo -- multi-allelic sites (e.g. what happens here:)
|
||||
// todo -- Set 1: A G,T AC=2,4 ??
|
||||
// todo -- Set 1: A G AC=2 Set 2: A T AC=4
|
||||
// todo -- fix above behavior
|
||||
final static private Set<String> INTEGER_ANNOTS_CAN_MEAN = new HashSet<String>(Arrays.asList("AC","AN"));
|
||||
final static private Set<String> ANNOTS_TO_AVG = new HashSet<String>(Arrays.asList(
|
||||
"QD","SB","HaplotypeScore","Dels","MQ","MQ0","sumGLByD","AC","AF","AN"));
|
||||
|
||||
public void initialize() {
|
||||
// grab the samples
|
||||
Set<String> samples = new HashSet<String>();
|
||||
// setup the header fields
|
||||
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
for ( Map.Entry<String,VCFHeader> headers : VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), "bootstrap").entrySet() ) {
|
||||
samples.addAll(headers.getValue().getGenotypeSamples());
|
||||
for ( VCFHeaderLine line : headers.getValue().getMetaData() ) {
|
||||
logger.debug(line);
|
||||
VCFHeaderLine altered = alterHeaderLine(line);
|
||||
if ( VariantAnnotator.isUniqueHeaderLine(altered, hInfo) )
|
||||
hInfo.add(altered);
|
||||
}
|
||||
}
|
||||
hInfo.add(new VCFInfoHeaderLine("NB",1,VCFHeaderLineType.Integer,"Number of bootsrap sets site was seen in"));
|
||||
hInfo.add(new VCFFormatHeaderLine("BC",4,VCFHeaderLineType.Integer,"Genotype counts across bootsraps: ref,het,var,nocall"));
|
||||
HashSet<String> rodName = new HashSet<String>();
|
||||
rodName.add("variant");
|
||||
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: integer annotations will need to become floats, others will not
|
||||
* (e.g. HRun won't change, but counts will)
|
||||
* @param line
|
||||
* @return line with type changed
|
||||
*/
|
||||
private VCFHeaderLine alterHeaderLine(VCFHeaderLine line) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
if(INTEGER_ANNOTS_CAN_MEAN.contains(((VCFInfoHeaderLine) line).getName())) {
|
||||
return new VCFInfoHeaderLine(((VCFInfoHeaderLine) line).getName(),
|
||||
((VCFInfoHeaderLine) line).getCount(),
|
||||
VCFHeaderLineType.Float,
|
||||
((VCFInfoHeaderLine) line).getDescription());
|
||||
}
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
public VCFWriter reduceInit() {
|
||||
return vcfWriter;
|
||||
}
|
||||
|
||||
public VCHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) {
|
||||
if ( tracker == null ) { return null; }
|
||||
Collection<VariantContext> bootstraps = tracker.getVariantContextsByPrefix(ref,Arrays.asList("bootstrap"),null,ref.getLocus(),true,false);
|
||||
int num_bootstraps = bootstraps.size();
|
||||
if ( num_bootstraps == 0 ) { return null; }
|
||||
Map<String,Double> avgInfo = new HashMap<String,Double>(ANNOTS_TO_AVG.size());
|
||||
Map<String,Integer[]> genotypeCountsBySample = new HashMap<String,Integer[]>();
|
||||
for ( VariantContext vc : bootstraps ) {
|
||||
// update genotype counts
|
||||
for ( Map.Entry<String,Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
if ( ! genotypeCountsBySample.containsKey(genotype.getKey())) {
|
||||
genotypeCountsBySample.put(genotype.getKey(),new Integer[]{0,0,0,0});
|
||||
}
|
||||
genotypeCountsBySample.get(genotype.getKey())[genotype2offset(genotype.getValue())]++;
|
||||
}
|
||||
// update info field annotations
|
||||
for ( String anno : ANNOTS_TO_AVG ) {
|
||||
if ( ! avgInfo.containsKey(anno) ) {
|
||||
avgInfo.put(anno,0.0);
|
||||
}
|
||||
Object value = vc.getAttribute(anno);
|
||||
if ( value instanceof Number ) {
|
||||
//logger.debug(value);
|
||||
avgInfo.put(anno,avgInfo.get(anno) + ((Number)value).doubleValue()/num_bootstraps);
|
||||
}
|
||||
if ( value instanceof String ) {
|
||||
//logger.debug("string: "+value.toString());
|
||||
avgInfo.put(anno,avgInfo.get(anno) + Double.valueOf((String)value)/num_bootstraps);
|
||||
}
|
||||
}
|
||||
}
|
||||
VariantContext first = bootstraps.iterator().next();
|
||||
Map<String,Object> finalInfo = new HashMap<String,Object>(first.getAttributes().size());
|
||||
for ( Map.Entry<String,Object> attrib : first.getAttributes().entrySet() ) {
|
||||
if ( ANNOTS_TO_AVG.contains(attrib.getKey()) ) {
|
||||
finalInfo.put(attrib.getKey(),avgInfo.get(attrib.getKey()));
|
||||
} else {
|
||||
finalInfo.put(attrib.getKey(),attrib.getValue());
|
||||
}
|
||||
}
|
||||
Map<String,Genotype> finalGenotypes = new HashMap<String,Genotype>(first.getSampleNames().size());
|
||||
for ( Map.Entry<String,Genotype> g : first.getGenotypes().entrySet() ) {
|
||||
Map<String,Object> att = new HashMap<String,Object>(g.getValue().getAttributes());
|
||||
att.put("BC",countsToString(genotypeCountsBySample.get(g.getKey())));
|
||||
//logger.debug(g.getValue());
|
||||
finalGenotypes.put(g.getKey(),Genotype.modifyAttributes(g.getValue(),att));
|
||||
//logger.debug("final:");
|
||||
//logger.debug(finalGenotypes.get(g.getKey()));
|
||||
}
|
||||
|
||||
finalInfo.put("NB",String.format("%d",num_bootstraps));
|
||||
|
||||
VariantContext attributeModified = VariantContext.modifyAttributes(first,finalInfo);
|
||||
logger.debug(attributeModified.hasGenotypes() ? "attributes have genotypes" : "VERY BAD");
|
||||
VariantContext genotypeModified = VariantContext.modifyGenotypes(attributeModified,finalGenotypes);
|
||||
logger.debug(genotypeModified.hasGenotypes() ? "modified genotypes have genotypes" : "NOT SO BAD");
|
||||
|
||||
return new VCHolder(genotypeModified,ref.getBase());
|
||||
|
||||
//return new VCHolder(VariantContext.modifyGenotypes(VariantContext.modifyAttributes(first, finalInfo), finalGenotypes),
|
||||
// ref.getBase());
|
||||
}
|
||||
|
||||
private static String countsToString(Integer[] c) {
|
||||
return String.format("%d,%d,%d,%d",c[0],c[1],c[2],c[3]);
|
||||
}
|
||||
|
||||
public VCFWriter treeReduce(VCFWriter l, VCFWriter r) {
|
||||
return l;
|
||||
}
|
||||
|
||||
public VCFWriter reduce(VCHolder h, VCFWriter w) {
|
||||
if ( h != null ) {
|
||||
w.add(h.v,h.b);
|
||||
}
|
||||
|
||||
return w;
|
||||
}
|
||||
|
||||
private static int genotype2offset(Genotype g) {
|
||||
if ( g.isHomRef() ) { return 0; }
|
||||
if ( g.isHet() ) { return 1; }
|
||||
if ( g.isHomVar() ) { return 2; }
|
||||
return 3;
|
||||
}
|
||||
|
||||
class VCHolder {
|
||||
public VariantContext v;
|
||||
public Byte b;
|
||||
public VCHolder(VariantContext v, Byte b) {
|
||||
this.v = v;
|
||||
this.b = b;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,211 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import cern.jet.math.Arithmetic;
|
||||
import cern.jet.random.Normal;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
*/
|
||||
public class MannWhitneyU {
|
||||
|
||||
private TreeSet<Pair<Number,USet>> observations;
|
||||
private int sizeSet1;
|
||||
private int sizeSet2;
|
||||
|
||||
public MannWhitneyU() {
|
||||
observations = new TreeSet<Pair<Number,USet>>(new DitheringComparator());
|
||||
sizeSet1 = 0;
|
||||
sizeSet2 = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an observation into the observation tree
|
||||
* @param n: the observation (a number)
|
||||
* @param set: whether the observation comes from set 1 or set 2
|
||||
*/
|
||||
public void add(Number n, USet set) {
|
||||
observations.add(new Pair<Number,USet>(n,set));
|
||||
if ( set == USet.SET1 ) {
|
||||
++sizeSet1;
|
||||
} else {
|
||||
++sizeSet2;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* temporary method that will be generalized. Runs the standard two-sided test,
|
||||
* returns the u and p values.
|
||||
* @Returns a pair holding the u and p-value.
|
||||
*/
|
||||
public Pair<Integer,Double> runTwoSidedTest() {
|
||||
Pair<Integer,USet> uPair = calculateTwoSidedU(observations);
|
||||
int u = uPair.first;
|
||||
int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2;
|
||||
int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1;
|
||||
double pval = calculateP(n,m,u,true);
|
||||
return new Pair<Integer,Double>(u,pval);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
|
||||
* @param n - The number of entries in the DOMINATED set
|
||||
* @param m - The number of entries in the DOMINANT set
|
||||
* @param u - the Mann-Whitney U value
|
||||
* @param twoSided - is the test twosided
|
||||
* @return the (possibly approximate) p-value associated with the MWU test
|
||||
*/
|
||||
public static double calculateP(int n, int m, int u, boolean twoSided) {
|
||||
double pval;
|
||||
if ( n > 8 && m > 8 ) {
|
||||
pval = calculatePNormalApproximation(n,m,u);
|
||||
} else if ( n > 4 && m > 8 ) {
|
||||
pval = calculatePUniformApproximation(n,m,u);
|
||||
} else {
|
||||
pval = calculatePRecursively(n,m,u);
|
||||
}
|
||||
|
||||
return twoSided ? 2*pval : pval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
|
||||
* @param n - The number of entries in the DOMINATED set
|
||||
* @param m - The number of entries in the DOMINANT set
|
||||
* @param u - the Mann-Whitney U value
|
||||
* @return p-value associated with the normal approximation
|
||||
*/
|
||||
public static double calculatePNormalApproximation(int n,int m,int u) {
|
||||
double mean = ((double) m*n+1)/2;
|
||||
Normal normal = new Normal( mean , ((double) n*m*(n+m+1))/12, null);
|
||||
return u < mean ? normal.cdf(u) : 1.0-normal.cdf(u);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
|
||||
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
|
||||
* @param n -
|
||||
* @param m -
|
||||
* @param u -
|
||||
* @return
|
||||
*/
|
||||
public static double calculatePUniformApproximation(int n, int m, int u) {
|
||||
int R = u + (n*(n+1))/2;
|
||||
double a = Math.sqrt(m*(n+m+1));
|
||||
double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
|
||||
double z = b + R/a;
|
||||
if ( z < 0 ) { return 0.0; }
|
||||
else if ( z > n ) { return 1.0; }
|
||||
else {
|
||||
return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function for the sum of n uniform random variables
|
||||
* @param z - value at which to compute the (un-normalized) cdf
|
||||
* @param m - a cutoff integer (defined by m <= z < m + 1)
|
||||
* @param n - the number of uniform random variables
|
||||
* @param k - holder variable for the recursion (alternatively, the index of the term in the sequence)
|
||||
* @return the (un-normalized) cdf for the sum of n random variables
|
||||
*/
|
||||
private static double uniformSumHelper(double z, int m, int n, int k) {
|
||||
if ( k > m ) { return 0; }
|
||||
int coef = (k % 2 == 0) ? 1 : -1;
|
||||
return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
|
||||
* stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
|
||||
* later on simply by multiplying the p-value by 2
|
||||
* @param observed
|
||||
* @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
|
||||
*/
|
||||
public static Pair<Integer,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
|
||||
int set1SeenSoFar = 0;
|
||||
int set2SeenSoFar = 0;
|
||||
int uSet1DomSet2 = 0;
|
||||
int uSet2DomSet1 = 0;
|
||||
USet previous = null;
|
||||
for ( Pair<Number,USet> dataPoint : observed ) {
|
||||
if ( previous != null && previous != dataPoint.second ) {
|
||||
if ( dataPoint.second == USet.SET1 ) {
|
||||
uSet2DomSet1 += set2SeenSoFar;
|
||||
} else {
|
||||
uSet1DomSet2 += set1SeenSoFar;
|
||||
}
|
||||
}
|
||||
|
||||
if ( dataPoint.second == USet.SET1 ) {
|
||||
++set1SeenSoFar;
|
||||
} else {
|
||||
++set2SeenSoFar;
|
||||
}
|
||||
|
||||
previous = dataPoint.second;
|
||||
}
|
||||
|
||||
return uSet1DomSet2 < uSet2DomSet1 ? new Pair<Integer,USet>(uSet1DomSet2,USet.SET1) : new Pair<Integer,USet>(uSet2DomSet1,USet.SET2);
|
||||
}
|
||||
|
||||
/**
|
||||
* The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
|
||||
* binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
|
||||
* calls into that recursive calculation.
|
||||
* @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
|
||||
* @param m: number of set-two entries
|
||||
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
|
||||
* @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
|
||||
*/
|
||||
public static double calculatePRecursively(int n, int m, int u) {
|
||||
if ( m > 6 && n > 4 || m + n > 16 ) { throw new StingException("Please use the appropriate (normal or sum of uniform) approximation"); }
|
||||
return cpr(n,m,u);
|
||||
}
|
||||
|
||||
/**
|
||||
* @doc: just a shorter name for calculatePRecursively. See Mann, Whitney, [1947]
|
||||
* @n: number of set-1 entries
|
||||
* @m: number of set-2 entries
|
||||
* @u: number of times a set-2 entry as preceded a set-1 entry
|
||||
*/
|
||||
private static double cpr(int n, int m, int u) {
|
||||
if ( u < 0 || n == 0 && m == 0 ) {
|
||||
return 0.0;
|
||||
}
|
||||
if ( m == 0 || n == 0 ) {
|
||||
// there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero.
|
||||
// note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway
|
||||
return ( u == 0 ) ? 1.0 : 0.0;
|
||||
}
|
||||
|
||||
|
||||
return (n/(n+m))*cpr(n-1,m,u-m) + (m/(n+m))*cpr(n,m-1,u);
|
||||
}
|
||||
|
||||
/**
|
||||
* A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
|
||||
* and to ensure that rank ties are broken at random.
|
||||
*/
|
||||
private class DitheringComparator implements Comparator<Pair<Number,USet>> {
|
||||
|
||||
public DitheringComparator() {}
|
||||
|
||||
public boolean equals(Object other) { return false; }
|
||||
|
||||
public int compare(Pair<Number,USet> left, Pair<Number,USet> right) {
|
||||
double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue());
|
||||
if ( comp > 0 ) { return 1; }
|
||||
if ( comp < 0 ) { return -1; }
|
||||
return MathUtils.rand.nextBoolean() ? -1 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
public enum USet { SET1, SET2 }
|
||||
|
||||
}
|
||||
|
|
@ -52,6 +52,30 @@ public class MathUtils {
|
|||
/** Private constructor. No instantiating this class! */
|
||||
private MathUtils() {}
|
||||
|
||||
public static double sum( Collection<Number> numbers ) {
|
||||
double sum = 0;
|
||||
for ( Number n : numbers ) {
|
||||
sum += n.doubleValue();
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static double average( Collection<Number> numbers ) {
|
||||
return sum(numbers)/numbers.size();
|
||||
}
|
||||
|
||||
public static double variance( Collection<Number> numbers, Number mean ) {
|
||||
double mn = mean.doubleValue();
|
||||
double var = 0;
|
||||
for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); }
|
||||
return var;
|
||||
}
|
||||
|
||||
public static double variance(Collection<Number> numbers) {
|
||||
return variance(numbers,average(numbers));
|
||||
}
|
||||
|
||||
public static double sum(double[] values) {
|
||||
double s = 0.0;
|
||||
for ( double v : values) s += v;
|
||||
|
|
|
|||
|
|
@ -60,6 +60,23 @@ public class VCFUtils {
|
|||
return data;
|
||||
}
|
||||
|
||||
public static Map<String,VCFHeader> getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
// ignore the rod if lacks the prefix
|
||||
if ( ! source.getName().startsWith(prefix) )
|
||||
continue;
|
||||
|
||||
if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader )
|
||||
data.put(source.getName(), (VCFHeader)source.getHeader());
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the header fields from all VCF rods input by the user
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,182 @@
|
|||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType}
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
|
||||
/**
 * QScript for testing the effectiveness of bootstrap low-pass calling.
 *
 * Pipeline: run the UnifiedGenotyper (numberOfBootstraps + 1) times at low
 * downsampled coverage, merge the bootstrap callsets with
 * BootstrapCallsMerger, build VQSR clusters, recalibrate and cut, strip the
 * header, combine with an existing low-coverage callset, and evaluate.
 */
class BootstrapCalls extends QScript {
  @Argument(doc="Bam file list",shortName="I",required=true)
  var bamList: File = _
  @Argument(doc="Intervals file",shortName="L",required=true)
  var intervalFile: File = _
  @Argument(doc="Output file",shortName="o",required=true)
  var bootstrapMergedOut: File = _
  @Argument(doc="Reference file",shortName="R",required=true)
  var reference: File = _
  @Argument(doc="Downsampling Level",shortName="D",required=false)
  var downsamplingLevel: Int = 4
  @Argument(doc="Num Bootstrap Callsets",shortName="B",required=false)
  var numberOfBootstraps: Int = 25
  @Argument(doc="call confidence",shortName="conf",required=false)
  var standCallConf: Double = 4.0
  @Argument(doc="dbsnp file (vcf version)",shortName="dbsnp",required=false)
  var dbsnp: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf")
  @Argument(doc="sting jar",shortName="s",required=true)
  var sting: File = _

  /**********************
   * URGENT NOTE:
   * for this to do any good you need to take out the random seeds in
   * ReservoirDownsampler: 20
   * MathUtils: 649
   *
   * You will also need to hack the recalibrator to always trust AC (which are no longer integer-valued)
   * and to deal with double-valued AC fields
   */

  def script = {
    val bams: List[File] = extractFileEntries(bamList)

    // shared configuration for every bootstrap UnifiedGenotyper invocation:
    // heavily downsampled calling over the same bams/intervals
    trait UGArgs extends UnifiedGenotyper {
      this.input_file = bams
      this.reference_sequence = reference
      this.dcov = Some(downsamplingLevel)
      this.intervals :+= intervalFile
      this.stand_call_conf = Some(standCallConf)
      this.stand_emit_conf = Some(standCallConf)
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
      this.scatterCount = 20
      this.jarFile = sting
      this.memoryLimit = Some(4)
    }

    // one output VCF per bootstrap replicate: <out>.boot<i>.vcf
    val bootstrapBase = swapExt(bootstrapMergedOut,".vcf",".boot%d.vcf").getAbsolutePath
    var calls : List[UnifiedGenotyper] = Nil
    // NOTE(review): "until (numberOfBootstraps+1)" produces indices
    // 0..numberOfBootstraps, i.e. numberOfBootstraps+1 callsets, not
    // numberOfBootstraps -- confirm this off-by-one is intentional
    for ( i <- 0 until (numberOfBootstraps+1) ) {
      var ug : UnifiedGenotyper = new UnifiedGenotyper with UGArgs
      ug.out = new File(bootstrapBase.format(i))
      // NOTE(review): "Boostrap" is a typo for "Bootstrap"; left as-is since
      // it is a runtime string (analysis name)
      ug.analysisName = "Boostrap%d".format(i)
      calls :+= ug
    }

    addAll(calls)

    // merge all bootstrap VCFs into one callset, binding each callset's
    // output as rod "bootstrap_<index>"
    trait MergeArgs extends BootstrapCallsMerger {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.scatterCount = 40
      this.jarFile = sting
      this.memoryLimit = Some(4)
      this.rodBind ++= calls.map(u => u.out).zipWithIndex.map(u => new RodBind("bootstrap_%d".format(u._2),"vcf",u._1))
      this.out = bootstrapMergedOut
    }

    var merge : BootstrapCallsMerger = new BootstrapCallsMerger with MergeArgs
    add(merge)

    // VQSR step 1: generate Gaussian clusters from the merged calls using
    // HapMap 3.3 and Omni 2.5 as training/truth resources
    trait ClusterArgs extends GenerateVariantClusters {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("input","vcf",merge.out)
      this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster")
      this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
      this.qual = Some(100)
      this.std = Some(3.5)
      this.mG = Some(8)
      this.trustAllPolymorphic = true
      this.memoryLimit = Some(8)
      this.jarFile = sting
    }

    var clust : GenerateVariantClusters = new GenerateVariantClusters with ClusterArgs
    add(clust)

    // VQSR step 2: recalibrate against the clusters, selecting tranches by
    // truth sensitivity (same resource files as the clustering step)
    trait VQSRArgs extends VariantRecalibrator {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.out = swapExt(bootstrapMergedOut,"vcf","recal.vcf")
      this.rodBind :+= new RodBind("input","vcf",merge.out)
      this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster")
      this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
      this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0")
      this.trustAllPolymorphic = true
      this.tranchesFile = swapExt(bootstrapMergedOut,"vcf","tranche")
      this.memoryLimit=Some(8)
      this.jarFile = sting
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
    }

    var recal : VariantRecalibrator = new VariantRecalibrator with VQSRArgs
    add(recal)

    // VQSR step 3: apply the tranche cut at the chosen FDR level
    trait CutArgs extends ApplyVariantCuts {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("input","vcf",recal.out)
      this.tranchesFile = recal.tranchesFile
      this.fdr_filter_level = Some(1.0)
      this.out = swapExt(bootstrapMergedOut,".vcf",".recal.cut.vcf")
      this.jarFile = sting
      this.memoryLimit = Some(4)
      this.scatterCount = 5
    }

    var cut : ApplyVariantCuts = new ApplyVariantCuts with CutArgs
    add(cut)

    // shell step: keep only the first header line (#CHROM line excepted --
    // the command keeps line 1 and drops all "##" meta lines)
    class RmHeader extends CommandLineFunction {
      @Input(doc="vcf") var vcf : File = _
      @Output(doc="headerless vcf") var noheadvcf : File = _

      def commandLine : String = {
        "head -n 1 %s > %s ; grep -v \\#\\# %s >> %s".format(vcf.getAbsolutePath,noheadvcf.getAbsolutePath,
          vcf.getAbsolutePath,noheadvcf.getAbsolutePath)
      }
    }

    var rm : RmHeader = new RmHeader
    rm.vcf = cut.out
    rm.noheadvcf = swapExt(cut.out,".vcf",".nohead.vcf")
    add(rm)

    // combine the bootstrap (hiCov) callset with a fixed existing low-pass
    // (loCov) callset, preferring hiCov genotypes where both have calls
    trait CombineArgs extends CombineVariants {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("hiCov","vcf",rm.noheadvcf)
      this.rodBind :+= new RodBind("loCov","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf"))
      this.variantMergeOptions = Some(VariantMergeType.UNION)
      this.genotypeMergeOptions = Some(GenotypeMergeType.PRIORITIZE)
      this.priority = "hiCov,loCov"
      this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.vcf")
      this.jarFile = sting
    }

    var combine : CombineVariants = new CombineVariants with CombineArgs
    add(combine)

    // evaluate the combined callset, stratified by filter status, novelty
    // (vs. dbsnp) and by which input set each record came from
    trait EvalArgs extends VariantEval {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("eval","vcf",combine.out)
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
      this.jarFile = sting
      this.ST = List("Filter","Novelty","JexlExpression")
      this.select_names = List("lowOnly","filteredInLow","Intersection","filteredInHi","hiOnly","filteredInAll")
      this.select_exps = List("\"set == 'loCov'\"","\"set == 'hiCov-filterInloCov'\"",
        "\"set == 'Intersection'\"", "\"set == 'filterInhiCov-loCov'\"",
        "\"set == 'hiCov'\"","\"set == 'FilteredInAll'\"")
      this.EV = List("TiTvVariantEvaluator","CountVariants","CompOverlap")
      this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.eval")
      this.nt = Some(8)
      this.memoryLimit = Some(12)
    }

    var eval : VariantEval = new VariantEval with EvalArgs
    add(eval)
  }
}
|
||||
Loading…
Reference in New Issue