Miscellaneous changes:

- RefMetaDataTracker: grabbing variant contexts given a prefix (not sure where else this was implemented, if someone can show me I'll remove it)
 - VCFUtils: grabbing VCF headers given a prefix 
 - MathUtils: Useful functions for calculating statistics on collections of Numbers
 - VariantAnnotator: Made isUniqueHeaderLine a public static method -- maybe this should go into a different class. Not sure.
 - Associations: PluginManager now used to propagate classes, implementations for Z,T,U tests, slight alteration to format to make the objects stored
      in the window optionally different from those returned by whatever statistic is run across the window
Added:
 - MannWhitneyU. Started to fix up WilcoxonRankSum but there are comments in there questioning the validity of some of the code, and I'm sure that
    it's actually doing a U test. This implementation includes the direct calculation of p-values for small sample sizes, and a uniform approximation
    for when one of the sample sets is small, and the other large. Unit tests to follow.
 - BootstrapCallsMerger: takes n VCFs which have been called on the same samples; merges them together while averaging the annotations
 - BootstrapCalls.q: qscript for testing the effectiveness of bootstrap low-pass calling on the exome
 


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5372 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-03-03 22:43:36 +00:00
parent 8ae42b70ac
commit 9ca1dd5d62
11 changed files with 705 additions and 31 deletions

View File

@ -276,6 +276,19 @@ public class RefMetaDataTracker {
return contexts;
}
/**
 * Gets the variant contexts bound under any track whose name matches one of the given names,
 * treated as prefixes rather than exact names; otherwise behaves like getVariantContexts.
 *
 * @param names            the rod-name prefixes to look up
 * @param allowedTypes     variant types to keep (null presumably means all -- see addVariantContexts)
 * @param requireStartHere if true, only contexts starting at curLocation are kept
 * @param takeFirstOnly    if true, only the first matching context per track is kept
 * @return all matching variant contexts (possibly empty, never null)
 */
public Collection<VariantContext> getVariantContextsByPrefix(ReferenceContext ref, Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
for ( String name : names ) {
RODRecordList rodList = getTrackDataByName(name,false); // false: do NOT require an exact name match, i.e. prefix lookup -- TODO confirm against getTrackDataByName
if ( rodList != null )
addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
}
return contexts;
}
/**
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
* see getVariantContexts for more information.

View File

@ -156,7 +156,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
}
}
private static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) {
public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) {
if ( !(line instanceof VCFCompoundHeaderLine) )
return true;

View File

@ -15,12 +15,12 @@ import java.util.Map;
* Time: 11:58 AM
* To change this template use File | Settings | File Templates.
*/
public abstract class AssociationContext<X> {
public abstract class AssociationContext<X,Y> {
protected List<Map<Sample,Object>> window;
protected List<Map<Sample,Y>> window;
public AssociationContext() {
window = new ArrayList<Map<Sample,Object>>(getWindowSize());
window = new ArrayList<Map<Sample,Y>>(getWindowSize());
}
// specifies size of window
@ -36,7 +36,7 @@ public abstract class AssociationContext<X> {
public abstract Object map(ReadBackedPileup rbp);
// specifies how to take the per-sample data and reduce them into testable pairs
public abstract Map<?,X> reduce(List<Map<Sample,X>> win);
public abstract Map<?,X> reduce(List<Map<Sample,Y>> win);
// do we filter the current location (e.g. omit from window)
public boolean filter(MapExtender m) { return true; }
@ -60,7 +60,7 @@ public abstract class AssociationContext<X> {
}
public void addData(Map<Sample,Object> sampleData) {
public void addData(Map<Sample,Y> sampleData) {
window.add(sampleData);
}
@ -70,7 +70,7 @@ public abstract class AssociationContext<X> {
}
public void slide() {
ArrayList<Map<Sample,Object>> newWindow = new ArrayList<Map<Sample,Object>>((window.subList(slideByValue(),window.size())));
ArrayList<Map<Sample,Y>> newWindow = new ArrayList<Map<Sample,Y>>((window.subList(slideByValue(),window.size())));
newWindow.ensureCapacity(getWindowSize());
window = newWindow;
}

View File

@ -1,12 +1,17 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import cern.jet.random.Normal;
import cern.jet.random.StudentT;
import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.*;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.WilcoxonRankSum;
import org.broadinstitute.sting.utils.collections.Pair;
/**
* Created by IntelliJ IDEA.
@ -28,26 +33,58 @@ public class AssociationTestRunner {
results.add(runZ((ZStatistic) context));
}
if ( context instanceof FisherExact ) {
results.add(runFisherExact(context));
if ( context instanceof UStatistic ) {
results.add(runU((UStatistic) context));
}
return results;
}
public static String runStudentT(TStatistic context) { /*
StudentT studentT = new StudentT(dfnum/dfdenom,null);
public static String runStudentT(TStatistic context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1);
double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl);
StudentT studentT = new StudentT(df_num/df_denom,null);
double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t));
return String.format("T: %.2f\tP: %.2e",t,p);*/
return null;
return String.format("T: %.2f\tP: %.2e",t,p);
}
public static String runZ(ZStatistic context) { /*
double z = num/Math.sqrt(se);
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
return String.format("Z: %.2f\tP: %.2e",z,p);*/
public static String runZ(ZStatistic context) {
Map<CaseControl.Cohort,Pair<Number,Number>> caseControlCounts = context.getCaseControl();
double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue();
double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue();
return null;
double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/
(caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue());
double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl));
double z = (pCase-pControl)/se;
double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z));
return String.format("Z: %.2f\tP: %.2e",z,p);
}
/**
 * Runs a two-sided Mann-Whitney U test on the case/control observation vectors held by the context.
 *
 * @param context the U-statistic association context supplying per-cohort observation vectors
 * @return a tab-separated string holding the U statistic and its two-sided p-value
 */
public static String runU(UStatistic context) {
Map<CaseControl.Cohort,Collection<Number>> caseControlVectors = context.getCaseControl();
MannWhitneyU mwu = new MannWhitneyU();
// cases are loaded as set 1, controls as set 2
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) {
mwu.add(n, MannWhitneyU.USet.SET1);
}
for ( Number n : caseControlVectors.get(CaseControl.Cohort.CONTROL) ) {
mwu.add(n,MannWhitneyU.USet.SET2);
}
Pair<Integer,Double> results = mwu.runTwoSidedTest();
return String.format("U: %d\tP: %.2e",results.first,results.second);
}
public static String runFisherExact(AssociationContext context) {

View File

@ -9,13 +9,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.lang.reflect.Modifier;
import java.util.*;
/**
* @Author chartl
@ -23,7 +23,7 @@ import java.util.Set;
* Generalized framework for regional (windowed) associations
*/
public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAssociationHandler> implements TreeReducible<RegionalAssociationHandler> {
@Argument(doc="foo",shortName="AT",fullName="associationType",required=false)
@Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false)
public String[] associationsToUse = null;
@Output
@ -57,17 +57,23 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
}
return rac;
}
private AssociationContext stringToAssociationContext(String s) {
throw new UserException(String.format("AssociationContextOld type %s not found.",s));
}
private Set<AssociationContext> getAssociations() {
// todo -- this should use the package handler like variant eval
List<Class<? extends AssociationContext>> contexts = new PluginManager<AssociationContext>(AssociationContext.class).getPlugins();
Map<String,Class<? extends AssociationContext>> classNameToClass = new HashMap<String,Class<? extends AssociationContext>>(contexts.size());
for ( Class<? extends AssociationContext> clazz : contexts ) {
if (! Modifier.isAbstract(clazz.getModifiers())) {
classNameToClass.put(clazz.getSimpleName(),clazz);
}
}
Set<AssociationContext> validAssociations = new HashSet<AssociationContext>();
for ( String s : associationsToUse ) {
AssociationContext context = stringToAssociationContext(s);
AssociationContext context;
try {
context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[]{});
} catch ( Exception e ) {
throw new StingException("The class "+s+" could not be instantiated.",e);
}
context.init(this);
validAssociations.add(context);
}

View File

@ -12,7 +12,11 @@ import java.util.Map;
* Created by IntelliJ IDEA.
* @author chartl
*/
public abstract class CaseControl<X> extends AssociationContext<X> {
public abstract class CaseControl<X> extends AssociationContext<X,X> {
public Map<Cohort,X> getCaseControl() {
return reduce(window);
}
public Map<Cohort,X> reduce(List<Map<Sample,X>> window) {
X accumCase = null;

View File

@ -0,0 +1,180 @@
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
import com.google.common.collect.ArrayListMultimap;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import javax.management.NotificationBroadcaster;
import java.util.*;
/**
 * Merges N callsets (rods bound with the prefix "bootstrap") that have been called on the same
 * set of samples: the INFO annotations in ANNOTS_TO_AVG are averaged across the callsets, a
 * site-level NB count records how many bootstrap sets saw the site, and a per-sample BC field
 * records genotype concordance counts (ref,het,var,nocall) across the bootstraps.
 *
 * @author chartl
 */
public class BootstrapCallsMerger extends RodWalker<BootstrapCallsMerger.VCHolder,VCFWriter> implements TreeReducible<VCFWriter>{

    @Output
    VCFWriter vcfWriter = null;

    // todo -- remove this, can be done just by looking at the type and iterating over ANNOTS_TO_AVG
    // todo -- multi-allelic sites (e.g. what happens here:)
    // todo --   Set 1: A G,T AC=2,4 ??
    // todo --   Set 1: A G AC=2   Set 2: A T AC=4
    // todo -- fix above behavior
    // integer-typed annotations whose averages become fractional, so their header lines must be retyped to Float
    final static private Set<String> INTEGER_ANNOTS_CAN_MEAN = new HashSet<String>(Arrays.asList("AC","AN"));
    // the INFO annotations that get averaged across the bootstrap callsets
    final static private Set<String> ANNOTS_TO_AVG = new HashSet<String>(Arrays.asList(
            "QD","SB","HaplotypeScore","Dels","MQ","MQ0","sumGLByD","AC","AF","AN"));

    /**
     * Builds the merged VCF header: the union of sample names and (retyped) metadata lines from
     * every "bootstrap"-prefixed rod, plus the new NB (site-level) and BC (genotype-level) fields.
     */
    public void initialize() {
        // grab the samples
        Set<String> samples = new HashSet<String>();

        // setup the header fields
        // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        for ( Map.Entry<String,VCFHeader> headers : VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), "bootstrap").entrySet() ) {
            samples.addAll(headers.getValue().getGenotypeSamples());
            for ( VCFHeaderLine line : headers.getValue().getMetaData() ) {
                logger.debug(line);
                VCFHeaderLine altered = alterHeaderLine(line);
                if ( VariantAnnotator.isUniqueHeaderLine(altered, hInfo) )
                    hInfo.add(altered);
            }
        }

        // note: fixed "bootsrap" typos in the user-visible descriptions
        hInfo.add(new VCFInfoHeaderLine("NB",1,VCFHeaderLineType.Integer,"Number of bootstrap sets site was seen in"));
        hInfo.add(new VCFFormatHeaderLine("BC",4,VCFHeaderLineType.Integer,"Genotype counts across bootstraps: ref,het,var,nocall"));

        VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);
    }

    /**
     * Note: integer annotations will need to become floats, others will not
     * (e.g. HRun won't change, but counts will)
     * @param line a header line from one of the input VCFs
     * @return the line, with its type changed to Float if it is an averaged integer annotation
     */
    private VCFHeaderLine alterHeaderLine(VCFHeaderLine line) {
        if ( line instanceof VCFInfoHeaderLine ) {
            if(INTEGER_ANNOTS_CAN_MEAN.contains(((VCFInfoHeaderLine) line).getName())) {
                return new VCFInfoHeaderLine(((VCFInfoHeaderLine) line).getName(),
                        ((VCFInfoHeaderLine) line).getCount(),
                        VCFHeaderLineType.Float,
                        ((VCFInfoHeaderLine) line).getDescription());
            }
        }
        return line;
    }

    public VCFWriter reduceInit() {
        return vcfWriter;
    }

    /**
     * Collects all bootstrap variant contexts at this locus, averages the ANNOTS_TO_AVG
     * annotations, tallies per-sample genotype states, and emits a single merged record
     * (based on the first context seen) holding the averaged/augmented fields.
     */
    public VCHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) {
        if ( tracker == null ) { return null; }
        Collection<VariantContext> bootstraps = tracker.getVariantContextsByPrefix(ref,Arrays.asList("bootstrap"),null,ref.getLocus(),true,false);
        int num_bootstraps = bootstraps.size();
        if ( num_bootstraps == 0 ) { return null; }
        Map<String,Double> avgInfo = new HashMap<String,Double>(ANNOTS_TO_AVG.size());
        Map<String,Integer[]> genotypeCountsBySample = new HashMap<String,Integer[]>();
        for ( VariantContext vc : bootstraps ) {
            // update genotype counts
            for ( Map.Entry<String,Genotype> genotype : vc.getGenotypes().entrySet() ) {
                if ( ! genotypeCountsBySample.containsKey(genotype.getKey())) {
                    genotypeCountsBySample.put(genotype.getKey(),new Integer[]{0,0,0,0});
                }
                genotypeCountsBySample.get(genotype.getKey())[genotype2offset(genotype.getValue())]++;
            }
            // update info field annotations; each callset contributes value/num_bootstraps to the running mean
            for ( String anno : ANNOTS_TO_AVG ) {
                if ( ! avgInfo.containsKey(anno) ) {
                    avgInfo.put(anno,0.0);
                }
                Object value = vc.getAttribute(anno);
                if ( value instanceof Number ) {
                    avgInfo.put(anno,avgInfo.get(anno) + ((Number)value).doubleValue()/num_bootstraps);
                }
                if ( value instanceof String ) {
                    // NOTE(review): a non-numeric string here would throw NumberFormatException -- confirm inputs
                    avgInfo.put(anno,avgInfo.get(anno) + Double.valueOf((String)value)/num_bootstraps);
                }
            }
        }
        // the first context seen serves as the template for the merged record
        VariantContext first = bootstraps.iterator().next();
        Map<String,Object> finalInfo = new HashMap<String,Object>(first.getAttributes().size());
        for ( Map.Entry<String,Object> attrib : first.getAttributes().entrySet() ) {
            if ( ANNOTS_TO_AVG.contains(attrib.getKey()) ) {
                finalInfo.put(attrib.getKey(),avgInfo.get(attrib.getKey()));
            } else {
                finalInfo.put(attrib.getKey(),attrib.getValue());
            }
        }
        Map<String,Genotype> finalGenotypes = new HashMap<String,Genotype>(first.getSampleNames().size());
        for ( Map.Entry<String,Genotype> g : first.getGenotypes().entrySet() ) {
            Map<String,Object> att = new HashMap<String,Object>(g.getValue().getAttributes());
            att.put("BC",countsToString(genotypeCountsBySample.get(g.getKey())));
            finalGenotypes.put(g.getKey(),Genotype.modifyAttributes(g.getValue(),att));
        }
        finalInfo.put("NB",String.format("%d",num_bootstraps));
        VariantContext attributeModified = VariantContext.modifyAttributes(first,finalInfo);
        logger.debug(attributeModified.hasGenotypes() ? "attributes have genotypes" : "VERY BAD");
        VariantContext genotypeModified = VariantContext.modifyGenotypes(attributeModified,finalGenotypes);
        logger.debug(genotypeModified.hasGenotypes() ? "modified genotypes have genotypes" : "NOT SO BAD");
        return new VCHolder(genotypeModified,ref.getBase());
    }

    /** Formats the four genotype-state counts as the comma-separated BC field value. */
    private static String countsToString(Integer[] c) {
        return String.format("%d,%d,%d,%d",c[0],c[1],c[2],c[3]);
    }

    public VCFWriter treeReduce(VCFWriter l, VCFWriter r) {
        return l;
    }

    /** Writes the merged record (if any) to the VCF writer. */
    public VCFWriter reduce(VCHolder h, VCFWriter w) {
        if ( h != null ) {
            w.add(h.v,h.b);
        }
        return w;
    }

    /** Maps a genotype to its BC-array slot: 0=hom-ref, 1=het, 2=hom-var, 3=no-call/other. */
    private static int genotype2offset(Genotype g) {
        if ( g.isHomRef() ) { return 0; }
        if ( g.isHet() ) { return 1; }
        if ( g.isHomVar() ) { return 2; }
        return 3;
    }

    /** Simple holder pairing a merged variant context with the reference base at its locus. */
    class VCHolder {
        public VariantContext v;
        public Byte b;

        public VCHolder(VariantContext v, Byte b) {
            this.v = v;
            this.b = b;
        }
    }
}

View File

@ -0,0 +1,211 @@
package org.broadinstitute.sting.utils;
import cern.jet.math.Arithmetic;
import cern.jet.random.Normal;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.Comparator;
import java.util.TreeSet;
/**
 * An implementation of the Mann-Whitney U test (two-sample rank test): exact p-values are
 * computed recursively for small sample sizes, with a sum-of-uniform approximation when one
 * set is small and the other large, and a normal approximation when both are large.
 *
 * User: chartl
 */
public class MannWhitneyU {
    // all observations from both sets, rank-ordered; rank ties are broken at random by the comparator
    private TreeSet<Pair<Number,USet>> observations;
    private int sizeSet1;
    private int sizeSet2;

    public MannWhitneyU() {
        observations = new TreeSet<Pair<Number,USet>>(new DitheringComparator());
        sizeSet1 = 0;
        sizeSet2 = 0;
    }

    /**
     * Add an observation into the observation tree
     * @param n: the observation (a number)
     * @param set: whether the observation comes from set 1 or set 2
     */
    public void add(Number n, USet set) {
        observations.add(new Pair<Number,USet>(n,set));
        if ( set == USet.SET1 ) {
            ++sizeSet1;
        } else {
            ++sizeSet2;
        }
    }

    /**
     * temporary method that will be generalized. Runs the standard two-sided test,
     * returns the u and p values.
     * @return a pair holding the u and p-value.
     */
    public Pair<Integer,Double> runTwoSidedTest() {
        Pair<Integer,USet> uPair = calculateTwoSidedU(observations);
        int u = uPair.first;
        // orient n/m so that n is the size of the dominated set (the one whose U was smaller)
        int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2;
        int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1;
        double pval = calculateP(n,m,u,true);
        return new Pair<Integer,Double>(u,pval);
    }

    /**
     * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
     * @param n - The number of entries in the DOMINATED set
     * @param m - The number of entries in the DOMINANT set
     * @param u - the Mann-Whitney U value
     * @param twoSided - is the test twosided
     * @return the (possibly approximate) p-value associated with the MWU test
     */
    public static double calculateP(int n, int m, int u, boolean twoSided) {
        double pval;
        if ( n > 8 && m > 8 ) {
            pval = calculatePNormalApproximation(n,m,u);
        } else if ( n > 4 && m > 8 ) {
            pval = calculatePUniformApproximation(n,m,u);
        } else {
            pval = calculatePRecursively(n,m,u);
        }
        return twoSided ? 2*pval : pval;
    }

    /**
     * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
     * @param n - The number of entries in the DOMINATED set
     * @param m - The number of entries in the DOMINANT set
     * @param u - the Mann-Whitney U value
     * @return p-value associated with the normal approximation
     */
    public static double calculatePNormalApproximation(int n,int m,int u) {
        // NOTE(review): E[U] = m*n/2; the +1 here acts as a continuity-style correction -- confirm intended
        double mean = ((double) m*n+1)/2;
        // fix: Colt's Normal takes (mean, STANDARD DEVIATION); Var(U) = n*m*(n+m+1)/12, so we
        // must pass its square root -- the previous code passed the variance directly
        Normal normal = new Normal( mean , Math.sqrt(((double) n*m*(n+m+1))/12), null);
        return u < mean ? normal.cdf(u) : 1.0-normal.cdf(u);
    }

    /**
     * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
     * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF)
     * @param n - The number of entries in the DOMINATED set
     * @param m - The number of entries in the DOMINANT set
     * @param u - the Mann-Whitney U value
     * @return the approximate p-value
     */
    public static double calculatePUniformApproximation(int n, int m, int u) {
        int R = u + (n*(n+1))/2;
        double a = Math.sqrt(m*(n+m+1));
        // fix: (n+m+1)/m was integer division, truncating the ratio before the sqrt
        double b = (n/2.0)*(1-Math.sqrt(((double)(n+m+1))/m));
        double z = b + R/a;
        if ( z < 0 ) { return 0.0; }
        else if ( z > n ) { return 1.0; }
        else {
            return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
        }
    }

    /**
     * Helper function for the sum of n uniform random variables
     * @param z - value at which to compute the (un-normalized) cdf
     * @param m - a cutoff integer (defined by m <= z < m + 1)
     * @param n - the number of uniform random variables
     * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence)
     * @return the (un-normalized) cdf for the sum of n random variables
     */
    private static double uniformSumHelper(double z, int m, int n, int k) {
        if ( k > m ) { return 0; }
        int coef = (k % 2 == 0) ? 1 : -1;
        return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1);
    }

    /**
     * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
     * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
     * later on simply by multiplying the p-value by 2
     * @param observed the rank-ordered observations from both sets
     * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1), tagged with the dominated set
     */
    public static Pair<Integer,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed ) {
        int set1SeenSoFar = 0;
        int set2SeenSoFar = 0;
        int uSet1DomSet2 = 0;
        int uSet2DomSet1 = 0;
        USet previous = null;
        for ( Pair<Number,USet> dataPoint : observed ) {
            // each time the set label flips, credit the incoming set with the number of
            // opposite-set observations already seen (i.e. those it dominates)
            if ( previous != null && previous != dataPoint.second ) {
                if ( dataPoint.second == USet.SET1 ) {
                    uSet2DomSet1 += set2SeenSoFar;
                } else {
                    uSet1DomSet2 += set1SeenSoFar;
                }
            }
            if ( dataPoint.second == USet.SET1 ) {
                ++set1SeenSoFar;
            } else {
                ++set2SeenSoFar;
            }
            previous = dataPoint.second;
        }
        return uSet1DomSet2 < uSet2DomSet1 ? new Pair<Integer,USet>(uSet1DomSet2,USet.SET1) : new Pair<Integer,USet>(uSet2DomSet1,USet.SET2);
    }

    /**
     * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
     * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
     * calls into that recursive calculation.
     * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two)
     * @param m: number of set-two entries
     * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
     * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
     */
    public static double calculatePRecursively(int n, int m, int u) {
        if ( m > 6 && n > 4 || m + n > 16 ) { throw new StingException("Please use the appropriate (normal or sum of uniform) approximation"); }
        return cpr(n,m,u);
    }

    /**
     * just a shorter name for calculatePRecursively. See Mann, Whitney, [1947]
     * @param n: number of set-1 entries
     * @param m: number of set-2 entries
     * @param u: number of times a set-2 entry has preceded a set-1 entry
     */
    private static double cpr(int n, int m, int u) {
        if ( u < 0 || n == 0 && m == 0 ) {
            return 0.0;
        }
        if ( m == 0 || n == 0 ) {
            // no entries remain in one of the sets, so no set-2 entry can precede a set-1 entry; thus u must be zero.
            // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway
            return ( u == 0 ) ? 1.0 : 0.0;
        }
        // fix: n and m are ints, so n/(n+m) and m/(n+m) were INTEGER divisions that truncate to
        // zero for all n,m > 0, making the recursion always return 0; divide in floating point
        double total = n + m;
        return (n/total)*cpr(n-1,m,u-m) + (m/total)*cpr(n,m-1,u);
    }

    /**
     * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values
     * and to ensure that rank ties are broken at random.
     */
    private class DitheringComparator implements Comparator<Pair<Number,USet>> {

        public DitheringComparator() {}

        public boolean equals(Object other) { return false; }

        public int compare(Pair<Number,USet> left, Pair<Number,USet> right) {
            double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue());
            if ( comp > 0 ) { return 1; }
            if ( comp < 0 ) { return -1; }
            // ties: flip a coin so equal values are kept (TreeSet would otherwise drop duplicates)
            return MathUtils.rand.nextBoolean() ? -1 : 1;
        }
    }

    public enum USet { SET1, SET2 }
}

View File

@ -52,6 +52,30 @@ public class MathUtils {
/** Private constructor. No instantiating this class! */
private MathUtils() {}
/**
 * Sums a collection of numbers as doubles.
 * @param numbers the values to sum
 * @return the sum (0.0 for an empty collection)
 */
public static double sum( Collection<Number> numbers ) {
    double sum = 0;
    for ( Number n : numbers ) {
        sum += n.doubleValue();
    }
    return sum;
}

/**
 * Arithmetic mean of a collection of numbers.
 * Note: returns NaN for an empty collection (0/0 division).
 */
public static double average( Collection<Number> numbers ) {
    return sum(numbers)/numbers.size();
}

/**
 * Unbiased sample variance of a collection of numbers about a pre-computed mean.
 * Fix: the previous version returned the raw sum of squared deviations (it never divided),
 * which is not a variance at all.
 * @param numbers the observations
 * @param mean the pre-computed mean of the observations
 * @return the sample variance (sum of squared deviations over n-1), or 0.0 for fewer than two values
 */
public static double variance( Collection<Number> numbers, Number mean ) {
    if ( numbers.size() < 2 ) { return 0.0; }
    double mn = mean.doubleValue();
    double var = 0;
    for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); }
    return var/(numbers.size()-1);
}

/** Convenience overload: sample variance about the collection's own mean. */
public static double variance(Collection<Number> numbers) {
    return variance(numbers,average(numbers));
}
public static double sum(double[] values) {
double s = 0.0;
for ( double v : values) s += v;

View File

@ -60,6 +60,23 @@ public class VCFUtils {
return data;
}
/**
 * Gets the VCF headers of all reference-ordered data sources whose rod names begin with
 * the given prefix.
 *
 * @param toolkit the engine holding the bound reference-ordered data sources
 * @param prefix the rod-name prefix to select on
 * @return a mapping from rod name to its VCFHeader, for every matching rod that has one
 */
public static Map<String,VCFHeader> getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) {
    Map<String, VCFHeader> headersByName = new HashMap<String, VCFHeader>();
    for ( ReferenceOrderedDataSource source : toolkit.getRodDataSources() ) {
        // only consider rods carrying the requested prefix
        if ( source.getName().startsWith(prefix) ) {
            Object header = source.getHeader();
            // instanceof is false for null, so this also covers the missing-header case
            if ( header instanceof VCFHeader ) {
                headersByName.put(source.getName(), (VCFHeader) header);
            }
        }
    }
    return headersByName;
}
/**
* Gets the header fields from all VCF rods input by the user
*

View File

@ -0,0 +1,182 @@
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType}
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
/**
 * QScript for testing the effectiveness of bootstrap low-pass calling on the exome:
 * makes B+1 downsampled UnifiedGenotyper callsets, merges them while averaging annotations
 * (BootstrapCallsMerger), runs VQSR, strips the header, combines with a low-coverage
 * callset, and evaluates the combined result.
 */
class BootstrapCalls extends QScript {
  @Argument(doc="Bam file list",shortName="I",required=true)
  var bamList: File = _
  @Argument(doc="Intervals file",shortName="L",required=true)
  var intervalFile: File = _
  @Argument(doc="Output file",shortName="o",required=true)
  var bootstrapMergedOut: File = _
  @Argument(doc="Reference file",shortName="R",required=true)
  var reference: File = _
  @Argument(doc="Downsampling Level",shortName="D",required=false)
  var downsamplingLevel: Int = 4
  @Argument(doc="Num Bootstrap Callsets",shortName="B",required=false)
  var numberOfBootstraps: Int = 25
  @Argument(doc="call confidence",shortName="conf",required=false)
  var standCallConf: Double = 4.0
  @Argument(doc="dbsnp file (vcf version)",shortName="dbsnp",required=false)
  var dbsnp: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf")
  @Argument(doc="sting jar",shortName="s",required=true)
  var sting: File = _

  /**********************
   * URGENT NOTE:
   * for this to do any good you need to take out the random seeds in
   * ReservoirDownsampler: 20
   * MathUtils: 649
   *
   * You will also need to hack the recalibrator to always trust AC (which are no longer integer-valued)
   * and to deal with double-valued AC fields
   */
  def script = {
    val bams: List[File] = extractFileEntries(bamList)

    // shared UnifiedGenotyper configuration for every bootstrap callset
    trait UGArgs extends UnifiedGenotyper {
      this.input_file = bams
      this.reference_sequence = reference
      this.dcov = Some(downsamplingLevel)
      this.intervals :+= intervalFile
      this.stand_call_conf = Some(standCallConf)
      this.stand_emit_conf = Some(standCallConf)
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
      this.scatterCount = 20
      this.jarFile = sting
      this.memoryLimit = Some(4)
    }

    val bootstrapBase = swapExt(bootstrapMergedOut,".vcf",".boot%d.vcf").getAbsolutePath
    var calls : List[UnifiedGenotyper] = Nil
    // NOTE(review): indices 0..numberOfBootstraps inclusive, i.e. B+1 callsets -- confirm the
    // extra callset is intentional
    for ( i <- 0 until (numberOfBootstraps+1) ) {
      var ug : UnifiedGenotyper = new UnifiedGenotyper with UGArgs
      ug.out = new File(bootstrapBase.format(i))
      // fix: typo "Boostrap" in the analysis name
      ug.analysisName = "Bootstrap%d".format(i)
      calls :+= ug
    }
    addAll(calls)

    // merge the bootstrap callsets, averaging annotations; each callset is bound as bootstrap_<i>
    trait MergeArgs extends BootstrapCallsMerger {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.scatterCount = 40
      this.jarFile = sting
      this.memoryLimit = Some(4)
      this.rodBind ++= calls.map(u => u.out).zipWithIndex.map(u => new RodBind("bootstrap_%d".format(u._2),"vcf",u._1))
      this.out = bootstrapMergedOut
    }
    var merge : BootstrapCallsMerger = new BootstrapCallsMerger with MergeArgs
    add(merge)

    // VQSR step 1: cluster generation over the merged calls
    trait ClusterArgs extends GenerateVariantClusters {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("input","vcf",merge.out)
      this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster")
      this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
      this.qual = Some(100)
      this.std = Some(3.5)
      this.mG = Some(8)
      this.trustAllPolymorphic = true
      this.memoryLimit = Some(8)
      this.jarFile = sting
    }
    var clust : GenerateVariantClusters = new GenerateVariantClusters with ClusterArgs
    add(clust)

    // VQSR step 2: recalibration using the clusters generated above
    trait VQSRArgs extends VariantRecalibrator {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.out = swapExt(bootstrapMergedOut,"vcf","recal.vcf")
      this.rodBind :+= new RodBind("input","vcf",merge.out)
      this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"))
      this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf"))
      this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster")
      this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
      this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0")
      this.trustAllPolymorphic = true
      this.tranchesFile = swapExt(bootstrapMergedOut,"vcf","tranche")
      this.memoryLimit=Some(8)
      this.jarFile = sting
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
    }
    var recal : VariantRecalibrator = new VariantRecalibrator with VQSRArgs
    add(recal)

    // VQSR step 3: apply the tranche cuts
    trait CutArgs extends ApplyVariantCuts {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("input","vcf",recal.out)
      this.tranchesFile = recal.tranchesFile
      this.fdr_filter_level = Some(1.0)
      this.out = swapExt(bootstrapMergedOut,".vcf",".recal.cut.vcf")
      this.jarFile = sting
      this.memoryLimit = Some(4)
      this.scatterCount = 5
    }
    var cut : ApplyVariantCuts = new ApplyVariantCuts with CutArgs
    add(cut)

    // strip the ## meta lines from the cut VCF, keeping only the #CHROM line and records
    class RmHeader extends CommandLineFunction {
      @Input(doc="vcf") var vcf : File = _
      @Output(doc="headerless vcf") var noheadvcf : File = _
      def commandLine : String = {
        "head -n 1 %s > %s ; grep -v \\#\\# %s >> %s".format(vcf.getAbsolutePath,noheadvcf.getAbsolutePath,
          vcf.getAbsolutePath,noheadvcf.getAbsolutePath)
      }
    }
    var rm : RmHeader = new RmHeader
    rm.vcf = cut.out
    rm.noheadvcf = swapExt(cut.out,".vcf",".nohead.vcf")
    add(rm)

    // combine the recalibrated bootstrap calls with the existing low-coverage callset
    trait CombineArgs extends CombineVariants {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("hiCov","vcf",rm.noheadvcf)
      this.rodBind :+= new RodBind("loCov","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf"))
      this.variantMergeOptions = Some(VariantMergeType.UNION)
      this.genotypeMergeOptions = Some(GenotypeMergeType.PRIORITIZE)
      this.priority = "hiCov,loCov"
      this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.vcf")
      this.jarFile = sting
    }
    var combine : CombineVariants = new CombineVariants with CombineArgs
    add(combine)

    // evaluate the combined callset, stratified by the callset-of-origin "set" attribute
    trait EvalArgs extends VariantEval {
      this.reference_sequence = reference
      this.intervals :+= intervalFile
      this.rodBind :+= new RodBind("eval","vcf",combine.out)
      this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp)
      this.jarFile = sting
      this.ST = List("Filter","Novelty","JexlExpression")
      this.select_names = List("lowOnly","filteredInLow","Intersection","filteredInHi","hiOnly","filteredInAll")
      this.select_exps = List("\"set == 'loCov'\"","\"set == 'hiCov-filterInloCov'\"",
        "\"set == 'Intersection'\"", "\"set == 'filterInhiCov-loCov'\"",
        "\"set == 'hiCov'\"","\"set == 'FilteredInAll'\"")
      this.EV = List("TiTvVariantEvaluator","CountVariants","CompOverlap")
      this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.eval")
      this.nt = Some(8)
      this.memoryLimit = Some(12)
    }
    var eval : VariantEval = new VariantEval with EvalArgs
    add(eval)
  }
}