From 9ca1dd5d62a45d8a485a70deb158f29d459aa6cf Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 3 Mar 2011 22:43:36 +0000 Subject: [PATCH] Miscellaneous changes: - RefMetaDataTracker: grabbing variant contexts given a prefix (not sure where else this was implemented, if someone can show me I'll remove it) - VCFUtils: grabbing VCF headers given a prefix - MathUtils: Useful functions for calculating statistics on collections of Numbers - VariantAnnotator: Made isUniqueHeaderLine a public static method -- maybe this should go into a different class. Not sure. - Associations: PluginManager now used to propagate classes, implementations for Z,T,U tests, slight alteration to format to make the objects stored in the window optionally different from those returned by whatever statistic is run across the window Added: - MannWhitneyU. Started to fix up WilcoxonRankSum but there are comments in there questioning the validity of some of the code, and I'm sure that it's actually doing a U test. This implementation includes the direct calculation of p-values for small sample sizes, and a uniform approximation for when one of the sample sets is small, and the other large. Unit tests to follow. - BootstrapCallsMerger: takes n VCFs which have been called on the same samples; merges them together while averaging the annotations - BootstrapCalls.q: qscript for testing the effectiveness of boostrap low-pass calling on the exome git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5372 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/RefMetaDataTracker.java | 13 ++ .../walkers/annotator/VariantAnnotator.java | 2 +- .../association/AssociationContext.java | 12 +- .../association/AssociationTestRunner.java | 59 ++++- .../RegionalAssociationWalker.java | 30 ++- .../statistics/casecontrol/CaseControl.java | 6 +- .../vcftools/BootstrapCallsMerger.java | 180 +++++++++++++++ .../sting/utils/MannWhitneyU.java | 211 ++++++++++++++++++ .../broadinstitute/sting/utils/MathUtils.java | 24 ++ .../sting/utils/vcf/VCFUtils.java | 17 ++ scala/qscript/oneoffs/chartl/BootstrapCalls.q | 182 +++++++++++++++ 11 files changed, 705 insertions(+), 31 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java create mode 100755 java/src/org/broadinstitute/sting/utils/MannWhitneyU.java create mode 100755 scala/qscript/oneoffs/chartl/BootstrapCalls.q diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 4a9c6f819..48f3354d2 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -276,6 +276,19 @@ public class RefMetaDataTracker { return contexts; } + public Collection getVariantContextsByPrefix(ReferenceContext ref, Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + Collection contexts = new ArrayList(); + + for ( String name : names ) { + RODRecordList rodList = getTrackDataByName(name,false); // require that the name is an exact match + + if ( rodList != null ) + addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); + } + + return contexts; + } + /** * Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not. * see getVariantContexts for more information. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index b9cddb91f..b3c99b5ed 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -156,7 +156,7 @@ public class VariantAnnotator extends RodWalker { } } - private static boolean isUniqueHeaderLine(VCFHeaderLine line, Set currentSet) { + public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set currentSet) { if ( !(line instanceof VCFCompoundHeaderLine) ) return true; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java index 33d23e33d..99f9f52ef 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java @@ -15,12 +15,12 @@ import java.util.Map; * Time: 11:58 AM * To change this template use File | Settings | File Templates. */ -public abstract class AssociationContext { +public abstract class AssociationContext { - protected List> window; + protected List> window; public AssociationContext() { - window = new ArrayList>(getWindowSize()); + window = new ArrayList>(getWindowSize()); } // specifies size of window @@ -36,7 +36,7 @@ public abstract class AssociationContext { public abstract Object map(ReadBackedPileup rbp); // specifies how to take the per-sample data and reduce them into testable pairs - public abstract Map reduce(List> win); + public abstract Map reduce(List> win); // do we filter the current location (e.g. omit from window) public boolean filter(MapExtender m) { return true; } @@ -60,7 +60,7 @@ public abstract class AssociationContext { } - public void addData(Map sampleData) { + public void addData(Map sampleData) { window.add(sampleData); } @@ -70,7 +70,7 @@ public abstract class AssociationContext { } public void slide() { - ArrayList> newWindow = new ArrayList>((window.subList(slideByValue(),window.size()))); + ArrayList> newWindow = new ArrayList>((window.subList(slideByValue(),window.size()))); newWindow.ensureCapacity(getWindowSize()); window = newWindow; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java index 2eb092d34..376f0747c 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java @@ -1,12 +1,17 @@ package org.broadinstitute.sting.oneoffprojects.walkers.association; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Map; import cern.jet.random.Normal; import cern.jet.random.StudentT; import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.*; +import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.WilcoxonRankSum; +import org.broadinstitute.sting.utils.collections.Pair; /** * Created by IntelliJ IDEA. @@ -28,26 +33,58 @@ public class AssociationTestRunner { results.add(runZ((ZStatistic) context)); } - if ( context instanceof FisherExact ) { - results.add(runFisherExact(context)); + if ( context instanceof UStatistic ) { + results.add(runU((UStatistic) context)); } return results; } - public static String runStudentT(TStatistic context) { /* - StudentT studentT = new StudentT(dfnum/dfdenom,null); + public static String runStudentT(TStatistic context) { + Map> caseControlVectors = context.getCaseControl(); + double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE)); + double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase); + double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size(); + double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL)); + double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl); + double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size(); + + double df_num = Math.pow(varCase/nCase + varControl/nControl,2); + double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1); + double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl); + + StudentT studentT = new StudentT(df_num/df_denom,null); double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t)); - return String.format("T: %.2f\tP: %.2e",t,p);*/ - return null; + return String.format("T: %.2f\tP: %.2e",t,p); } - public static String runZ(ZStatistic context) { /* - double z = num/Math.sqrt(se); - double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z)); - return String.format("Z: %.2f\tP: %.2e",z,p);*/ + public static String runZ(ZStatistic context) { + Map> caseControlCounts = context.getCaseControl(); + double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue(); + double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue(); + double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue(); + double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue(); - return null; + double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/ + (caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()); + double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl)); + + double z = (pCase-pControl)/se; + double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z)); + return String.format("Z: %.2f\tP: %.2e",z,p); + } + + public static String runU(UStatistic context) { + Map> caseControlVectors = context.getCaseControl(); + MannWhitneyU mwu = new MannWhitneyU(); + for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) { + mwu.add(n, MannWhitneyU.USet.SET1); + } + for ( Number n : caseControlVectors.get(CaseControl.Cohort.CONTROL) ) { + mwu.add(n,MannWhitneyU.USet.SET2); + } + Pair results = mwu.runTwoSidedTest(); + return String.format("U: %d\tP: %.2e",results.first,results.second); } public static String runFisherExact(AssociationContext context) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java index 70a8d2ff4..e652c8c3d 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java @@ -9,13 +9,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.PrintStream; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.lang.reflect.Modifier; +import java.util.*; /** * @Author chartl @@ -23,7 +23,7 @@ import java.util.Set; * Generalized framework for regional (windowed) associations */ public class RegionalAssociationWalker extends LocusWalker implements TreeReducible { - @Argument(doc="foo",shortName="AT",fullName="associationType",required=false) + @Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false) public String[] associationsToUse = null; @Output @@ -57,17 +57,23 @@ public class RegionalAssociationWalker extends LocusWalker getAssociations() { - // todo -- this should use the package handler like variant eval + List> contexts = new PluginManager(AssociationContext.class).getPlugins(); + Map> classNameToClass = new HashMap>(contexts.size()); + for ( Class clazz : contexts ) { + if (! Modifier.isAbstract(clazz.getModifiers())) { + classNameToClass.put(clazz.getSimpleName(),clazz); + } + } + Set validAssociations = new HashSet(); for ( String s : associationsToUse ) { - AssociationContext context = stringToAssociationContext(s); + AssociationContext context; + try { + context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[]{}); + } catch ( Exception e ) { + throw new StingException("The class "+s+" could not be instantiated.",e); + } context.init(this); validAssociations.add(context); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java index f5764ff23..c43898327 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java @@ -12,7 +12,11 @@ import java.util.Map; * Created by IntelliJ IDEA. * @author chartl */ -public abstract class CaseControl extends AssociationContext { +public abstract class CaseControl extends AssociationContext { + + public Map getCaseControl() { + return reduce(window); + } public Map reduce(List> window) { X accumCase = null; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java new file mode 100755 index 000000000..4d9de6eda --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java @@ -0,0 +1,180 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import com.google.common.collect.ArrayListMultimap; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import javax.management.NotificationBroadcaster; +import java.util.*; + +/** + * @doc Merges N callsets that have been made on the same set of samples, and averages specific annotations. + * @Author chartl + * + */ +public class BootstrapCallsMerger extends RodWalker implements TreeReducible{ + + @Output + VCFWriter vcfWriter = null; + + // todo -- remove this, can be done just by looking at the type and iterating over ANNOTS_TO_AVG + // todo -- multi-allelic sites (e.g. what happens here:) + // todo -- Set 1: A G,T AC=2,4 ?? + // todo -- Set 1: A G AC=2 Set 2: A T AC=4 + // todo -- fix above behavior + final static private Set INTEGER_ANNOTS_CAN_MEAN = new HashSet(Arrays.asList("AC","AN")); + final static private Set ANNOTS_TO_AVG = new HashSet(Arrays.asList( + "QD","SB","HaplotypeScore","Dels","MQ","MQ0","sumGLByD","AC","AF","AN")); + + public void initialize() { + // grab the samples + Set samples = new HashSet(); + // setup the header fields + // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones + Set hInfo = new HashSet(); + for ( Map.Entry headers : VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), "bootstrap").entrySet() ) { + samples.addAll(headers.getValue().getGenotypeSamples()); + for ( VCFHeaderLine line : headers.getValue().getMetaData() ) { + logger.debug(line); + VCFHeaderLine altered = alterHeaderLine(line); + if ( VariantAnnotator.isUniqueHeaderLine(altered, hInfo) ) + hInfo.add(altered); + } + } + hInfo.add(new VCFInfoHeaderLine("NB",1,VCFHeaderLineType.Integer,"Number of bootsrap sets site was seen in")); + hInfo.add(new VCFFormatHeaderLine("BC",4,VCFHeaderLineType.Integer,"Genotype counts across bootsraps: ref,het,var,nocall")); + HashSet rodName = new HashSet(); + rodName.add("variant"); + VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } + + /** + * Note: integer annotations will need to become floats, others will not + * (e.g. HRun won't change, but counts will) + * @param line + * @return line with type changed + */ + private VCFHeaderLine alterHeaderLine(VCFHeaderLine line) { + if ( line instanceof VCFInfoHeaderLine ) { + if(INTEGER_ANNOTS_CAN_MEAN.contains(((VCFInfoHeaderLine) line).getName())) { + return new VCFInfoHeaderLine(((VCFInfoHeaderLine) line).getName(), + ((VCFInfoHeaderLine) line).getCount(), + VCFHeaderLineType.Float, + ((VCFInfoHeaderLine) line).getDescription()); + } + } + return line; + } + + public VCFWriter reduceInit() { + return vcfWriter; + } + + public VCHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { + if ( tracker == null ) { return null; } + Collection bootstraps = tracker.getVariantContextsByPrefix(ref,Arrays.asList("bootstrap"),null,ref.getLocus(),true,false); + int num_bootstraps = bootstraps.size(); + if ( num_bootstraps == 0 ) { return null; } + Map avgInfo = new HashMap(ANNOTS_TO_AVG.size()); + Map genotypeCountsBySample = new HashMap(); + for ( VariantContext vc : bootstraps ) { + // update genotype counts + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + if ( ! genotypeCountsBySample.containsKey(genotype.getKey())) { + genotypeCountsBySample.put(genotype.getKey(),new Integer[]{0,0,0,0}); + } + genotypeCountsBySample.get(genotype.getKey())[genotype2offset(genotype.getValue())]++; + } + // update info field annotations + for ( String anno : ANNOTS_TO_AVG ) { + if ( ! avgInfo.containsKey(anno) ) { + avgInfo.put(anno,0.0); + } + Object value = vc.getAttribute(anno); + if ( value instanceof Number ) { + //logger.debug(value); + avgInfo.put(anno,avgInfo.get(anno) + ((Number)value).doubleValue()/num_bootstraps); + } + if ( value instanceof String ) { + //logger.debug("string: "+value.toString()); + avgInfo.put(anno,avgInfo.get(anno) + Double.valueOf((String)value)/num_bootstraps); + } + } + } + VariantContext first = bootstraps.iterator().next(); + Map finalInfo = new HashMap(first.getAttributes().size()); + for ( Map.Entry attrib : first.getAttributes().entrySet() ) { + if ( ANNOTS_TO_AVG.contains(attrib.getKey()) ) { + finalInfo.put(attrib.getKey(),avgInfo.get(attrib.getKey())); + } else { + finalInfo.put(attrib.getKey(),attrib.getValue()); + } + } + Map finalGenotypes = new HashMap(first.getSampleNames().size()); + for ( Map.Entry g : first.getGenotypes().entrySet() ) { + Map att = new HashMap(g.getValue().getAttributes()); + att.put("BC",countsToString(genotypeCountsBySample.get(g.getKey()))); + //logger.debug(g.getValue()); + finalGenotypes.put(g.getKey(),Genotype.modifyAttributes(g.getValue(),att)); + //logger.debug("final:"); + //logger.debug(finalGenotypes.get(g.getKey())); + } + + finalInfo.put("NB",String.format("%d",num_bootstraps)); + + VariantContext attributeModified = VariantContext.modifyAttributes(first,finalInfo); + logger.debug(attributeModified.hasGenotypes() ? "attributes have genotypes" : "VERY BAD"); + VariantContext genotypeModified = VariantContext.modifyGenotypes(attributeModified,finalGenotypes); + logger.debug(genotypeModified.hasGenotypes() ? "modified genotypes have genotypes" : "NOT SO BAD"); + + return new VCHolder(genotypeModified,ref.getBase()); + + //return new VCHolder(VariantContext.modifyGenotypes(VariantContext.modifyAttributes(first, finalInfo), finalGenotypes), + // ref.getBase()); + } + + private static String countsToString(Integer[] c) { + return String.format("%d,%d,%d,%d",c[0],c[1],c[2],c[3]); + } + + public VCFWriter treeReduce(VCFWriter l, VCFWriter r) { + return l; + } + + public VCFWriter reduce(VCHolder h, VCFWriter w) { + if ( h != null ) { + w.add(h.v,h.b); + } + + return w; + } + + private static int genotype2offset(Genotype g) { + if ( g.isHomRef() ) { return 0; } + if ( g.isHet() ) { return 1; } + if ( g.isHomVar() ) { return 2; } + return 3; + } + + class VCHolder { + public VariantContext v; + public Byte b; + public VCHolder(VariantContext v, Byte b) { + this.v = v; + this.b = b; + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java new file mode 100755 index 000000000..195cd1f84 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -0,0 +1,211 @@ +package org.broadinstitute.sting.utils; + +import cern.jet.math.Arithmetic; +import cern.jet.random.Normal; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.Comparator; +import java.util.TreeSet; + +/** + * Created by IntelliJ IDEA. + * User: chartl + */ +public class MannWhitneyU { + + private TreeSet> observations; + private int sizeSet1; + private int sizeSet2; + + public MannWhitneyU() { + observations = new TreeSet>(new DitheringComparator()); + sizeSet1 = 0; + sizeSet2 = 0; + } + + /** + * Add an observation into the observation tree + * @param n: the observation (a number) + * @param set: whether the observation comes from set 1 or set 2 + */ + public void add(Number n, USet set) { + observations.add(new Pair(n,set)); + if ( set == USet.SET1 ) { + ++sizeSet1; + } else { + ++sizeSet2; + } + } + + /** + * temporary method that will be generalized. Runs the standard two-sided test, + * returns the u and p values. + * @Returns a pair holding the u and p-value. + */ + public Pair runTwoSidedTest() { + Pair uPair = calculateTwoSidedU(observations); + int u = uPair.first; + int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; + int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; + double pval = calculateP(n,m,u,true); + return new Pair(u,pval); + } + + /** + * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate + * @param n - The number of entries in the DOMINATED set + * @param m - The number of entries in the DOMINANT set + * @param u - the Mann-Whitney U value + * @param twoSided - is the test twosided + * @return the (possibly approximate) p-value associated with the MWU test + */ + public static double calculateP(int n, int m, int u, boolean twoSided) { + double pval; + if ( n > 8 && m > 8 ) { + pval = calculatePNormalApproximation(n,m,u); + } else if ( n > 4 && m > 8 ) { + pval = calculatePUniformApproximation(n,m,u); + } else { + pval = calculatePRecursively(n,m,u); + } + + return twoSided ? 2*pval : pval; + } + + /** + * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] + * @param n - The number of entries in the DOMINATED set + * @param m - The number of entries in the DOMINANT set + * @param u - the Mann-Whitney U value + * @return p-value associated with the normal approximation + */ + public static double calculatePNormalApproximation(int n,int m,int u) { + double mean = ((double) m*n+1)/2; + Normal normal = new Normal( mean , ((double) n*m*(n+m+1))/12, null); + return u < mean ? normal.cdf(u) : 1.0-normal.cdf(u); + } + + /** + * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate + * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF) + * @param n - + * @param m - + * @param u - + * @return + */ + public static double calculatePUniformApproximation(int n, int m, int u) { + int R = u + (n*(n+1))/2; + double a = Math.sqrt(m*(n+m+1)); + double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); + double z = b + R/a; + if ( z < 0 ) { return 0.0; } + else if ( z > n ) { return 1.0; } + else { + return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } + } + + /** + * Helper function for the sum of n uniform random variables + * @param z - value at which to compute the (un-normalized) cdf + * @param m - a cutoff integer (defined by m <= z < m + 1) + * @param n - the number of uniform random variables + * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) + * @return the (un-normalized) cdf for the sum of n random variables + */ + private static double uniformSumHelper(double z, int m, int n, int k) { + if ( k > m ) { return 0; } + int coef = (k % 2 == 0) ? 1 : -1; + return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); + } + + /** + * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn + * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for + * later on simply by multiplying the p-value by 2 + * @param observed + * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) + */ + public static Pair calculateTwoSidedU(TreeSet> observed ) { + int set1SeenSoFar = 0; + int set2SeenSoFar = 0; + int uSet1DomSet2 = 0; + int uSet2DomSet1 = 0; + USet previous = null; + for ( Pair dataPoint : observed ) { + if ( previous != null && previous != dataPoint.second ) { + if ( dataPoint.second == USet.SET1 ) { + uSet2DomSet1 += set2SeenSoFar; + } else { + uSet1DomSet2 += set1SeenSoFar; + } + } + + if ( dataPoint.second == USet.SET1 ) { + ++set1SeenSoFar; + } else { + ++set2SeenSoFar; + } + + previous = dataPoint.second; + } + + return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); + } + + /** + * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible + * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor + * calls into that recursive calculation. + * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two) + * @param m: number of set-two entries + * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. + */ + public static double calculatePRecursively(int n, int m, int u) { + if ( m > 6 && n > 4 || m + n > 16 ) { throw new StingException("Please use the appropriate (normal or sum of uniform) approximation"); } + return cpr(n,m,u); + } + + /** + * @doc: just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] + * @n: number of set-1 entries + * @m: number of set-2 entries + * @u: number of times a set-2 entry as preceded a set-1 entry + */ + private static double cpr(int n, int m, int u) { + if ( u < 0 || n == 0 && m == 0 ) { + return 0.0; + } + if ( m == 0 || n == 0 ) { + // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. + // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway + return ( u == 0 ) ? 1.0 : 0.0; + } + + + return (n/(n+m))*cpr(n-1,m,u-m) + (m/(n+m))*cpr(n,m-1,u); + } + + /** + * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values + * and to ensure that rank ties are broken at random. + */ + private class DitheringComparator implements Comparator> { + + public DitheringComparator() {} + + public boolean equals(Object other) { return false; } + + public int compare(Pair left, Pair right) { + double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); + if ( comp > 0 ) { return 1; } + if ( comp < 0 ) { return -1; } + return MathUtils.rand.nextBoolean() ? -1 : 1; + } + } + + public enum USet { SET1, SET2 } + +} diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7deee1d75..f1459f3b5 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -52,6 +52,30 @@ public class MathUtils { /** Private constructor. No instantiating this class! */ private MathUtils() {} + public static double sum( Collection numbers ) { + double sum = 0; + for ( Number n : numbers ) { + sum += n.doubleValue(); + } + + return sum; + } + + public static double average( Collection numbers ) { + return sum(numbers)/numbers.size(); + } + + public static double variance( Collection numbers, Number mean ) { + double mn = mean.doubleValue(); + double var = 0; + for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); } + return var; + } + + public static double variance(Collection numbers) { + return variance(numbers,average(numbers)); + } + public static double sum(double[] values) { double s = 0.0; for ( double v : values) s += v; diff --git a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java index 79b99c7d8..887a29c47 100755 --- a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java @@ -60,6 +60,23 @@ public class VCFUtils { return data; } + public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if lacks the prefix + if ( ! source.getName().startsWith(prefix) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + /** * Gets the header fields from all VCF rods input by the user * diff --git a/scala/qscript/oneoffs/chartl/BootstrapCalls.q b/scala/qscript/oneoffs/chartl/BootstrapCalls.q new file mode 100755 index 000000000..db8ce545f --- /dev/null +++ b/scala/qscript/oneoffs/chartl/BootstrapCalls.q @@ -0,0 +1,182 @@ +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType} +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript + +class BootstrapCalls extends QScript { + @Argument(doc="Bam file list",shortName="I",required=true) + var bamList: File = _ + @Argument(doc="Intervals file",shortName="L",required=true) + var intervalFile: File = _ + @Argument(doc="Output file",shortName="o",required=true) + var bootstrapMergedOut: File = _ + @Argument(doc="Reference file",shortName="R",required=true) + var reference: File = _ + @Argument(doc="Downsampling Level",shortName="D",required=false) + var downsamplingLevel: Int = 4 + @Argument(doc="Num Bootstrap Callsets",shortName="B",required=false) + var numberOfBootstraps: Int = 25 + @Argument(doc="call confidence",shortName="conf",required=false) + var standCallConf: Double = 4.0 + @Argument(doc="dbsnp file (vcf version)",shortName="dbsnp",required=false) + var dbsnp: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") + @Argument(doc="sting jar",shortName="s",required=true) + var sting: File = _ + + /********************** + * URGENT NOTE: + * for this to do any good you need to take out the random seeds in + * ReservoirDownsampler: 20 + * MathUtils: 649 + * + * You will also need to hack the recalibrator to always trust AC (which are no longer integer-valued) + * and to deal with double-valued AC fields + */ + + def script = { + val bams: List[File] = extractFileEntries(bamList) + trait UGArgs extends UnifiedGenotyper { + this.input_file = bams + this.reference_sequence = reference + this.dcov = Some(downsamplingLevel) + this.intervals :+= intervalFile + this.stand_call_conf = Some(standCallConf) + this.stand_emit_conf = Some(standCallConf) + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + this.scatterCount = 20 + this.jarFile = sting + this.memoryLimit = Some(4) + } + + val bootstrapBase = swapExt(bootstrapMergedOut,".vcf",".boot%d.vcf").getAbsolutePath + var calls : List[UnifiedGenotyper] = Nil + for ( i <- 0 until (numberOfBootstraps+1) ) { + var ug : UnifiedGenotyper = new UnifiedGenotyper with UGArgs + ug.out = new File(bootstrapBase.format(i)) + ug.analysisName = "Boostrap%d".format(i) + calls :+= ug + } + + addAll(calls) + + trait MergeArgs extends BootstrapCallsMerger { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.scatterCount = 40 + this.jarFile = sting + this.memoryLimit = Some(4) + this.rodBind ++= calls.map(u => u.out).zipWithIndex.map(u => new RodBind("bootstrap_%d".format(u._2),"vcf",u._1)) + this.out = bootstrapMergedOut + } + + var merge : BootstrapCallsMerger = new BootstrapCallsMerger with MergeArgs + add(merge) + + trait ClusterArgs extends GenerateVariantClusters { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("input","vcf",merge.out) + this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") + this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") + this.qual = Some(100) + this.std = Some(3.5) + this.mG = Some(8) + this.trustAllPolymorphic = true + this.memoryLimit = Some(8) + this.jarFile = sting + } + + var clust : GenerateVariantClusters = new GenerateVariantClusters with ClusterArgs + add(clust) + + trait VQSRArgs extends VariantRecalibrator { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.out = swapExt(bootstrapMergedOut,"vcf","recal.vcf") + this.rodBind :+= new RodBind("input","vcf",merge.out) + this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") + this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY) + this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0") + this.trustAllPolymorphic = true + this.tranchesFile = swapExt(bootstrapMergedOut,"vcf","tranche") + this.memoryLimit=Some(8) + this.jarFile = sting + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + } + + var recal : VariantRecalibrator = new VariantRecalibrator with VQSRArgs + add(recal) + + trait CutArgs extends ApplyVariantCuts { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("input","vcf",recal.out) + this.tranchesFile = recal.tranchesFile + this.fdr_filter_level = Some(1.0) + this.out = swapExt(bootstrapMergedOut,".vcf",".recal.cut.vcf") + this.jarFile = sting + this.memoryLimit = Some(4) + this.scatterCount = 5 + } + + var cut : ApplyVariantCuts = new ApplyVariantCuts with CutArgs + add(cut) + + class RmHeader extends CommandLineFunction { + @Input(doc="vcf") var vcf : File = _ + @Output(doc="headerless vcf") var noheadvcf : File = _ + + def commandLine : String = { + "head -n 1 %s > %s ; grep -v \\#\\# %s >> %s".format(vcf.getAbsolutePath,noheadvcf.getAbsolutePath, + vcf.getAbsolutePath,noheadvcf.getAbsolutePath) + } + } + + var rm : RmHeader = new RmHeader + rm.vcf = cut.out + rm.noheadvcf = swapExt(cut.out,".vcf",".nohead.vcf") + add(rm) + + trait CombineArgs extends CombineVariants { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("hiCov","vcf",rm.noheadvcf) + this.rodBind :+= new RodBind("loCov","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf")) + this.variantMergeOptions = Some(VariantMergeType.UNION) + this.genotypeMergeOptions = Some(GenotypeMergeType.PRIORITIZE) + this.priority = "hiCov,loCov" + this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.vcf") + this.jarFile = sting + } + + var combine : CombineVariants = new CombineVariants with CombineArgs + add(combine) + + trait EvalArgs extends VariantEval { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("eval","vcf",combine.out) + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + this.jarFile = sting + this.ST = List("Filter","Novelty","JexlExpression") + this.select_names = List("lowOnly","filteredInLow","Intersection","filteredInHi","hiOnly","filteredInAll") + this.select_exps = List("\"set == 'loCov'\"","\"set == 'hiCov-filterInloCov'\"", + "\"set == 'Intersection'\"", "\"set == 'filterInhiCov-loCov'\"", + "\"set == 'hiCov'\"","\"set == 'FilteredInAll'\"") + this.EV = List("TiTvVariantEvaluator","CountVariants","CompOverlap") + this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.eval") + this.nt = Some(8) + this.memoryLimit = Some(12) + } + + var eval : VariantEval = new VariantEval with EvalArgs + add(eval) + } +}