diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 4a9c6f819..48f3354d2 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -276,6 +276,19 @@ public class RefMetaDataTracker { return contexts; } + public Collection getVariantContextsByPrefix(ReferenceContext ref, Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + Collection contexts = new ArrayList(); + + for ( String name : names ) { + RODRecordList rodList = getTrackDataByName(name,false); // require that the name is an exact match + + if ( rodList != null ) + addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); + } + + return contexts; + } + /** * Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not. * see getVariantContexts for more information. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index b9cddb91f..b3c99b5ed 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -156,7 +156,7 @@ public class VariantAnnotator extends RodWalker { } } - private static boolean isUniqueHeaderLine(VCFHeaderLine line, Set currentSet) { + public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set currentSet) { if ( !(line instanceof VCFCompoundHeaderLine) ) return true; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java index 33d23e33d..99f9f52ef 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java @@ -15,12 +15,12 @@ import java.util.Map; * Time: 11:58 AM * To change this template use File | Settings | File Templates. */ -public abstract class AssociationContext { +public abstract class AssociationContext { - protected List> window; + protected List> window; public AssociationContext() { - window = new ArrayList>(getWindowSize()); + window = new ArrayList>(getWindowSize()); } // specifies size of window @@ -36,7 +36,7 @@ public abstract class AssociationContext { public abstract Object map(ReadBackedPileup rbp); // specifies how to take the per-sample data and reduce them into testable pairs - public abstract Map reduce(List> win); + public abstract Map reduce(List> win); // do we filter the current location (e.g. omit from window) public boolean filter(MapExtender m) { return true; } @@ -60,7 +60,7 @@ public abstract class AssociationContext { } - public void addData(Map sampleData) { + public void addData(Map sampleData) { window.add(sampleData); } @@ -70,7 +70,7 @@ public abstract class AssociationContext { } public void slide() { - ArrayList> newWindow = new ArrayList>((window.subList(slideByValue(),window.size()))); + ArrayList> newWindow = new ArrayList>((window.subList(slideByValue(),window.size()))); newWindow.ensureCapacity(getWindowSize()); window = newWindow; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java index 2eb092d34..376f0747c 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java @@ -1,12 +1,17 @@ package org.broadinstitute.sting.oneoffprojects.walkers.association; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Map; import cern.jet.random.Normal; import cern.jet.random.StudentT; import org.broadinstitute.sting.oneoffprojects.walkers.association.statistics.casecontrol.*; +import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.WilcoxonRankSum; +import org.broadinstitute.sting.utils.collections.Pair; /** * Created by IntelliJ IDEA. @@ -28,26 +33,58 @@ public class AssociationTestRunner { results.add(runZ((ZStatistic) context)); } - if ( context instanceof FisherExact ) { - results.add(runFisherExact(context)); + if ( context instanceof UStatistic ) { + results.add(runU((UStatistic) context)); } return results; } - public static String runStudentT(TStatistic context) { /* - StudentT studentT = new StudentT(dfnum/dfdenom,null); + public static String runStudentT(TStatistic context) { + Map> caseControlVectors = context.getCaseControl(); + double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE)); + double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase); + double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size(); + double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL)); + double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl); + double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size(); + + double df_num = Math.pow(varCase/nCase + varControl/nControl,2); + double df_denom = Math.pow(varCase/nCase,2)/(nCase-1) + Math.pow(varControl/nControl,2)/(nControl-1); + double t = (meanCase-meanControl)/Math.sqrt(varCase/nCase+varControl/nControl); + + StudentT studentT = new StudentT(df_num/df_denom,null); double p = t < 0 ? 2*studentT.cdf(t) : 2*(1-studentT.cdf(t)); - return String.format("T: %.2f\tP: %.2e",t,p);*/ - return null; + return String.format("T: %.2f\tP: %.2e",t,p); } - public static String runZ(ZStatistic context) { /* - double z = num/Math.sqrt(se); - double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z)); - return String.format("Z: %.2f\tP: %.2e",z,p);*/ + public static String runZ(ZStatistic context) { + Map> caseControlCounts = context.getCaseControl(); + double pCase = caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue(); + double pControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()/caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue(); + double nCase = caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue(); + double nControl = caseControlCounts.get(CaseControl.Cohort.CONTROL).second.doubleValue(); - return null; + double p2 = (caseControlCounts.get(CaseControl.Cohort.CASE).first.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue())/ + (caseControlCounts.get(CaseControl.Cohort.CASE).second.doubleValue()+caseControlCounts.get(CaseControl.Cohort.CONTROL).first.doubleValue()); + double se = Math.sqrt(p2*(1-p2)*(1/nCase + 1/nControl)); + + double z = (pCase-pControl)/se; + double p = z < 0 ? 2*standardNormal.cdf(z) : 2*(1-standardNormal.cdf(z)); + return String.format("Z: %.2f\tP: %.2e",z,p); + } + + public static String runU(UStatistic context) { + Map> caseControlVectors = context.getCaseControl(); + MannWhitneyU mwu = new MannWhitneyU(); + for ( Number n : caseControlVectors.get(CaseControl.Cohort.CASE) ) { + mwu.add(n, MannWhitneyU.USet.SET1); + } + for ( Number n : caseControlVectors.get(CaseControl.Cohort.CONTROL) ) { + mwu.add(n,MannWhitneyU.USet.SET2); + } + Pair results = mwu.runTwoSidedTest(); + return String.format("U: %d\tP: %.2e",results.first,results.second); } public static String runFisherExact(AssociationContext context) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java index 70a8d2ff4..e652c8c3d 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java @@ -9,13 +9,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.PrintStream; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.lang.reflect.Modifier; +import java.util.*; /** * @Author chartl @@ -23,7 +23,7 @@ import java.util.Set; * Generalized framework for regional (windowed) associations */ public class RegionalAssociationWalker extends LocusWalker implements TreeReducible { - @Argument(doc="foo",shortName="AT",fullName="associationType",required=false) + @Argument(doc="Association type(s) to use. Supports multiple arguments (-AT thing1 -AT thing2).",shortName="AT",fullName="associationType",required=false) public String[] associationsToUse = null; @Output @@ -57,17 +57,23 @@ public class RegionalAssociationWalker extends LocusWalker getAssociations() { - // todo -- this should use the package handler like variant eval + List> contexts = new PluginManager(AssociationContext.class).getPlugins(); + Map> classNameToClass = new HashMap>(contexts.size()); + for ( Class clazz : contexts ) { + if (! Modifier.isAbstract(clazz.getModifiers())) { + classNameToClass.put(clazz.getSimpleName(),clazz); + } + } + Set validAssociations = new HashSet(); for ( String s : associationsToUse ) { - AssociationContext context = stringToAssociationContext(s); + AssociationContext context; + try { + context = classNameToClass.get(s).getConstructor(new Class[]{}).newInstance(new Object[]{}); + } catch ( Exception e ) { + throw new StingException("The class "+s+" could not be instantiated.",e); + } context.init(this); validAssociations.add(context); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java index f5764ff23..c43898327 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/casecontrol/CaseControl.java @@ -12,7 +12,11 @@ import java.util.Map; * Created by IntelliJ IDEA. * @author chartl */ -public abstract class CaseControl extends AssociationContext { +public abstract class CaseControl extends AssociationContext { + + public Map getCaseControl() { + return reduce(window); + } public Map reduce(List> window) { X accumCase = null; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java new file mode 100755 index 000000000..4d9de6eda --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BootstrapCallsMerger.java @@ -0,0 +1,180 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import com.google.common.collect.ArrayListMultimap; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import javax.management.NotificationBroadcaster; +import java.util.*; + +/** + * @doc Merges N callsets that have been made on the same set of samples, and averages specific annotations. + * @Author chartl + * + */ +public class BootstrapCallsMerger extends RodWalker implements TreeReducible{ + + @Output + VCFWriter vcfWriter = null; + + // todo -- remove this, can be done just by looking at the type and iterating over ANNOTS_TO_AVG + // todo -- multi-allelic sites (e.g. what happens here:) + // todo -- Set 1: A G,T AC=2,4 ?? + // todo -- Set 1: A G AC=2 Set 2: A T AC=4 + // todo -- fix above behavior + final static private Set INTEGER_ANNOTS_CAN_MEAN = new HashSet(Arrays.asList("AC","AN")); + final static private Set ANNOTS_TO_AVG = new HashSet(Arrays.asList( + "QD","SB","HaplotypeScore","Dels","MQ","MQ0","sumGLByD","AC","AF","AN")); + + public void initialize() { + // grab the samples + Set samples = new HashSet(); + // setup the header fields + // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones + Set hInfo = new HashSet(); + for ( Map.Entry headers : VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), "bootstrap").entrySet() ) { + samples.addAll(headers.getValue().getGenotypeSamples()); + for ( VCFHeaderLine line : headers.getValue().getMetaData() ) { + logger.debug(line); + VCFHeaderLine altered = alterHeaderLine(line); + if ( VariantAnnotator.isUniqueHeaderLine(altered, hInfo) ) + hInfo.add(altered); + } + } + hInfo.add(new VCFInfoHeaderLine("NB",1,VCFHeaderLineType.Integer,"Number of bootsrap sets site was seen in")); + hInfo.add(new VCFFormatHeaderLine("BC",4,VCFHeaderLineType.Integer,"Genotype counts across bootsraps: ref,het,var,nocall")); + HashSet rodName = new HashSet(); + rodName.add("variant"); + VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } + + /** + * Note: integer annotations will need to become floats, others will not + * (e.g. HRun won't change, but counts will) + * @param line + * @return line with type changed + */ + private VCFHeaderLine alterHeaderLine(VCFHeaderLine line) { + if ( line instanceof VCFInfoHeaderLine ) { + if(INTEGER_ANNOTS_CAN_MEAN.contains(((VCFInfoHeaderLine) line).getName())) { + return new VCFInfoHeaderLine(((VCFInfoHeaderLine) line).getName(), + ((VCFInfoHeaderLine) line).getCount(), + VCFHeaderLineType.Float, + ((VCFInfoHeaderLine) line).getDescription()); + } + } + return line; + } + + public VCFWriter reduceInit() { + return vcfWriter; + } + + public VCHolder map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { + if ( tracker == null ) { return null; } + Collection bootstraps = tracker.getVariantContextsByPrefix(ref,Arrays.asList("bootstrap"),null,ref.getLocus(),true,false); + int num_bootstraps = bootstraps.size(); + if ( num_bootstraps == 0 ) { return null; } + Map avgInfo = new HashMap(ANNOTS_TO_AVG.size()); + Map genotypeCountsBySample = new HashMap(); + for ( VariantContext vc : bootstraps ) { + // update genotype counts + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + if ( ! genotypeCountsBySample.containsKey(genotype.getKey())) { + genotypeCountsBySample.put(genotype.getKey(),new Integer[]{0,0,0,0}); + } + genotypeCountsBySample.get(genotype.getKey())[genotype2offset(genotype.getValue())]++; + } + // update info field annotations + for ( String anno : ANNOTS_TO_AVG ) { + if ( ! avgInfo.containsKey(anno) ) { + avgInfo.put(anno,0.0); + } + Object value = vc.getAttribute(anno); + if ( value instanceof Number ) { + //logger.debug(value); + avgInfo.put(anno,avgInfo.get(anno) + ((Number)value).doubleValue()/num_bootstraps); + } + if ( value instanceof String ) { + //logger.debug("string: "+value.toString()); + avgInfo.put(anno,avgInfo.get(anno) + Double.valueOf((String)value)/num_bootstraps); + } + } + } + VariantContext first = bootstraps.iterator().next(); + Map finalInfo = new HashMap(first.getAttributes().size()); + for ( Map.Entry attrib : first.getAttributes().entrySet() ) { + if ( ANNOTS_TO_AVG.contains(attrib.getKey()) ) { + finalInfo.put(attrib.getKey(),avgInfo.get(attrib.getKey())); + } else { + finalInfo.put(attrib.getKey(),attrib.getValue()); + } + } + Map finalGenotypes = new HashMap(first.getSampleNames().size()); + for ( Map.Entry g : first.getGenotypes().entrySet() ) { + Map att = new HashMap(g.getValue().getAttributes()); + att.put("BC",countsToString(genotypeCountsBySample.get(g.getKey()))); + //logger.debug(g.getValue()); + finalGenotypes.put(g.getKey(),Genotype.modifyAttributes(g.getValue(),att)); + //logger.debug("final:"); + //logger.debug(finalGenotypes.get(g.getKey())); + } + + finalInfo.put("NB",String.format("%d",num_bootstraps)); + + VariantContext attributeModified = VariantContext.modifyAttributes(first,finalInfo); + logger.debug(attributeModified.hasGenotypes() ? "attributes have genotypes" : "VERY BAD"); + VariantContext genotypeModified = VariantContext.modifyGenotypes(attributeModified,finalGenotypes); + logger.debug(genotypeModified.hasGenotypes() ? "modified genotypes have genotypes" : "NOT SO BAD"); + + return new VCHolder(genotypeModified,ref.getBase()); + + //return new VCHolder(VariantContext.modifyGenotypes(VariantContext.modifyAttributes(first, finalInfo), finalGenotypes), + // ref.getBase()); + } + + private static String countsToString(Integer[] c) { + return String.format("%d,%d,%d,%d",c[0],c[1],c[2],c[3]); + } + + public VCFWriter treeReduce(VCFWriter l, VCFWriter r) { + return l; + } + + public VCFWriter reduce(VCHolder h, VCFWriter w) { + if ( h != null ) { + w.add(h.v,h.b); + } + + return w; + } + + private static int genotype2offset(Genotype g) { + if ( g.isHomRef() ) { return 0; } + if ( g.isHet() ) { return 1; } + if ( g.isHomVar() ) { return 2; } + return 3; + } + + class VCHolder { + public VariantContext v; + public Byte b; + public VCHolder(VariantContext v, Byte b) { + this.v = v; + this.b = b; + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java new file mode 100755 index 000000000..195cd1f84 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -0,0 +1,211 @@ +package org.broadinstitute.sting.utils; + +import cern.jet.math.Arithmetic; +import cern.jet.random.Normal; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.Comparator; +import java.util.TreeSet; + +/** + * Created by IntelliJ IDEA. + * User: chartl + */ +public class MannWhitneyU { + + private TreeSet> observations; + private int sizeSet1; + private int sizeSet2; + + public MannWhitneyU() { + observations = new TreeSet>(new DitheringComparator()); + sizeSet1 = 0; + sizeSet2 = 0; + } + + /** + * Add an observation into the observation tree + * @param n: the observation (a number) + * @param set: whether the observation comes from set 1 or set 2 + */ + public void add(Number n, USet set) { + observations.add(new Pair(n,set)); + if ( set == USet.SET1 ) { + ++sizeSet1; + } else { + ++sizeSet2; + } + } + + /** + * temporary method that will be generalized. Runs the standard two-sided test, + * returns the u and p values. + * @Returns a pair holding the u and p-value. + */ + public Pair runTwoSidedTest() { + Pair uPair = calculateTwoSidedU(observations); + int u = uPair.first; + int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; + int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; + double pval = calculateP(n,m,u,true); + return new Pair(u,pval); + } + + /** + * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate + * @param n - The number of entries in the DOMINATED set + * @param m - The number of entries in the DOMINANT set + * @param u - the Mann-Whitney U value + * @param twoSided - is the test twosided + * @return the (possibly approximate) p-value associated with the MWU test + */ + public static double calculateP(int n, int m, int u, boolean twoSided) { + double pval; + if ( n > 8 && m > 8 ) { + pval = calculatePNormalApproximation(n,m,u); + } else if ( n > 4 && m > 8 ) { + pval = calculatePUniformApproximation(n,m,u); + } else { + pval = calculatePRecursively(n,m,u); + } + + return twoSided ? 2*pval : pval; + } + + /** + * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] + * @param n - The number of entries in the DOMINATED set + * @param m - The number of entries in the DOMINANT set + * @param u - the Mann-Whitney U value + * @return p-value associated with the normal approximation + */ + public static double calculatePNormalApproximation(int n,int m,int u) { + double mean = ((double) m*n+1)/2; + Normal normal = new Normal( mean , ((double) n*m*(n+m+1))/12, null); + return u < mean ? normal.cdf(u) : 1.0-normal.cdf(u); + } + + /** + * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate + * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens [1966] (sum of uniform CDF) + * @param n - + * @param m - + * @param u - + * @return + */ + public static double calculatePUniformApproximation(int n, int m, int u) { + int R = u + (n*(n+1))/2; + double a = Math.sqrt(m*(n+m+1)); + double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); + double z = b + R/a; + if ( z < 0 ) { return 0.0; } + else if ( z > n ) { return 1.0; } + else { + return 1/((double)Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } + } + + /** + * Helper function for the sum of n uniform random variables + * @param z - value at which to compute the (un-normalized) cdf + * @param m - a cutoff integer (defined by m <= z < m + 1) + * @param n - the number of uniform random variables + * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) + * @return the (un-normalized) cdf for the sum of n random variables + */ + private static double uniformSumHelper(double z, int m, int n, int k) { + if ( k > m ) { return 0; } + int coef = (k % 2 == 0) ? 1 : -1; + return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); + } + + /** + * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn + * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for + * later on simply by multiplying the p-value by 2 + * @param observed + * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) + */ + public static Pair calculateTwoSidedU(TreeSet> observed ) { + int set1SeenSoFar = 0; + int set2SeenSoFar = 0; + int uSet1DomSet2 = 0; + int uSet2DomSet1 = 0; + USet previous = null; + for ( Pair dataPoint : observed ) { + if ( previous != null && previous != dataPoint.second ) { + if ( dataPoint.second == USet.SET1 ) { + uSet2DomSet1 += set2SeenSoFar; + } else { + uSet1DomSet2 += set1SeenSoFar; + } + } + + if ( dataPoint.second == USet.SET1 ) { + ++set1SeenSoFar; + } else { + ++set2SeenSoFar; + } + + previous = dataPoint.second; + } + + return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); + } + + /** + * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible + * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor + * calls into that recursive calculation. + * @param n: number of set-one entries (hypothesis: set-one is dominated by set-two) + * @param m: number of set-two entries + * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. + */ + public static double calculatePRecursively(int n, int m, int u) { + if ( m > 6 && n > 4 || m + n > 16 ) { throw new StingException("Please use the appropriate (normal or sum of uniform) approximation"); } + return cpr(n,m,u); + } + + /** + * @doc: just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] + * @n: number of set-1 entries + * @m: number of set-2 entries + * @u: number of times a set-2 entry as preceded a set-1 entry + */ + private static double cpr(int n, int m, int u) { + if ( u < 0 || n == 0 && m == 0 ) { + return 0.0; + } + if ( m == 0 || n == 0 ) { + // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. + // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway + return ( u == 0 ) ? 1.0 : 0.0; + } + + + return (n/(n+m))*cpr(n-1,m,u-m) + (m/(n+m))*cpr(n,m-1,u); + } + + /** + * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values + * and to ensure that rank ties are broken at random. + */ + private class DitheringComparator implements Comparator> { + + public DitheringComparator() {} + + public boolean equals(Object other) { return false; } + + public int compare(Pair left, Pair right) { + double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); + if ( comp > 0 ) { return 1; } + if ( comp < 0 ) { return -1; } + return MathUtils.rand.nextBoolean() ? -1 : 1; + } + } + + public enum USet { SET1, SET2 } + +} diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7deee1d75..f1459f3b5 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -52,6 +52,30 @@ public class MathUtils { /** Private constructor. No instantiating this class! */ private MathUtils() {} + public static double sum( Collection numbers ) { + double sum = 0; + for ( Number n : numbers ) { + sum += n.doubleValue(); + } + + return sum; + } + + public static double average( Collection numbers ) { + return sum(numbers)/numbers.size(); + } + + public static double variance( Collection numbers, Number mean ) { + double mn = mean.doubleValue(); + double var = 0; + for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); } + return var; + } + + public static double variance(Collection numbers) { + return variance(numbers,average(numbers)); + } + public static double sum(double[] values) { double s = 0.0; for ( double v : values) s += v; diff --git a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java index 79b99c7d8..887a29c47 100755 --- a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java @@ -60,6 +60,23 @@ public class VCFUtils { return data; } + public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if lacks the prefix + if ( ! source.getName().startsWith(prefix) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + /** * Gets the header fields from all VCF rods input by the user * diff --git a/scala/qscript/oneoffs/chartl/BootstrapCalls.q b/scala/qscript/oneoffs/chartl/BootstrapCalls.q new file mode 100755 index 000000000..db8ce545f --- /dev/null +++ b/scala/qscript/oneoffs/chartl/BootstrapCalls.q @@ -0,0 +1,182 @@ +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.{GenotypeMergeType, VariantMergeType} +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript + +class BootstrapCalls extends QScript { + @Argument(doc="Bam file list",shortName="I",required=true) + var bamList: File = _ + @Argument(doc="Intervals file",shortName="L",required=true) + var intervalFile: File = _ + @Argument(doc="Output file",shortName="o",required=true) + var bootstrapMergedOut: File = _ + @Argument(doc="Reference file",shortName="R",required=true) + var reference: File = _ + @Argument(doc="Downsampling Level",shortName="D",required=false) + var downsamplingLevel: Int = 4 + @Argument(doc="Num Bootstrap Callsets",shortName="B",required=false) + var numberOfBootstraps: Int = 25 + @Argument(doc="call confidence",shortName="conf",required=false) + var standCallConf: Double = 4.0 + @Argument(doc="dbsnp file (vcf version)",shortName="dbsnp",required=false) + var dbsnp: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") + @Argument(doc="sting jar",shortName="s",required=true) + var sting: File = _ + + /********************** + * URGENT NOTE: + * for this to do any good you need to take out the random seeds in + * ReservoirDownsampler: 20 + * MathUtils: 649 + * + * You will also need to hack the recalibrator to always trust AC (which are no longer integer-valued) + * and to deal with double-valued AC fields + */ + + def script = { + val bams: List[File] = extractFileEntries(bamList) + trait UGArgs extends UnifiedGenotyper { + this.input_file = bams + this.reference_sequence = reference + this.dcov = Some(downsamplingLevel) + this.intervals :+= intervalFile + this.stand_call_conf = Some(standCallConf) + this.stand_emit_conf = Some(standCallConf) + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + this.scatterCount = 20 + this.jarFile = sting + this.memoryLimit = Some(4) + } + + val bootstrapBase = swapExt(bootstrapMergedOut,".vcf",".boot%d.vcf").getAbsolutePath + var calls : List[UnifiedGenotyper] = Nil + for ( i <- 0 until (numberOfBootstraps+1) ) { + var ug : UnifiedGenotyper = new UnifiedGenotyper with UGArgs + ug.out = new File(bootstrapBase.format(i)) + ug.analysisName = "Boostrap%d".format(i) + calls :+= ug + } + + addAll(calls) + + trait MergeArgs extends BootstrapCallsMerger { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.scatterCount = 40 + this.jarFile = sting + this.memoryLimit = Some(4) + this.rodBind ++= calls.map(u => u.out).zipWithIndex.map(u => new RodBind("bootstrap_%d".format(u._2),"vcf",u._1)) + this.out = bootstrapMergedOut + } + + var merge : BootstrapCallsMerger = new BootstrapCallsMerger with MergeArgs + add(merge) + + trait ClusterArgs extends GenerateVariantClusters { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("input","vcf",merge.out) + this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") + this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") + this.qual = Some(100) + this.std = Some(3.5) + this.mG = Some(8) + this.trustAllPolymorphic = true + this.memoryLimit = Some(8) + this.jarFile = sting + } + + var clust : GenerateVariantClusters = new GenerateVariantClusters with ClusterArgs + add(clust) + + trait VQSRArgs extends VariantRecalibrator { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.out = swapExt(bootstrapMergedOut,"vcf","recal.vcf") + this.rodBind :+= new RodBind("input","vcf",merge.out) + this.rodBind :+= new RodBind("hapmap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.rodBind :+= new RodBind("truthHapMap","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")) + this.rodBind :+= new RodBind("truth1kg","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.sites.vcf")) + this.cluster_file = swapExt(bootstrapMergedOut,"vcf","cluster") + this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY) + this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0") + this.trustAllPolymorphic = true + this.tranchesFile = swapExt(bootstrapMergedOut,"vcf","tranche") + this.memoryLimit=Some(8) + this.jarFile = sting + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + } + + var recal : VariantRecalibrator = new VariantRecalibrator with VQSRArgs + add(recal) + + trait CutArgs extends ApplyVariantCuts { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("input","vcf",recal.out) + this.tranchesFile = recal.tranchesFile + this.fdr_filter_level = Some(1.0) + this.out = swapExt(bootstrapMergedOut,".vcf",".recal.cut.vcf") + this.jarFile = sting + this.memoryLimit = Some(4) + this.scatterCount = 5 + } + + var cut : ApplyVariantCuts = new ApplyVariantCuts with CutArgs + add(cut) + + class RmHeader extends CommandLineFunction { + @Input(doc="vcf") var vcf : File = _ + @Output(doc="headerless vcf") var noheadvcf : File = _ + + def commandLine : String = { + "head -n 1 %s > %s ; grep -v \\#\\# %s >> %s".format(vcf.getAbsolutePath,noheadvcf.getAbsolutePath, + vcf.getAbsolutePath,noheadvcf.getAbsolutePath) + } + } + + var rm : RmHeader = new RmHeader + rm.vcf = cut.out + rm.noheadvcf = swapExt(cut.out,".vcf",".nohead.vcf") + add(rm) + + trait CombineArgs extends CombineVariants { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("hiCov","vcf",rm.noheadvcf) + this.rodBind :+= new RodBind("loCov","vcf",new File("/humgen/gsa-pipeline/PVQF4/all_batches_v001/batch_001/SnpCalls/ESPGO_Gabriel_NHLBI_EOMI_setone_EOMI_Project.cleaned.annotated.handfiltered.vcf")) + this.variantMergeOptions = Some(VariantMergeType.UNION) + this.genotypeMergeOptions = Some(GenotypeMergeType.PRIORITIZE) + this.priority = "hiCov,loCov" + this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.vcf") + this.jarFile = sting + } + + var combine : CombineVariants = new CombineVariants with CombineArgs + add(combine) + + trait EvalArgs extends VariantEval { + this.reference_sequence = reference + this.intervals :+= intervalFile + this.rodBind :+= new RodBind("eval","vcf",combine.out) + this.rodBind :+= new RodBind("dbsnp","vcf",dbsnp) + this.jarFile = sting + this.ST = List("Filter","Novelty","JexlExpression") + this.select_names = List("lowOnly","filteredInLow","Intersection","filteredInHi","hiOnly","filteredInAll") + this.select_exps = List("\"set == 'loCov'\"","\"set == 'hiCov-filterInloCov'\"", + "\"set == 'Intersection'\"", "\"set == 'filterInhiCov-loCov'\"", + "\"set == 'hiCov'\"","\"set == 'FilteredInAll'\"") + this.EV = List("TiTvVariantEvaluator","CountVariants","CompOverlap") + this.out = swapExt(bootstrapMergedOut,".vcf",".merged.combined.eval") + this.nt = Some(8) + this.memoryLimit = Some(12) + } + + var eval : VariantEval = new VariantEval with EvalArgs + add(eval) + } +}