mergeEvals allows you to treat -eval 1.vcf -eval 2.vcf as a single call set

-- A bit of code cleanup in VCFUtils
-- VariantEval table to create 1000G Phase I variant summary table
-- First version of 1000G Phase I summary table Qscript
This commit is contained in:
Mark DePristo 2011-11-09 14:35:50 -05:00
parent bb448b27d2
commit e639f0798e
5 changed files with 214 additions and 13 deletions

View File

@ -143,10 +143,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
/**
* See the -list argument to view available modules.
*/
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false)
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false)
protected Boolean NO_STANDARD_MODULES = false;
// Other arguments
@ -171,6 +171,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
/**
* If true, VariantEval will merge all -eval tracks into a single call set, treating each -eval
* as part of the same underlying variant set and evaluating their union. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
*/
@Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false)
public boolean mergeEvals = false;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -224,13 +231,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
knowns.add(compRod);
}
// Collect the eval rod names
Set<String> evalNames = new TreeSet<String>();
for ( RodBinding<VariantContext> evalRod : evals )
evalNames.add(evalRod.getName());
// Now that we have all the rods categorized, determine the sample list from the eval rods.
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames);
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
// Load the sample list
@ -289,8 +291,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());
// --------- track --------- sample - VariantContexts -
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false);
// for each eval track
for ( final RodBinding<VariantContext> evalRod : evals ) {
@ -353,6 +355,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
}
}
}
if ( mergeEvals ) break; // stop processing the eval tracks
}
}

View File

@ -0,0 +1,162 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table")
public class G1KPhaseITable extends VariantEvaluator {
    // basic counts on various rates found
    @DataPoint(description = "Number of samples")
    public long nSamples = 0;

    @DataPoint(description = "Number of processed loci")
    public long nProcessedLoci = 0;

    @DataPoint(description = "Number of SNPs")
    public long nSNPs = 0;
    @DataPoint(description = "SNP Novelty Rate")
    public double SNPNoveltyRate = 0;
    @DataPoint(description = "Mean number of SNPs per individual")
    public long nSNPsPerSample = 0;

    @DataPoint(description = "Number of Indels")
    public long nIndels = 0;
    @DataPoint(description = "Indel Novelty Rate")
    public double IndelNoveltyRate = 0;
    @DataPoint(description = "Mean number of Indels per individual")
    public long nIndelsPerSample = 0;

    @DataPoint(description = "Number of SVs")
    public long nSVs = 0;
    @DataPoint(description = "SV Novelty Rate")
    public double SVNoveltyRate = 0;
    @DataPoint(description = "Mean number of SVs per individual")
    public long nSVsPerSample = 0;

    // Running totals for all eval variants, and the subset also present in a comp track, keyed by type.
    Map<VariantContext.Type, Integer> allVariantCounts, knownVariantCounts;
    // Per-sample running totals (sample name -> type -> count), used for the per-individual means.
    Map<String, Map<VariantContext.Type, Integer>> countsPerSample;

    /**
     * Creates a fresh type -> 0 counter covering the three variant classes this table reports
     * (SNP, INDEL, SYMBOLIC). Other VariantContext types are deliberately absent.
     */
    private final Map<VariantContext.Type, Integer> makeCounts() {
        Map<VariantContext.Type, Integer> counts = new EnumMap<VariantContext.Type, Integer>(VariantContext.Type.class);
        counts.put(VariantContext.Type.SNP, 0);
        counts.put(VariantContext.Type.INDEL, 0);
        counts.put(VariantContext.Type.SYMBOLIC, 0);
        return counts;
    }

    /**
     * Initializes all counters, one per-sample counter map for each sample the walker is
     * evaluating, and records the sample count.
     */
    public void initialize(VariantEvalWalker walker) {
        countsPerSample = new HashMap<String, Map<VariantContext.Type, Integer>>();
        nSamples = walker.getSampleNamesForEvaluation().size();
        for ( String sample : walker.getSampleNamesForEvaluation() ) {
            countsPerSample.put(sample, makeCounts());
        }
        allVariantCounts = makeCounts();
        knownVariantCounts = makeCounts();
    }

    @Override public boolean enabled() { return true; }

    public int getComparisonOrder() {
        return 2; // we need both the eval and the comp context at each site (comp != null => known)
    }

    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // Count every locus visited, including those the traversal skipped over.
        nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    /**
     * Tallies one eval variant: bumps the per-type total, the known total when a matching comp
     * record exists, and the per-sample totals for every sample carrying a non-ref genotype.
     *
     * @return always null; this table captures no interesting sites
     */
    public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( eval == null ) return null;

        switch (eval.getType()) {
            // NO_VARIATION should never reach here; it falls through to the default throw
            case SNP:
            case INDEL:
            case SYMBOLIC:
                allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1);
                if ( comp != null ) // seen in a comp track => known variant
                    knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1);
                break;
            default:
                throw new UserException.BadInput("Unexpected variant context type: " + eval);
        }

        // count variants per sample
        for (final Genotype g : eval.getGenotypes().values()) {
            if ( ! g.isNoCall() && ! g.isHomRef() ) {
                final Map<VariantContext.Type, Integer> sampleCounts = countsPerSample.get(g.getSampleName());
                // guard: genotypes for samples outside the evaluation sample list are ignored
                // rather than triggering an NPE
                if ( sampleCounts != null )
                    sampleCounts.put(eval.getType(), sampleCounts.get(eval.getType()) + 1);
            }
        }

        return null; // we don't capture any interesting sites
    }

    /**
     * Mean count of the given type across all samples, rounded to the nearest int.
     * Returns 0 when there are no samples (avoids a 0/0 NaN).
     */
    private final int perSampleMean(VariantContext.Type type) {
        if ( countsPerSample.isEmpty() )
            return 0;
        long sum = 0;
        for ( Map<VariantContext.Type, Integer> count : countsPerSample.values() ) {
            sum += count.get(type);
        }
        return (int)(Math.round(sum / (1.0 * countsPerSample.size())));
    }

    /**
     * Fraction of eval variants of the given type not found in any comp track.
     * Returns 0 when no variants of that type were seen (avoids a 0/0 NaN in the table).
     */
    private final double noveltyRate(VariantContext.Type type) {
        final int all = allVariantCounts.get(type);
        if ( all == 0 )
            return 0.0;
        final int known = knownVariantCounts.get(type);
        return ((all - known) / (1.0 * all));
    }

    /** Rolls the accumulated counters up into the public @DataPoint fields for reporting. */
    public void finalizeEvaluation() {
        nSNPs = allVariantCounts.get(VariantContext.Type.SNP);
        nIndels = allVariantCounts.get(VariantContext.Type.INDEL);
        nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC);
        nSNPsPerSample = perSampleMean(VariantContext.Type.SNP);
        nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL);
        nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC);
        SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP);
        IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL);
        SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC);
    }
}

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
@ -15,8 +16,11 @@ public class EvalRod extends VariantStratifier implements RequiredStratification
@Override
public void initialize() {
states = new ArrayList<String>();
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getEvals() )
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getEvals() ) {
states.add(rod.getName());
if ( getVariantEvalWalker().mergeEvals )
break;
}
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {

View File

@ -312,12 +312,20 @@ public class VariantEvalUtils {
*
* @return the mapping of track to VC list that should be populated
*/
public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List<RodBinding<VariantContext>> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>
bindVariantContexts(RefMetaDataTracker tracker,
ReferenceContext ref,
List<RodBinding<VariantContext>> tracks,
boolean byFilter,
boolean subsetBySample,
boolean trackPerSample,
boolean mergeTracks) {
if ( tracker == null )
return null;
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindings = new HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>();
RodBinding<VariantContext> firstTrack = tracks.isEmpty() ? null : tracks.get(0);
for ( RodBinding<VariantContext> track : tracks ) {
HashMap<String, Set<VariantContext>> mapping = new HashMap<String, Set<VariantContext>>();
@ -346,7 +354,20 @@ public class VariantEvalUtils {
}
}
bindings.put(track, mapping);
if ( mergeTracks && bindings.containsKey(firstTrack) ) {
// go through each binding of sample -> value and add all of the bindings from this entry
HashMap<String, Set<VariantContext>> firstMapping = bindings.get(firstTrack);
for ( Map.Entry<String, Set<VariantContext>> elt : mapping.entrySet() ) {
Set<VariantContext> firstMappingSet = firstMapping.get(elt.getKey());
if ( firstMappingSet != null ) {
firstMappingSet.addAll(elt.getValue());
} else {
firstMapping.put(elt.getKey(), elt.getValue());
}
}
} else {
bindings.put(track, mapping);
}
}
return bindings;

View File

@ -26,6 +26,8 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -41,6 +43,14 @@ public class VCFUtils {
*/
private VCFUtils() { }
/**
 * Convenience overload that resolves each rod binding to its name and delegates
 * to the name-based getVCFHeadersFromRods overload.
 *
 * @param toolkit     the engine from which the bound rods are looked up
 * @param rodBindings the rod bindings whose VCF headers should be fetched
 * @return a mapping from rod name to its VCF header
 */
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
    final Set<String> rodNames = new TreeSet<String>();
    for ( final RodBinding<T> binding : rodBindings ) {
        rodNames.add(binding.getName());
    }
    return getVCFHeadersFromRods(toolkit, rodNames);
}
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();