From e639f0798e07a93c90b0ead785dd3c9d98ba13a6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 9 Nov 2011 14:35:50 -0500 Subject: [PATCH] mergeEvals allows you to treat -eval 1.vcf -eval 2.vcf as a single call set -- A bit of code cleanup in VCFUtils -- VariantEval table to create 1000G Phase I variant summary table -- First version of 1000G Phase I summary table Qscript --- .../varianteval/VariantEvalWalker.java | 24 +-- .../evaluators/G1KPhaseITable.java | 162 ++++++++++++++++++ .../varianteval/stratifications/EvalRod.java | 6 +- .../varianteval/util/VariantEvalUtils.java | 25 ++- .../sting/utils/codecs/vcf/VCFUtils.java | 10 ++ 5 files changed, 214 insertions(+), 13 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 28f4f2a56..a9448bc06 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -143,10 +143,10 @@ public class VariantEvalWalker extends RodWalker implements Tr /** * See the -list argument to view available modules. */ - @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) + @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false) protected String[] MODULES_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false) + @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments @@ -171,6 +171,13 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) private boolean requireStrictAlleleMatch = false; + /** + * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying + * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. + */ + @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false) + public boolean mergeEvals = false; + // Variables private Set jexlExpressions = new TreeSet(); @@ -224,13 +231,8 @@ public class VariantEvalWalker extends RodWalker implements Tr knowns.add(compRod); } - // Collect the eval rod names - Set evalNames = new TreeSet(); - for ( RodBinding evalRod : evals ) - evalNames.add(evalRod.getName()); - // Now that we have all the rods categorized, determine the sample list from the eval rods. - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames); + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list @@ -289,8 +291,8 @@ public class VariantEvalWalker extends RodWalker implements Tr String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); // --------- track --------- sample - VariantContexts - - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled); - HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false); + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for ( final RodBinding evalRod : evals ) { @@ -353,6 +355,8 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } + + if ( mergeEvals ) break; // stop processing the eval tracks } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java new file mode 100644 index 000000000..0e2ee70ef --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; + +@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table") +public class G1KPhaseITable extends VariantEvaluator { + // basic counts on various rates found + @DataPoint(description = "Number of samples") + public long nSamples = 0; + + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public long nSNPs = 0; + @DataPoint(description = "SNP Novelty Rate") + public double SNPNoveltyRate = 0; + @DataPoint(description = "Mean number of SNPs per individual") + public long nSNPsPerSample = 0; + + @DataPoint(description = "Number of Indels") + public long nIndels = 0; + @DataPoint(description = "Indel Novelty Rate") + public double IndelNoveltyRate = 0; + @DataPoint(description = "Mean number of Indels per individual") + public long nIndelsPerSample = 0; + + @DataPoint(description = "Number of SVs") + public long nSVs = 0; + @DataPoint(description = "SV Novelty Rate") + public double SVNoveltyRate = 0; + @DataPoint(description = "Mean number of SVs per individual") + public long nSVsPerSample = 0; + + Map allVariantCounts, knownVariantCounts; + Map> countsPerSample; + + private final Map makeCounts() { + Map counts = new EnumMap(VariantContext.Type.class); + counts.put(VariantContext.Type.SNP, 0); + counts.put(VariantContext.Type.INDEL, 0); + counts.put(VariantContext.Type.SYMBOLIC, 0); + return counts; + } + + public void initialize(VariantEvalWalker walker) { + countsPerSample = new HashMap>(); + nSamples = walker.getSampleNamesForEvaluation().size(); + + for ( String sample : walker.getSampleNamesForEvaluation() ) { + countsPerSample.put(sample, makeCounts()); + } + + allVariantCounts = makeCounts(); + knownVariantCounts = makeCounts(); + } + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null ) return null; + + switch (eval.getType()) { +// case NO_VARIATION: +// // shouldn't get here +// break; + case SNP: + case INDEL: + case SYMBOLIC: + allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1); + if ( comp != null ) + knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1); + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + // count variants per sample + for (final Genotype g : eval.getGenotypes().values()) { + if ( ! g.isNoCall() && ! g.isHomRef() ) { + int count = countsPerSample.get(g.getSampleName()).get(eval.getType()); + countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1); + } + } + + return null; // we don't capture any interesting sites + } + + private final int perSampleMean(VariantContext.Type type) { + long sum = 0; + for ( Map count : countsPerSample.values() ) { + sum += count.get(type); + } + return (int)(Math.round(sum / (1.0 * countsPerSample.size()))); + } + + private final double noveltyRate(VariantContext.Type type) { + int all = allVariantCounts.get(type); + int known = knownVariantCounts.get(type); + int novel = all - known; + return (novel / (1.0 * all)); + } + + public void finalizeEvaluation() { + nSNPs = allVariantCounts.get(VariantContext.Type.SNP); + nIndels = allVariantCounts.get(VariantContext.Type.INDEL); + nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC); + + nSNPsPerSample = perSampleMean(VariantContext.Type.SNP); + nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL); + nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC); + + SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); + IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); + SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index e276adc32..b2b6d4165 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -15,8 +16,11 @@ public class EvalRod extends VariantStratifier implements RequiredStratification @Override public void initialize() { states = new ArrayList(); - for ( RodBinding rod : getVariantEvalWalker().getEvals() ) + for ( RodBinding rod : getVariantEvalWalker().getEvals() ) { states.add(rod.getName()); + if ( getVariantEvalWalker().mergeEvals ) + break; + } } public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 6a057a456..aa8c6cfb9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -312,12 +312,20 @@ public class VariantEvalUtils { * * @return the mapping of track to VC list that should be populated */ - public HashMap, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { + public HashMap, HashMap>> + bindVariantContexts(RefMetaDataTracker tracker, + ReferenceContext ref, + List> tracks, + boolean byFilter, + boolean subsetBySample, + boolean trackPerSample, + boolean mergeTracks) { if ( tracker == null ) return null; HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + RodBinding firstTrack = tracks.isEmpty() ? null : tracks.get(0); for ( RodBinding track : tracks ) { HashMap> mapping = new HashMap>(); @@ -346,7 +354,20 @@ public class VariantEvalUtils { } } - bindings.put(track, mapping); + if ( mergeTracks && bindings.containsKey(firstTrack) ) { + // go through each binding of sample -> value and add all of the bindings from this entry + HashMap> firstMapping = bindings.get(firstTrack); + for ( Map.Entry> elt : mapping.entrySet() ) { + Set firstMappingSet = firstMapping.get(elt.getKey()); + if ( firstMappingSet != null ) { + firstMappingSet.addAll(elt.getValue()); + } else { + firstMapping.put(elt.getKey(), elt.getValue()); + } + } + } else { + bindings.put(track, mapping); + } } return bindings; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 2d8421507..3c55c1ff9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,6 +43,14 @@ public class VCFUtils { */ private VCFUtils() { } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Map data = new HashMap();