mergeEvals allows you to treat -eval 1.vcf -eval 2.vcf as a single call set

-- A bit of code cleanup in VCFUtils
-- VariantEval table to create 1000G Phase I variant summary table
-- First version of 1000G Phase I summary table Qscript
This commit is contained in:
Mark DePristo 2011-11-09 14:35:50 -05:00
parent bb448b27d2
commit e639f0798e
5 changed files with 214 additions and 13 deletions

View File

@ -143,10 +143,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
/**
* See the -list argument to view available modules.
*/
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false)
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false)
protected Boolean NO_STANDARD_MODULES = false;
// Other arguments
@ -171,6 +171,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
/**
* If true, VariantEval will merge all -eval tracks into a single call set, treating each -eval
* as part of the same underlying variant set and evaluating their union. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
*/
@Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false)
public boolean mergeEvals = false;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -224,13 +231,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
knowns.add(compRod);
}
// Collect the eval rod names
Set<String> evalNames = new TreeSet<String>();
for ( RodBinding<VariantContext> evalRod : evals )
evalNames.add(evalRod.getName());
// Now that we have all the rods categorized, determine the sample list from the eval rods.
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames);
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
// Load the sample list
@ -289,8 +291,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());
// --------- track --------- sample - VariantContexts -
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false);
// for each eval track
for ( final RodBinding<VariantContext> evalRod : evals ) {
@ -353,6 +355,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
}
}
}
if ( mergeEvals ) break; // stop processing the eval tracks
}
}

View File

@ -0,0 +1,162 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table")
public class G1KPhaseITable extends VariantEvaluator {
    // basic counts on various rates found
    @DataPoint(description = "Number of samples")
    public long nSamples = 0;

    @DataPoint(description = "Number of processed loci")
    public long nProcessedLoci = 0;

    @DataPoint(description = "Number of SNPs")
    public long nSNPs = 0;
    @DataPoint(description = "SNP Novelty Rate")
    public double SNPNoveltyRate = 0;
    @DataPoint(description = "Mean number of SNPs per individual")
    public long nSNPsPerSample = 0;

    @DataPoint(description = "Number of Indels")
    public long nIndels = 0;
    @DataPoint(description = "Indel Novelty Rate")
    public double IndelNoveltyRate = 0;
    @DataPoint(description = "Mean number of Indels per individual")
    public long nIndelsPerSample = 0;

    @DataPoint(description = "Number of SVs")
    public long nSVs = 0;
    @DataPoint(description = "SV Novelty Rate")
    public double SVNoveltyRate = 0;
    @DataPoint(description = "Mean number of SVs per individual")
    public long nSVsPerSample = 0;

    // Running totals for all eval variants, and the subset also present in a comp track, keyed by type.
    Map<VariantContext.Type, Integer> allVariantCounts, knownVariantCounts;
    // Per-sample running totals (sample name -> type -> count), used for the per-individual means.
    Map<String, Map<VariantContext.Type, Integer>> countsPerSample;

    /**
     * Creates a fresh type -> 0 counter covering the three variant classes this table reports
     * (SNP, INDEL, SYMBOLIC). Other VariantContext types are deliberately absent.
     */
    private final Map<VariantContext.Type, Integer> makeCounts() {
        Map<VariantContext.Type, Integer> counts = new EnumMap<VariantContext.Type, Integer>(VariantContext.Type.class);
        counts.put(VariantContext.Type.SNP, 0);
        counts.put(VariantContext.Type.INDEL, 0);
        counts.put(VariantContext.Type.SYMBOLIC, 0);
        return counts;
    }

    /**
     * Initializes all counters, one per-sample counter map for each sample the walker is
     * evaluating, and records the sample count.
     */
    public void initialize(VariantEvalWalker walker) {
        countsPerSample = new HashMap<String, Map<VariantContext.Type, Integer>>();
        nSamples = walker.getSampleNamesForEvaluation().size();
        for ( String sample : walker.getSampleNamesForEvaluation() ) {
            countsPerSample.put(sample, makeCounts());
        }
        allVariantCounts = makeCounts();
        knownVariantCounts = makeCounts();
    }

    @Override public boolean enabled() { return true; }

    public int getComparisonOrder() {
        return 2; // we need both the eval and the comp context at each site (comp != null => known)
    }

    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // Count every locus visited, including those the traversal skipped over.
        nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    /**
     * Tallies one eval variant: bumps the per-type total, the known total when a matching comp
     * record exists, and the per-sample totals for every sample carrying a non-ref genotype.
     *
     * @return always null; this table captures no interesting sites
     */
    public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( eval == null ) return null;

        switch (eval.getType()) {
            // NO_VARIATION should never reach here; it falls through to the default throw
            case SNP:
            case INDEL:
            case SYMBOLIC:
                allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1);
                if ( comp != null ) // seen in a comp track => known variant
                    knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1);
                break;
            default:
                throw new UserException.BadInput("Unexpected variant context type: " + eval);
        }

        // count variants per sample
        for (final Genotype g : eval.getGenotypes().values()) {
            if ( ! g.isNoCall() && ! g.isHomRef() ) {
                final Map<VariantContext.Type, Integer> sampleCounts = countsPerSample.get(g.getSampleName());
                // guard: genotypes for samples outside the evaluation sample list are ignored
                // rather than triggering an NPE
                if ( sampleCounts != null )
                    sampleCounts.put(eval.getType(), sampleCounts.get(eval.getType()) + 1);
            }
        }

        return null; // we don't capture any interesting sites
    }

    /**
     * Mean count of the given type across all samples, rounded to the nearest int.
     * Returns 0 when there are no samples (avoids a 0/0 NaN).
     */
    private final int perSampleMean(VariantContext.Type type) {
        if ( countsPerSample.isEmpty() )
            return 0;
        long sum = 0;
        for ( Map<VariantContext.Type, Integer> count : countsPerSample.values() ) {
            sum += count.get(type);
        }
        return (int)(Math.round(sum / (1.0 * countsPerSample.size())));
    }

    /**
     * Fraction of eval variants of the given type not found in any comp track.
     * Returns 0 when no variants of that type were seen (avoids a 0/0 NaN in the table).
     */
    private final double noveltyRate(VariantContext.Type type) {
        final int all = allVariantCounts.get(type);
        if ( all == 0 )
            return 0.0;
        final int known = knownVariantCounts.get(type);
        return ((all - known) / (1.0 * all));
    }

    /** Rolls the accumulated counters up into the public @DataPoint fields for reporting. */
    public void finalizeEvaluation() {
        nSNPs = allVariantCounts.get(VariantContext.Type.SNP);
        nIndels = allVariantCounts.get(VariantContext.Type.INDEL);
        nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC);
        nSNPsPerSample = perSampleMean(VariantContext.Type.SNP);
        nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL);
        nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC);
        SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP);
        IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL);
        SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC);
    }
}

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
@ -15,8 +16,11 @@ public class EvalRod extends VariantStratifier implements RequiredStratification
@Override
public void initialize() {
states = new ArrayList<String>();
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getEvals() )
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getEvals() ) {
states.add(rod.getName());
if ( getVariantEvalWalker().mergeEvals )
break;
}
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {

View File

@ -312,12 +312,20 @@ public class VariantEvalUtils {
*
* @return the mapping of track to VC list that should be populated
*/
public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List<RodBinding<VariantContext>> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>
bindVariantContexts(RefMetaDataTracker tracker,
ReferenceContext ref,
List<RodBinding<VariantContext>> tracks,
boolean byFilter,
boolean subsetBySample,
boolean trackPerSample,
boolean mergeTracks) {
if ( tracker == null )
return null;
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindings = new HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>();
RodBinding<VariantContext> firstTrack = tracks.isEmpty() ? null : tracks.get(0);
for ( RodBinding<VariantContext> track : tracks ) {
HashMap<String, Set<VariantContext>> mapping = new HashMap<String, Set<VariantContext>>();
@ -346,7 +354,20 @@ public class VariantEvalUtils {
}
}
bindings.put(track, mapping);
if ( mergeTracks && bindings.containsKey(firstTrack) ) {
// go through each binding of sample -> value and add all of the bindings from this entry
HashMap<String, Set<VariantContext>> firstMapping = bindings.get(firstTrack);
for ( Map.Entry<String, Set<VariantContext>> elt : mapping.entrySet() ) {
Set<VariantContext> firstMappingSet = firstMapping.get(elt.getKey());
if ( firstMappingSet != null ) {
firstMappingSet.addAll(elt.getValue());
} else {
firstMapping.put(elt.getKey(), elt.getValue());
}
}
} else {
bindings.put(track, mapping);
}
}
return bindings;

View File

@ -26,6 +26,8 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -41,6 +43,14 @@ public class VCFUtils {
*/
private VCFUtils() { }
/**
 * Convenience overload that resolves each rod binding to its name and delegates
 * to the name-based getVCFHeadersFromRods overload.
 *
 * @param toolkit     the engine from which the bound rods are looked up
 * @param rodBindings the rod bindings whose VCF headers should be fetched
 * @return a mapping from rod name to its VCF header
 */
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
    final Set<String> rodNames = new TreeSet<String>();
    for ( final RodBinding<T> binding : rodBindings ) {
        rodNames.add(binding.getName());
    }
    return getVCFHeadersFromRods(toolkit, rodNames);
}
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();