VariantEval module to stratify the variants by whether they overlap an interval set

The primary use of this stratification is to provide a mechanism to divide asssessment of a call set up by whether a variant overlaps an interval or not.  I use this to differentiate between variants occurring in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like:

-T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification

Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value.  In order to safely use this module you should provide entire contigs worth of variants, and let the interval strat decide overlap, as opposed to using -L which will not properly work with symbolic variants.

Minor improvements to create() interval in GenomeLocParser.
This commit is contained in:
Mark DePristo 2011-11-10 10:58:40 -05:00
parent 0111e58d4e
commit dc4932f93d
3 changed files with 148 additions and 0 deletions

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -15,11 +16,14 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
@ -178,6 +182,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false)
public boolean mergeEvals = false;
/**
* File containing tribble-readable features for the IntervalStratificiation
*/
@Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=true)
protected IntervalBinding<Feature> intervalsFile = null;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -260,6 +270,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
perSampleIsEnabled = true;
}
if ( intervalsFile != null ) {
boolean fail = true;
for ( final VariantStratifier vs : stratificationObjects ) {
if ( vs.getClass().equals(IntervalStratification.class) )
fail = false;
}
if ( fail )
throw new UserException.BadArgumentValue("ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
}
// Initialize the evaluation contexts
evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null);
@ -532,6 +552,14 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; }
public List<GenomeLoc> getIntervals() {
if ( intervalsFile == null )
throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled");
return intervalsFile.getIntervals(getToolkit());
}
public Set<String> getContigNames() {
final TreeSet<String> contigs = new TreeSet<String>();
for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) {
@ -540,4 +568,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return contigs;
}
public GenomeLocParser getGenomeLocParser() {
return getToolkit().getGenomeLocParser();
}
}

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import net.sf.picard.util.IntervalTree;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Stratifies the variants by whether they overlap an interval in the set provided on the command line.
*
* The primary use of this stratification is to provide a mechanism to divide asssessment of a call set up
* by whether a variant overlaps an interval or not. I use this to differentiate between variants occurring
* in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like:
*
* -T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification
*
* Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value. In order to
* safely use this module you should provide entire contigs worth of variants, and let the interval strat decide
* overlap, as opposed to using -L which will not properly work with symbolic variants.
*/
public class IntervalStratification extends VariantStratifier {
final protected static Logger logger = Logger.getLogger(IntervalStratification.class);
final Map<String, IntervalTree<Boolean>> intervalTreeByContig = new HashMap<String, IntervalTree<Boolean>>();
@Override
public void initialize() {
final List<GenomeLoc> locs = getVariantEvalWalker().getIntervals();
if ( locs.isEmpty() )
throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?");
logger.info(String.format("Creating IntervalStratification containing %d intervals covering %d bp",
locs.size(), IntervalUtils.intervalSize(locs)));
// set up the map from contig -> interval tree
for ( final String contig : getVariantEvalWalker().getContigNames() )
intervalTreeByContig.put(contig, new IntervalTree<Boolean>());
for ( final GenomeLoc loc : locs ) {
intervalTreeByContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), true);
}
states = new ArrayList<String>(Arrays.asList("all", "overlaps.intervals", "outside.intervals"));
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
final ArrayList<String> relevantStates = new ArrayList<String>(Arrays.asList("all"));
if (eval != null) {
final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true);
IntervalTree<Boolean> intervalTree = intervalTreeByContig.get(loc.getContig());
IntervalTree.Node<Boolean> node = intervalTree.minOverlapper(loc.getStart(), loc.getStop());
//logger.info(String.format("Overlap %s found %s", loc, node));
relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals");
}
return relevantStates;
}
}

View File

@ -35,8 +35,10 @@ import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
/**
* Factory class for creating GenomeLocs
@ -453,6 +455,28 @@ public class GenomeLocParser {
return createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd());
}
/**
* Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible
* is true, and VC is a symbolic allele the end of the created genome loc will be the value
* of the END info field key, if it exists, or vc.getEnd() if not.
*
* @param vc
* @param includeSymbolicEndIfPossible
* @return
*/
public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) {
if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) {
int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd());
return createGenomeLoc(vc.getChr(), vc.getStart(), end);
}
else
return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd());
}
public GenomeLoc createGenomeLoc(final VariantContext vc) {
return createGenomeLoc(vc, false);
}
/**
* create a new genome loc, given the contig name, and a single position. Must be on the reference
*