diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index a9448bc06..21c1610c7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -15,11 +16,14 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; @@ -178,6 +182,12 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false) public boolean mergeEvals = false; + /** + * File containing tribble-readable features for the IntervalStratificiation + */ + @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=true) + protected IntervalBinding intervalsFile = null; + // Variables private Set jexlExpressions = new TreeSet(); @@ -260,6 +270,16 @@ public class VariantEvalWalker extends RodWalker implements Tr perSampleIsEnabled = true; } + if ( intervalsFile != null ) { + boolean fail = true; + for ( final VariantStratifier vs : stratificationObjects ) { + if ( vs.getClass().equals(IntervalStratification.class) ) + fail = false; + } + if ( fail ) + throw new UserException.BadArgumentValue("ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); + } + // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); @@ -532,6 +552,14 @@ public class VariantEvalWalker extends RodWalker implements Tr public Set getJexlExpressions() { return jexlExpressions; } + public List getIntervals() { + if ( intervalsFile == null ) + throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); + + return intervalsFile.getIntervals(getToolkit()); + } + + public Set getContigNames() { final TreeSet contigs = new TreeSet(); for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) { @@ -540,4 +568,7 @@ public class VariantEvalWalker extends RodWalker implements Tr return contigs; } + public GenomeLocParser getGenomeLocParser() { + return getToolkit().getGenomeLocParser(); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java new file mode 100644 index 000000000..bf001588a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import net.sf.picard.util.IntervalTree; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Stratifies the variants by whether they overlap an interval in the set provided on the command line. + * + * The primary use of this stratification is to provide a mechanism to divide asssessment of a call set up + * by whether a variant overlaps an interval or not. I use this to differentiate between variants occurring + * in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like: + * + * -T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification + * + * Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value. In order to + * safely use this module you should provide entire contigs worth of variants, and let the interval strat decide + * overlap, as opposed to using -L which will not properly work with symbolic variants. + */ +public class IntervalStratification extends VariantStratifier { + final protected static Logger logger = Logger.getLogger(IntervalStratification.class); + final Map> intervalTreeByContig = new HashMap>(); + + @Override + public void initialize() { + final List locs = getVariantEvalWalker().getIntervals(); + + if ( locs.isEmpty() ) + throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?"); + + logger.info(String.format("Creating IntervalStratification containing %d intervals covering %d bp", + locs.size(), IntervalUtils.intervalSize(locs))); + + // set up the map from contig -> interval tree + for ( final String contig : getVariantEvalWalker().getContigNames() ) + intervalTreeByContig.put(contig, new IntervalTree()); + + for ( final GenomeLoc loc : locs ) { + intervalTreeByContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), true); + } + + states = new ArrayList(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); + } + + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + final ArrayList relevantStates = new ArrayList(Arrays.asList("all")); + + if (eval != null) { + final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true); + IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); + IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); + //logger.info(String.format("Overlap %s found %s", loc, node)); + relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals"); + } + + return relevantStates; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 8d9768681..e10bcbaa0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -35,8 +35,10 @@ import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; /** * Factory class for creating GenomeLocs @@ -453,6 +455,28 @@ public class GenomeLocParser { return createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); } + /** + * Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible + * is true, and VC is a symbolic allele the end of the created genome loc will be the value + * of the END info field key, if it exists, or vc.getEnd() if not. + * + * @param vc + * @param includeSymbolicEndIfPossible + * @return + */ + public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) { + if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) { + int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd()); + return createGenomeLoc(vc.getChr(), vc.getStart(), end); + } + else + return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); + } + + public GenomeLoc createGenomeLoc(final VariantContext vc) { + return createGenomeLoc(vc, false); + } + /** * create a new genome loc, given the contig name, and a single position. Must be on the reference *