From e639f0798e07a93c90b0ead785dd3c9d98ba13a6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 9 Nov 2011 14:35:50 -0500 Subject: [PATCH 01/19] mergeEvals allows you to treat -eval 1.vcf -eval 2.vcf as a single call set -- A bit of code cleanup in VCFUtils -- VariantEval table to create 1000G Phase I variant summary table -- First version of 1000G Phase I summary table Qscript --- .../varianteval/VariantEvalWalker.java | 24 +-- .../evaluators/G1KPhaseITable.java | 162 ++++++++++++++++++ .../varianteval/stratifications/EvalRod.java | 6 +- .../varianteval/util/VariantEvalUtils.java | 25 ++- .../sting/utils/codecs/vcf/VCFUtils.java | 10 ++ 5 files changed, 214 insertions(+), 13 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 28f4f2a56..a9448bc06 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -143,10 +143,10 @@ public class VariantEvalWalker extends RodWalker implements Tr /** * See the -list argument to view available modules. */ - @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) + @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false) protected String[] MODULES_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false) + @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments @@ -171,6 +171,13 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) private boolean requireStrictAlleleMatch = false; + /** + * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying + * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. + */ + @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false) + public boolean mergeEvals = false; + // Variables private Set jexlExpressions = new TreeSet(); @@ -224,13 +231,8 @@ public class VariantEvalWalker extends RodWalker implements Tr knowns.add(compRod); } - // Collect the eval rod names - Set evalNames = new TreeSet(); - for ( RodBinding evalRod : evals ) - evalNames.add(evalRod.getName()); - // Now that we have all the rods categorized, determine the sample list from the eval rods. - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames); + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list @@ -289,8 +291,8 @@ public class VariantEvalWalker extends RodWalker implements Tr String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); // --------- track --------- sample - VariantContexts - - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled); - HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false); + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for ( final RodBinding evalRod : evals ) { @@ -353,6 +355,8 @@ public class VariantEvalWalker extends RodWalker implements Tr } } } + + if ( mergeEvals ) break; // stop processing the eval tracks } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java new file mode 100644 index 000000000..0e2ee70ef --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; + +@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table") +public class G1KPhaseITable extends VariantEvaluator { + // basic counts on various rates found + @DataPoint(description = "Number of samples") + public long nSamples = 0; + + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public long nSNPs = 0; + @DataPoint(description = "SNP Novelty Rate") + public double SNPNoveltyRate = 0; + @DataPoint(description = "Mean number of SNPs per individual") + public long nSNPsPerSample = 0; + + @DataPoint(description = "Number of Indels") + public long nIndels = 0; + @DataPoint(description = "Indel Novelty Rate") + public double IndelNoveltyRate = 0; + @DataPoint(description = "Mean number of Indels per individual") + public long nIndelsPerSample = 0; + + @DataPoint(description = "Number of SVs") + public long nSVs = 0; + @DataPoint(description = "SV Novelty Rate") + public double SVNoveltyRate = 0; + @DataPoint(description = "Mean number of SVs per individual") + public long nSVsPerSample = 0; + + Map allVariantCounts, knownVariantCounts; + Map> countsPerSample; + + private final Map makeCounts() { + Map counts = new EnumMap(VariantContext.Type.class); + counts.put(VariantContext.Type.SNP, 0); + counts.put(VariantContext.Type.INDEL, 0); + counts.put(VariantContext.Type.SYMBOLIC, 0); + return counts; + } + + public void initialize(VariantEvalWalker walker) { + countsPerSample = new HashMap>(); + nSamples = walker.getSampleNamesForEvaluation().size(); + + for ( String sample : walker.getSampleNamesForEvaluation() ) { + countsPerSample.put(sample, makeCounts()); + } + + allVariantCounts = makeCounts(); + knownVariantCounts = makeCounts(); + } + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null ) return null; + + switch (eval.getType()) { +// case NO_VARIATION: +// // shouldn't get here +// break; + case SNP: + case INDEL: + case SYMBOLIC: + allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1); + if ( comp != null ) + knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1); + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + // count variants per sample + for (final Genotype g : eval.getGenotypes().values()) { + if ( ! g.isNoCall() && ! g.isHomRef() ) { + int count = countsPerSample.get(g.getSampleName()).get(eval.getType()); + countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1); + } + } + + return null; // we don't capture any interesting sites + } + + private final int perSampleMean(VariantContext.Type type) { + long sum = 0; + for ( Map count : countsPerSample.values() ) { + sum += count.get(type); + } + return (int)(Math.round(sum / (1.0 * countsPerSample.size()))); + } + + private final double noveltyRate(VariantContext.Type type) { + int all = allVariantCounts.get(type); + int known = knownVariantCounts.get(type); + int novel = all - known; + return (novel / (1.0 * all)); + } + + public void finalizeEvaluation() { + nSNPs = allVariantCounts.get(VariantContext.Type.SNP); + nIndels = allVariantCounts.get(VariantContext.Type.INDEL); + nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC); + + nSNPsPerSample = perSampleMean(VariantContext.Type.SNP); + nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL); + nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC); + + SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); + IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); + SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index e276adc32..b2b6d4165 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -15,8 +16,11 @@ public class EvalRod extends VariantStratifier implements RequiredStratification @Override public void initialize() { states = new ArrayList(); - for ( RodBinding rod : getVariantEvalWalker().getEvals() ) + for ( RodBinding rod : getVariantEvalWalker().getEvals() ) { states.add(rod.getName()); + if ( getVariantEvalWalker().mergeEvals ) + break; + } } public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 6a057a456..aa8c6cfb9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -312,12 +312,20 @@ public class VariantEvalUtils { * * @return the mapping of track to VC list that should be populated */ - public HashMap, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { + public HashMap, HashMap>> + bindVariantContexts(RefMetaDataTracker tracker, + ReferenceContext ref, + List> tracks, + boolean byFilter, + boolean subsetBySample, + boolean trackPerSample, + boolean mergeTracks) { if ( tracker == null ) return null; HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + RodBinding firstTrack = tracks.isEmpty() ? null : tracks.get(0); for ( RodBinding track : tracks ) { HashMap> mapping = new HashMap>(); @@ -346,7 +354,20 @@ public class VariantEvalUtils { } } - bindings.put(track, mapping); + if ( mergeTracks && bindings.containsKey(firstTrack) ) { + // go through each binding of sample -> value and add all of the bindings from this entry + HashMap> firstMapping = bindings.get(firstTrack); + for ( Map.Entry> elt : mapping.entrySet() ) { + Set firstMappingSet = firstMapping.get(elt.getKey()); + if ( firstMappingSet != null ) { + firstMappingSet.addAll(elt.getValue()); + } else { + firstMapping.put(elt.getKey(), elt.getValue()); + } + } + } else { + bindings.put(track, mapping); + } } return bindings; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 2d8421507..3c55c1ff9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,6 +43,14 @@ public class VCFUtils { */ private VCFUtils() { } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Map data = new HashMap(); From 0111e58d4e259368da67df551012177e85bcec04 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 9 Nov 2011 14:45:40 -0500 Subject: [PATCH 03/19] Don't generate PDF unless you have -run specified --- .../src/org/broadinstitute/sting/queue/QCommandLine.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index e8091cde7..768eab7e4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -129,9 +129,11 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Writing JobLogging GATKReport to file " + reportFile) QJobReport.printReport(qGraph.getFunctionsAndStatus(script.functions), reportFile) - val pdfFile = new File(jobStringName + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) + if ( settings.run ) { + val pdfFile = new File(jobStringName + ".pdf") + logger.info("Plotting JobLogging GATKReport to file " + pdfFile) + QJobReport.plotReport(reportFile, pdfFile) + } } } } From dc4932f93d991b7e0247e86f911c64fd87d20083 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 10:58:40 -0500 Subject: [PATCH 04/19] VariantEval module to stratify the variants by whether they overlap an interval set The primary use of this stratification is to provide a mechanism to divide asssessment of a call set up by whether a variant overlaps an interval or not. I use this to differentiate between variants occurring in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like: -T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value. In order to safely use this module you should provide entire contigs worth of variants, and let the interval strat decide overlap, as opposed to using -L which will not properly work with symbolic variants. Minor improvements to create() interval in GenomeLocParser. --- .../varianteval/VariantEvalWalker.java | 31 +++++++ .../IntervalStratification.java | 93 +++++++++++++++++++ .../sting/utils/GenomeLocParser.java | 24 +++++ 3 files changed, 148 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index a9448bc06..21c1610c7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -15,11 +16,14 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; @@ -178,6 +182,12 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false) public boolean mergeEvals = false; + /** + * File containing tribble-readable features for the IntervalStratificiation + */ + @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=true) + protected IntervalBinding intervalsFile = null; + // Variables private Set jexlExpressions = new TreeSet(); @@ -260,6 +270,16 @@ public class VariantEvalWalker extends RodWalker implements Tr perSampleIsEnabled = true; } + if ( intervalsFile != null ) { + boolean fail = true; + for ( final VariantStratifier vs : stratificationObjects ) { + if ( vs.getClass().equals(IntervalStratification.class) ) + fail = false; + } + if ( fail ) + throw new UserException.BadArgumentValue("ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); + } + // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); @@ -532,6 +552,14 @@ public class VariantEvalWalker extends RodWalker implements Tr public Set getJexlExpressions() { return jexlExpressions; } + public List getIntervals() { + if ( intervalsFile == null ) + throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); + + return intervalsFile.getIntervals(getToolkit()); + } + + public Set getContigNames() { final TreeSet contigs = new TreeSet(); for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) { @@ -540,4 +568,7 @@ public class VariantEvalWalker extends RodWalker implements Tr return contigs; } + public GenomeLocParser getGenomeLocParser() { + return getToolkit().getGenomeLocParser(); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java new file mode 100644 index 000000000..bf001588a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import net.sf.picard.util.IntervalTree; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Stratifies the variants by whether they overlap an interval in the set provided on the command line. + * + * The primary use of this stratification is to provide a mechanism to divide asssessment of a call set up + * by whether a variant overlaps an interval or not. I use this to differentiate between variants occurring + * in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like: + * + * -T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification + * + * Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value. In order to + * safely use this module you should provide entire contigs worth of variants, and let the interval strat decide + * overlap, as opposed to using -L which will not properly work with symbolic variants. + */ +public class IntervalStratification extends VariantStratifier { + final protected static Logger logger = Logger.getLogger(IntervalStratification.class); + final Map> intervalTreeByContig = new HashMap>(); + + @Override + public void initialize() { + final List locs = getVariantEvalWalker().getIntervals(); + + if ( locs.isEmpty() ) + throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?"); + + logger.info(String.format("Creating IntervalStratification containing %d intervals covering %d bp", + locs.size(), IntervalUtils.intervalSize(locs))); + + // set up the map from contig -> interval tree + for ( final String contig : getVariantEvalWalker().getContigNames() ) + intervalTreeByContig.put(contig, new IntervalTree()); + + for ( final GenomeLoc loc : locs ) { + intervalTreeByContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), true); + } + + states = new ArrayList(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); + } + + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + final ArrayList relevantStates = new ArrayList(Arrays.asList("all")); + + if (eval != null) { + final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true); + IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); + IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); + //logger.info(String.format("Overlap %s found %s", loc, node)); + relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals"); + } + + return relevantStates; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 8d9768681..e10bcbaa0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -35,8 +35,10 @@ import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; /** * Factory class for creating GenomeLocs @@ -453,6 +455,28 @@ public class GenomeLocParser { return createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); } + /** + * Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible + * is true, and VC is a symbolic allele the end of the created genome loc will be the value + * of the END info field key, if it exists, or vc.getEnd() if not. + * + * @param vc + * @param includeSymbolicEndIfPossible + * @return + */ + public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) { + if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) { + int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd()); + return createGenomeLoc(vc.getChr(), vc.getStart(), end); + } + else + return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); + } + + public GenomeLoc createGenomeLoc(final VariantContext vc) { + return createGenomeLoc(vc, false); + } + /** * create a new genome loc, given the contig name, and a single position. Must be on the reference * From 714cac21c9d4b1a5a3d96b0c1299d1aa4d4348fd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 11:08:34 -0500 Subject: [PATCH 05/19] Testdata for IntervalStratification --- public/testdata/overlapTest.bed | 3 + public/testdata/withSymbolic.b37.vcf | 96 ++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 public/testdata/overlapTest.bed create mode 100644 public/testdata/withSymbolic.b37.vcf diff --git a/public/testdata/overlapTest.bed b/public/testdata/overlapTest.bed new file mode 100644 index 000000000..6859f1fdc --- /dev/null +++ b/public/testdata/overlapTest.bed @@ -0,0 +1,3 @@ +20 315000 315100 # should overlap 2 in withSymbolic.vcf at 315006 and 315072 +20 316955 316959 # should overlap only deletion variant at 316952 +20 317900 400000 # should overlap only the symbolic variant at 317173 diff --git a/public/testdata/withSymbolic.b37.vcf b/public/testdata/withSymbolic.b37.vcf new file mode 100644 index 000000000..4974f1229 --- /dev/null +++ b/public/testdata/withSymbolic.b37.vcf @@ -0,0 +1,96 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 315006 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.721;LDAF=0.0025;AVGPOST=0.9978;RSQ=0.6449;ERATE=0.0008;THETA=0.0006;AC=4;AN=2184 +20 315072 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.996;LDAF=0.0057;AVGPOST=0.9996;RSQ=0.9743;ERATE=0.0003;THETA=0.0016;AC=12;AN=2184 +20 315162 . T G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.950;LDAF=0.0005;AVGPOST=0.9998;RSQ=0.8078;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184 +20 315168 rs71327439 G GA 7142 PASS INDEL;BAVGPOST=0.999;BRSQ=0.990;LDAF=0.0575;AVGPOST=0.9985;RSQ=0.9891;ERATE=0.0003;THETA=0.0004;AC=125;AN=2184 +20 315201 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.965;LDAF=0.0005;AVGPOST=0.9999;RSQ=0.8599;ERATE=0.0003;THETA=0.0008;AC=1;AN=2184 +20 315214 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.974;LDAF=0.0009;AVGPOST=0.9990;RSQ=0.5860;ERATE=0.0003;THETA=0.0005;AC=1;AN=2184 +20 315270 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0557;AVGPOST=0.9992;RSQ=0.9950;ERATE=0.0003;THETA=0.0004;AC=121;AN=2184 +20 315279 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.991;LDAF=0.0572;AVGPOST=0.9990;RSQ=0.9926;ERATE=0.0003;THETA=0.0013;AC=125;AN=2184 +20 315320 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.995;LDAF=0.0028;AVGPOST=0.9998;RSQ=0.9594;ERATE=0.0005;THETA=0.0003;AC=6;AN=2184 +20 315322 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.814;LDAF=0.0007;AVGPOST=0.9994;RSQ=0.6510;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184 +20 315431 . A T 100 PASS LCSNP;EXSNP;BAVGPOST=0.987;BRSQ=0.864;LDAF=0.0431;AVGPOST=0.9873;RSQ=0.8675;ERATE=0.0006;THETA=0.0010;AC=86;AN=2184 +20 315481 . T A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.998;LDAF=0.0619;AVGPOST=0.9993;RSQ=0.9948;ERATE=0.0003;THETA=0.0007;AC=135;AN=2184 +20 315490 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.984;LDAF=0.0138;AVGPOST=0.9971;RSQ=0.9260;ERATE=0.0003;THETA=0.0046;AC=31;AN=2184 +20 315523 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.822;LDAF=0.0015;AVGPOST=0.9988;RSQ=0.6777;ERATE=0.0005;THETA=0.0003;AC=2;AN=2184 +20 315547 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.893;LDAF=0.0026;AVGPOST=0.9995;RSQ=0.9024;ERATE=0.0003;THETA=0.0004;AC=5;AN=2184 +20 315549 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.888;LDAF=0.0029;AVGPOST=0.9988;RSQ=0.8565;ERATE=0.0003;THETA=0.0013;AC=5;AN=2184 +20 315551 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.829;LDAF=0.0008;AVGPOST=0.9992;RSQ=0.5723;ERATE=0.0003;THETA=0.0012;AC=1;AN=2184 +20 315704 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.945;LDAF=0.0184;AVGPOST=0.9978;RSQ=0.9523;ERATE=0.0003;THETA=0.0017;AC=40;AN=2184 +20 315798 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.773;LDAF=0.0012;AVGPOST=0.9985;RSQ=0.4929;ERATE=0.0003;THETA=0.0005;AC=1;AN=2184 +20 315842 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.979;LDAF=0.0110;AVGPOST=0.9991;RSQ=0.9673;ERATE=0.0003;THETA=0.0005;AC=23;AN=2184 +20 315876 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.926;LDAF=0.0018;AVGPOST=0.9988;RSQ=0.7731;ERATE=0.0003;THETA=0.0007;AC=3;AN=2184 +20 316028 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0714;AVGPOST=0.9997;RSQ=0.9982;ERATE=0.0003;THETA=0.0003;AC=156;AN=2184 +20 316055 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.997;LDAF=0.1006;AVGPOST=0.9993;RSQ=0.9969;ERATE=0.0003;THETA=0.0007;AC=220;AN=2184 +20 316137 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.976;LDAF=0.0037;AVGPOST=0.9982;RSQ=0.7861;ERATE=0.0004;THETA=0.0009;AC=6;AN=2184 +20 316142 . C A 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.793;LDAF=0.0033;AVGPOST=0.9980;RSQ=0.7527;ERATE=0.0003;THETA=0.0003;AC=6;AN=2184 +20 316143 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.839;LDAF=0.0034;AVGPOST=0.9984;RSQ=0.8054;ERATE=0.0003;THETA=0.0003;AC=6;AN=2184 +20 316211 . C G 100 PASS LCSNP;EXSNP;BAVGPOST=0.997;BRSQ=0.976;LDAF=0.0565;AVGPOST=0.9983;RSQ=0.9872;ERATE=0.0003;THETA=0.0010;AC=124;AN=2184 +20 316285 . A AT 5514 PASS INDEL;BAVGPOST=0.999;BRSQ=0.993;LDAF=0.0552;AVGPOST=0.9978;RSQ=0.9829;ERATE=0.0004;THETA=0.0005;AC=119;AN=2184 +20 316295 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.808;LDAF=0.0021;AVGPOST=0.9980;RSQ=0.6390;ERATE=0.0003;THETA=0.0008;AC=4;AN=2184 +20 316481 . G T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0499;AVGPOST=0.9997;RSQ=0.9970;ERATE=0.0003;THETA=0.0007;AC=109;AN=2184 +20 316488 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.897;LDAF=0.0011;AVGPOST=0.9997;RSQ=0.8509;ERATE=0.0003;THETA=0.0006;AC=2;AN=2184 +20 316553 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.945;LDAF=0.0007;AVGPOST=0.9996;RSQ=0.7074;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184 +20 316659 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.998;LDAF=0.0497;AVGPOST=0.9995;RSQ=0.9960;ERATE=0.0003;THETA=0.0007;AC=109;AN=2184 +20 316691 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.985;LDAF=0.0058;AVGPOST=0.9989;RSQ=0.9301;ERATE=0.0003;THETA=0.0003;AC=13;AN=2184 +20 316700 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.737;LDAF=0.0030;AVGPOST=0.9971;RSQ=0.5700;ERATE=0.0009;THETA=0.0016;AC=4;AN=2184 +20 316725 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.853;LDAF=0.0011;AVGPOST=0.9995;RSQ=0.7944;ERATE=0.0003;THETA=0.0004;AC=2;AN=2184 +20 316770 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.966;LDAF=0.0017;AVGPOST=0.9991;RSQ=0.7543;ERATE=0.0005;THETA=0.0005;AC=3;AN=2184 +20 316813 . GTTC G 1701 PASS INDEL;BAVGPOST=0.995;BRSQ=0.850;LDAF=0.0156;AVGPOST=0.9987;RSQ=0.9611;ERATE=0.0003;THETA=0.0006;AC=34;AN=2184 +20 316824 rs11472305 T TTTC 5129 PASS INDEL;BAVGPOST=0.979;BRSQ=0.863;LDAF=0.0697;AVGPOST=0.9782;RSQ=0.8837;ERATE=0.0016;THETA=0.0006;AC=153;AN=2184 +20 316839 . CT C 1122 PASS INDEL;BAVGPOST=0.980;BRSQ=0.804;LDAF=0.0505;AVGPOST=0.9743;RSQ=0.8100;ERATE=0.0040;THETA=0.0009;AC=104;AN=2184 +20 316841 . TTC T 1096 PASS INDEL;BAVGPOST=0.981;BRSQ=0.824;LDAF=0.0491;AVGPOST=0.9783;RSQ=0.8367;ERATE=0.0029;THETA=0.0005;AC=105;AN=2184 +20 316842 . TCC T 110 PASS INDEL;BAVGPOST=0.983;BRSQ=0.592;LDAF=0.0203;AVGPOST=0.9772;RSQ=0.5576;ERATE=0.0022;THETA=0.0005;AC=30;AN=2184 +20 316845 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.645;LDAF=0.0015;AVGPOST=0.9989;RSQ=0.6666;ERATE=0.0003;THETA=0.0010;AC=2;AN=2184 +20 316853 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.990;BRSQ=0.922;LDAF=0.0688;AVGPOST=0.9742;RSQ=0.8456;ERATE=0.0049;THETA=0.0006;AC=133;AN=2184 +20 316882 rs11479165 CT C 145 PASS INDEL;BAVGPOST=0.991;BRSQ=0.484;LDAF=0.0074;AVGPOST=0.9920;RSQ=0.5655;ERATE=0.0005;THETA=0.0014;AC=9;AN=2184 +20 316889 . T TTC 108 PASS INDEL;BAVGPOST=0.978;BRSQ=0.235;LDAF=0.0521;AVGPOST=0.9304;RSQ=0.4479;ERATE=0.0035;THETA=0.0016;AC=63;AN=2184 +20 316901 . T TTC 272 PASS INDEL;BAVGPOST=0.979;BRSQ=0.510;LDAF=0.0363;AVGPOST=0.9527;RSQ=0.4348;ERATE=0.0071;THETA=0.0003;AC=34;AN=2184 +20 316917 . CT C 67 PASS INDEL;BAVGPOST=0.983;BRSQ=0.483;LDAF=0.0265;AVGPOST=0.9828;RSQ=0.7509;ERATE=0.0006;THETA=0.0011;AC=51;AN=2184 +20 316918 rs112071142 TTTC T 3049 PASS INDEL;BAVGPOST=0.980;BRSQ=0.801;LDAF=0.0588;AVGPOST=0.9761;RSQ=0.8444;ERATE=0.0026;THETA=0.0005;AC=120;AN=2184 +20 316924 . CT C 8 PASS INDEL;BAVGPOST=0.999;BRSQ=0.581;LDAF=0.0053;AVGPOST=0.9932;RSQ=0.5513;ERATE=0.0004;THETA=0.0011;AC=6;AN=2184 +20 316936 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.994;BRSQ=0.651;LDAF=0.0098;AVGPOST=0.9884;RSQ=0.5505;ERATE=0.0018;THETA=0.0005;AC=13;AN=2184 +20 316939 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.997;BRSQ=0.720;LDAF=0.0071;AVGPOST=0.9899;RSQ=0.4845;ERATE=0.0010;THETA=0.0033;AC=8;AN=2184 +20 316952 . CTCTTCCTCTTCT C 15835 PASS INDEL;BAVGPOST=0.891;BRSQ=0.667;LDAF=0.1650;AVGPOST=0.9042;RSQ=0.7274;ERATE=0.0037;THETA=0.0008;AC=294;AN=2184 +20 317003 . TCC T 333 PASS INDEL;BAVGPOST=0.890;BRSQ=0.114;LDAF=0.3450;AVGPOST=0.5526;RSQ=0.1674;ERATE=0.0300;THETA=0.0011;AC=350;AN=2184 +20 317022 rs111424933 TTTC T 4776 PASS INDEL;BAVGPOST=0.982;BRSQ=0.865;LDAF=0.0561;AVGPOST=0.9837;RSQ=0.8889;ERATE=0.0017;THETA=0.0004;AC=110;AN=2184 +20 317057 . C A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.775;LDAF=0.0014;AVGPOST=0.9985;RSQ=0.6049;ERATE=0.0003;THETA=0.0002;AC=3;AN=2184 +20 317135 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.995;BRSQ=0.992;LDAF=0.0489;AVGPOST=0.9964;RSQ=0.9749;ERATE=0.0004;THETA=0.0025;AC=106;AN=2184 +20 317173 MERGED_DEL_2_99440 A . . SV;BAVGPOST=0.998;BRSQ=0.577;LDAF=0.0018;AVGPOST=0.9990;RSQ=0.7465;ERATE=0.0003;THETA=0.0007;CIEND=-61,92;CIPOS=-84,73;END=319201;SOURCE=BreakDancer_317182_319211,GenomeStrip_317164_319190;SVTYPE=DEL;AC=2;AN=2184 +20 317174 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.995;BRSQ=0.991;LDAF=0.0502;AVGPOST=0.9962;RSQ=0.9727;ERATE=0.0005;THETA=0.0007;AC=107;AN=2184 +20 317266 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.999;LDAF=0.1748;AVGPOST=0.9988;RSQ=0.9973;ERATE=0.0003;THETA=0.0005;AC=383;AN=2184 +20 317448 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0068;AVGPOST=0.9988;RSQ=0.9405;ERATE=0.0003;THETA=0.0002;AC=16;AN=2184 +20 317491 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.991;LDAF=0.0609;AVGPOST=0.9993;RSQ=0.9939;ERATE=0.0003;THETA=0.0004;AC=133;AN=2184 +20 317546 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.837;LDAF=0.0007;AVGPOST=0.9994;RSQ=0.6154;ERATE=0.0003;THETA=0.0012;AC=1;AN=2184 +20 317578 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.996;BRSQ=0.882;LDAF=0.0180;AVGPOST=0.9981;RSQ=0.9606;ERATE=0.0003;THETA=0.0004;AC=40;AN=2184 +20 317658 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0609;AVGPOST=0.9998;RSQ=0.9980;ERATE=0.0003;THETA=0.0005;AC=133;AN=2184 +20 317676 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0608;AVGPOST=0.9999;RSQ=0.9992;ERATE=0.0003;THETA=0.0004;AC=133;AN=2184 +20 317710 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.952;LDAF=0.0014;AVGPOST=0.9997;RSQ=0.8880;ERATE=0.0004;THETA=0.0008;AC=3;AN=2184 +20 317824 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0555;AVGPOST=0.9993;RSQ=0.9947;ERATE=0.0003;THETA=0.0009;AC=121;AN=2184 From 67b022c34b6a3fc624f813de806e00bd7d625a3e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 13:27:13 -0500 Subject: [PATCH 07/19] Cleanup for new SampleUtils function -- getVCFHeadersFromRods(rods) is now available so that you don't have getVCFHeadersFromRods(rods, null) throughout the codebase --- .../gatk/walkers/varianteval/evaluators/CountVariants.java | 6 ++++-- .../gatk/walkers/varianteval/evaluators/G1KPhaseITable.java | 2 +- .../sting/gatk/walkers/variantutils/CombineVariants.java | 2 +- .../src/org/broadinstitute/sting/utils/SampleUtils.java | 2 +- .../org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java | 4 ++++ 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index e83434037..cba2781d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -41,6 +41,9 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nDeletions = 0; @DataPoint(description = "Number of complex indels") public long nComplex = 0; + @DataPoint(description = "Number of symbolic events") + public long nSymbolic = 0; + @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)") public long nMixed = 0; @@ -131,8 +134,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nMixed++; break; case SYMBOLIC: - // ignore symbolic alleles, but don't fail - // todo - consistent way of treating symbolic alleles thgoughout codebase? + nSymbolic++; break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java index 0e2ee70ef..3ab618496 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java @@ -103,7 +103,7 @@ public class G1KPhaseITable extends VariantEvaluator { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval == null ) return null; + if ( eval == null || eval.isMonomorphic() ) return null; switch (eval.getType()) { // case NO_VARIATION: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index ce03dfffe..573e15971 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -162,7 +162,7 @@ public class CombineVariants extends RodWalker { private boolean sitesOnlyVCF = false; public void initialize() { - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null); + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); if ( PRIORITY_STRING == null ) { PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index edc1413ba..68b220aab 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -150,7 +150,7 @@ public class SampleUtils { // iterate to get all of the sample names - for ( Map.Entry pair : VCFUtils.getVCFHeadersFromRods(toolkit, null).entrySet() ) { + for ( Map.Entry pair : VCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { Set vcfSamples = pair.getValue().getGenotypeSamples(); for ( String sample : vcfSamples ) addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 3c55c1ff9..5bd6a9b32 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -51,6 +51,10 @@ public class VCFUtils { return getVCFHeadersFromRods(toolkit, names); } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { + return getVCFHeadersFromRods(toolkit, (Collection)null); + } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Map data = new HashMap(); From dd1810140fc95e1a6d50d6fc0912bd9d26796d1f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 13:27:32 -0500 Subject: [PATCH 08/19] -stratIntervals is optional --- .../sting/gatk/walkers/varianteval/VariantEvalWalker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 21c1610c7..4e4a1550d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -185,7 +185,7 @@ public class VariantEvalWalker extends RodWalker implements Tr /** * File containing tribble-readable features for the IntervalStratificiation */ - @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=true) + @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false) protected IntervalBinding intervalsFile = null; // Variables From 060c7ce8aea6939c77aca62d87e8f5665bab1d80 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 10 Nov 2011 14:03:22 -0500 Subject: [PATCH 14/19] It wouldn't harm integrationtests if we had our logic right... :-) --- .../sting/gatk/filters/MalformedReadFilter.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java index 46d28185b..37a6cfc36 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java @@ -129,8 +129,14 @@ public class MalformedReadFilter extends ReadFilter { * @return true if they have the same number. False otherwise. */ private static boolean checkMismatchingBasesAndQuals(SAMRecord read, boolean filterMismatchingBaseAndQuals) { - if (!filterMismatchingBaseAndQuals) - throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals"); - return (read.getReadLength() == read.getBaseQualities().length); + boolean result; + if (read.getReadLength() == read.getBaseQualities().length) + result = true; + else if (filterMismatchingBaseAndQuals) + result = false; + else + throw new UserException.MalformedBAM(read, String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals]", read.getReadName(), read.getReadLength(), read.getBaseQualities().length)); + + return result; } } From 153e52ffed98a0676d69b215c3bf1b803ac3176f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 14:10:39 -0500 Subject: [PATCH 15/19] VariantEvalIntegrationTest for IntervalStratification --- .../VariantEvalIntegrationTest.java | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index cbfe0ceab..3dceb9bd2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -70,7 +70,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("1fefd6cf9c2554d5f886c3998defd4d0") + Arrays.asList("fb926edfd3d811e18b33798a43ef4379") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -91,7 +91,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("d470e00a368b5a0468012818994c6a89") + Arrays.asList("26b7d57e3a204ac80a28cb29485b59b7") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -113,7 +113,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("12856e52c2682328f91594089328596c") + Arrays.asList("1df8184062f330bea9da8bacacc5a09d") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -134,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("91610b9240f64e0eb03cfd2602cf57af") + Arrays.asList("927f26414509db9e7c0a2c067d57c949") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -155,7 +155,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e40b77e7ed6581328e373a24b93cd170") + Arrays.asList("e6fddefd95122cabc5a0f0b95bce6d34") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -176,7 +176,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("15beaf3823c131cabc5fb0445239f978") + Arrays.asList("df10486dae73a9cf8c647964f51ba3e0") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -197,7 +197,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("7ddd4ee74938d229ce5cb7b9b9104abe") + Arrays.asList("524adb0b7ff70e227b8803a88f36713e") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -220,7 +220,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a90f33906a732ef5eb346e559c96ccc1") + Arrays.asList("ef6449789dfc032602458b7c5538a1bc") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -245,7 +245,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("2567f90d3d7354850c5a59730ecc6e4f") + Arrays.asList("13b90e94fa82d72bb04a0a5addb27c3f") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -264,7 +264,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("fa091aa8967893389c51102fd9f0bebb") + Arrays.asList("8458b9d7803d75aae551fac7dbd152d6") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -277,7 +277,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("f70997b6a3e7fdc89d11e1d61a2463d4")); + 1, Arrays.asList("b954dee127ec4205ed7d33c91aa3e045")); executeTestParallel("testSelect1", spec); } @@ -294,7 +294,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("407682de41dcf139ea635e9cda21b912")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ae0027197547731a9a5c1eec5fbe0221")); executeTestParallel("testCompVsEvalAC",spec); } @@ -324,7 +324,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("424c9d438b1faa59b2c29413ba32f37b")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("835b44fc3004cc975c968c9f92ed25d6")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -336,7 +336,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18fa0b89ebfff51141975d7e4ce7a159")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("f0e003f1293343c3210ae95e8936b19a")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -414,8 +414,29 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("da65fc8f0d0eeaf0a0b06a07f444bb8e") + Arrays.asList("924b6123edb9da540d0abc66f6f33e16") ); executeTest("testAlleleCountStrat", spec); } + + @Test + public void testIntervalStrat() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + testDir + "/withSymbolic.b37.vcf", + "-noEV", + "-EV CountVariants", + "-noST", + "-stratIntervals " + testDir + "/overlapTest.bed", + "-ST IntervalStratification", + "-L 20", + "-o %s" + ), + 1, + Arrays.asList("9794e2dba205c6929dc89899fdf0bf6b") + ); + executeTest("testIntervalStrat", spec); + } } From dc9b351b5e21e84b4296995ea97ae25427ab4a94 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 10 Nov 2011 17:10:26 -0500 Subject: [PATCH 16/19] Meaningful error message when an IntervalArg file fails to parse correctly --- .../broadinstitute/sting/commandline/IntervalBinding.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index 86ca6c2df..f920d90ef 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -92,7 +92,10 @@ public final class IntervalBinding { codec.readHeader(lineReader); String line = lineReader.readLine(); while ( line != null ) { - intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line))); + final Feature feature = codec.decodeLoc(line); + if ( feature == null ) + throw new UserException.MalformedFile(featureIntervals.getSource(), "Couldn't parse line '" + line + "'"); + intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature)); line = lineReader.readLine(); } } catch (IOException e) {