From 5c979633f0562a8fee1a1a14aa9fc9383eddfb48 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 6 Mar 2011 19:31:12 +0000 Subject: [PATCH] Due to a problem in the way that dynamic type selection works, I've added an explicit (temporary) ability to restrict VE to specific variant types (SNPs, INDELs, etc), so that calculations will work when a site has a SNP in dbSNP but is called as an indel, causing the SNP site to mysteriously disappear from the comp track, a huge problem for validation report. VEU updated to allow both dynamic type (old) and just returning everything in the track. Also, created a standard Queue script that calculates a suite of standard indel and SNP assessment results. Will be the basis for a general evaluation Queue script with standardized data files for SNPs and Indels. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5385 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/VariantEvalWalker.java | 11 +- .../varianteval/util/VariantEvalUtils.java | 55 ++++--- .../depristo/StandardVariantEvalation.scala | 147 ++++++++++++++++++ 3 files changed, 188 insertions(+), 25 deletions(-) create mode 100755 scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index b00e03af2..eb68dd01a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -68,6 +68,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)") protected Boolean NO_STANDARD_STRATIFICATIONS = false; + @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) + protected Set typesToUse = null; + // Evaluator arguments @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; @@ -201,7 +204,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (tracker != null) { // track sample vc - HashMap> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames); + HashMap> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null); for ( String compName : compNames ) { VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null; @@ -210,6 +213,12 @@ public class VariantEvalWalker extends RodWalker implements Tr for ( String sampleName : sampleNamesForStratification ) { VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null; + if ( typesToUse != null ) { + if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null; + if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null; +// if ( eval != null ) logger.info("Keeping " + eval); + } + HashMap> stateMap = new HashMap>(); for ( VariantStratifier vs : stratificationObjects ) { ArrayList states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 9bbd69432..427aa834d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -267,27 +267,35 @@ public class VariantEvalUtils { * @param evalNames the evaluation track names * @return the set of allowable variation types */ - public EnumSet getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames) { - EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); + public EnumSet getAllowableVariationTypes(RefMetaDataTracker tracker, + ReferenceContext ref, + Set compNames, + Set evalNames, + boolean dynamicSelectTypes ) { + if ( dynamicSelectTypes ) { // todo -- this code is really conceptually broken + EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); - if (tracker != null) { - Collection evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false); + if (tracker != null) { + Collection evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false); - for (VariantContext vc : evalvcs) { - allowableTypes.add(vc.getType()); - } - - if (allowableTypes.size() == 1) { - // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types - Collection compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false); - - for (VariantContext vc : compvcs) { + for (VariantContext vc : evalvcs) { allowableTypes.add(vc.getType()); } - } - } - return allowableTypes; + if (allowableTypes.size() == 1) { + // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types + Collection compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false); + + for (VariantContext vc : compvcs) { + allowableTypes.add(vc.getType()); + } + } + } + + return allowableTypes; + } else { + return EnumSet.allOf(VariantContext.Type.class); + } } /** @@ -345,10 +353,9 @@ public class VariantEvalUtils { * @param subsetBySample if false, do not separate the track into per-sample VCs * @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need * to do this) - * @param allowNoCalls if false, don't accept no-call loci from a variant track * @return a mapping of track names to a list of VariantContext objects */ - public HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, EnumSet allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample, boolean allowNoCalls) { + public HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, EnumSet allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { HashMap> bindings = new HashMap>(); for (String trackName : trackNames) { @@ -365,7 +372,7 @@ public class VariantEvalUtils { vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); } - if ((byFilter || !vcsub.isFiltered()) && (allowNoCalls || vcsub.getType() != VariantContext.Type.NO_VARIATION)) { + if ((byFilter || !vcsub.isFiltered())) { vcs.put(VariantEvalWalker.getAllSampleName(), vcsub); } @@ -374,7 +381,7 @@ public class VariantEvalUtils { for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) { VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName); - if ((byFilter || !samplevc.isFiltered()) && (allowNoCalls || samplevc.getType() != VariantContext.Type.NO_VARIATION)) { + if ((byFilter || !samplevc.isFiltered())) { vcs.put(sampleName, samplevc); } } @@ -397,10 +404,10 @@ public class VariantEvalUtils { * @param evalNames the list of eval names to process * @return a mapping of track names to a list of VariantContext objects */ - public HashMap> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames) { + public HashMap> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames, boolean dynamicSelectTypes) { HashMap> vcs = new HashMap>(); - EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames); + EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes); boolean byFilter = false; boolean perSampleIsEnabled = false; @@ -412,8 +419,8 @@ public class VariantEvalUtils { } } - HashMap> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled, true); - HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false, false); + HashMap> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled); + HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false); vcs.putAll(compBindings); vcs.putAll(evalBindings); diff --git a/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala b/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala new file mode 100755 index 000000000..1a3330560 --- /dev/null +++ b/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala @@ -0,0 +1,147 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.queue.extensions.gatk.RodBind +import org.broadinstitute.sting.queue.extensions.gatk._ + +class StandardVariantEvalation extends QScript { + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName = "intervals", doc="intervals", required=false) + val TARGET_INTERVAL: String = "20"; + + val DATA_DIR = "data/" // todo -- make into an argument + + val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); + + val JOB_QUEUE = "hour" + + // todo -- add arguments to include other files for SNPs and indels + + val VARIANT_TYPES: List[String] = List("indels", "snps") + val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( + "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), + "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) + ) + + class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { + val originalFile = new File(DATA_DIR + filename) + val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile + val sitesFile = swapExt(file, ".vcf", ".sites.vcf") + } + + class Eval(val name: String, val evalType: String, val file: File) {} + + var COMPS: List[Comp] = Nil + def addComp(comp: Comp) { COMPS = comp :: COMPS } + + var EVALS: List[Eval] = Nil + def addEval(eval: Eval) { EVALS = eval :: EVALS } + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.intervalsString = List(TARGET_INTERVAL); + this.reference_sequence = referenceFile; + this.jobQueue = JOB_QUEUE; + this.memoryLimit = Some(2) + } + + def script = { + // + // Standard evaluation files for indels + // + addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) + addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf")) + addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true)) + addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) + addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) + + // + // INDEL call sets + // + addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf"))) + addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf"))) + addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf"))) + + // + // Standard evaluation files for SNPs + // + addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) + addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf")) + addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true)) + + // + // SNP call sets + // + addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))) + + // + // create hom-var versions of key files + // + for ( comp <- COMPS ) + if ( comp.MakeHomVar ) + add(new SelectHomVars(comp.originalFile, comp.file)) + + for ( comp <- COMPS ) + add(new JustSites(comp.file, comp.sitesFile)) + + // + // Loop over evaluation types + // + for ( evalType <- VARIANT_TYPES ) { + var evalsOfType = EVALS.filter(_.evalType == evalType) + val compsOfType = COMPS.filter(_.evalType == evalType) + + if ( evalsOfType.size > 1 ) { + val union: File = new File("union.%s.vcf".format(evalType)) + add(new MyCombine(evalsOfType.map(_.file), union)); + evalsOfType = new Eval("union", evalType, union) :: evalsOfType + } + + val VE = new MyEval() + VE.VT = VARIANT_TYPE_VT(evalType) + VE.o = new File(evalType + ".eval") + + // add evals + for ( calls <- evalsOfType ) + VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) + + // add comps + //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) + for ( comp <- compsOfType ) + VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) + + add(VE) + } + } + + + class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("variant", "VCF", vcf) + this.o = out + this.select ++= List("\"AC == 2\"") + } + + class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { + for ( vcf <- vcfs ) + this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.o = out + } + + class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { + def commandLine = "cut -f 1-8 %s > %s".format(in, out) + this.jobQueue = JOB_QUEUE; + } + + class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { + this.noST = true + this.evalModule :+= "ValidationReport" + } +} +