diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index b00e03af2..eb68dd01a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -68,6 +68,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)") protected Boolean NO_STANDARD_STRATIFICATIONS = false; + @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) + protected Set typesToUse = null; + // Evaluator arguments @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; @@ -201,7 +204,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (tracker != null) { // track sample vc - HashMap> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames); + HashMap> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null); for ( String compName : compNames ) { VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null; @@ -210,6 +213,12 @@ public class VariantEvalWalker extends RodWalker implements Tr for ( String sampleName : sampleNamesForStratification ) { VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null; + if ( typesToUse != null ) { + if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null; + if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null; +// if ( eval != null ) logger.info("Keeping " + eval); + } + HashMap> stateMap = new HashMap>(); for ( VariantStratifier vs : stratificationObjects ) { ArrayList states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 9bbd69432..427aa834d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -267,27 +267,35 @@ public class VariantEvalUtils { * @param evalNames the evaluation track names * @return the set of allowable variation types */ - public EnumSet getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames) { - EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); + public EnumSet getAllowableVariationTypes(RefMetaDataTracker tracker, + ReferenceContext ref, + Set compNames, + Set evalNames, + boolean dynamicSelectTypes ) { + if ( dynamicSelectTypes ) { // todo -- this code is really conceptually broken + EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); - if (tracker != null) { - Collection evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false); + if (tracker != null) { + Collection evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false); - for (VariantContext vc : evalvcs) { - allowableTypes.add(vc.getType()); - } - - if (allowableTypes.size() == 1) { - // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types - Collection compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false); - - for (VariantContext vc : compvcs) { + for (VariantContext vc : evalvcs) { allowableTypes.add(vc.getType()); } - } - } - return allowableTypes; + if (allowableTypes.size() == 1) { + // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types + Collection compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false); + + for (VariantContext vc : compvcs) { + allowableTypes.add(vc.getType()); + } + } + } + + return allowableTypes; + } else { + return EnumSet.allOf(VariantContext.Type.class); + } } /** @@ -345,10 +353,9 @@ public class VariantEvalUtils { * @param subsetBySample if false, do not separate the track into per-sample VCs * @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need * to do this) - * @param allowNoCalls if false, don't accept no-call loci from a variant track * @return a mapping of track names to a list of VariantContext objects */ - public HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, EnumSet allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample, boolean allowNoCalls) { + public HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, EnumSet allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { HashMap> bindings = new HashMap>(); for (String trackName : trackNames) { @@ -365,7 +372,7 @@ public class VariantEvalUtils { vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); } - if ((byFilter || !vcsub.isFiltered()) && (allowNoCalls || vcsub.getType() != VariantContext.Type.NO_VARIATION)) { + if ((byFilter || !vcsub.isFiltered())) { vcs.put(VariantEvalWalker.getAllSampleName(), vcsub); } @@ -374,7 +381,7 @@ public class VariantEvalUtils { for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) { VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName); - if ((byFilter || !samplevc.isFiltered()) && (allowNoCalls || samplevc.getType() != VariantContext.Type.NO_VARIATION)) { + if ((byFilter || !samplevc.isFiltered())) { vcs.put(sampleName, samplevc); } } @@ -397,10 +404,10 @@ public class VariantEvalUtils { * @param evalNames the list of eval names to process * @return a mapping of track names to a list of VariantContext objects */ - public HashMap> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames) { + public HashMap> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames, boolean dynamicSelectTypes) { HashMap> vcs = new HashMap>(); - EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames); + EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes); boolean byFilter = false; boolean perSampleIsEnabled = false; @@ -412,8 +419,8 @@ public class VariantEvalUtils { } } - HashMap> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled, true); - HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false, false); + HashMap> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled); + HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false); vcs.putAll(compBindings); vcs.putAll(evalBindings); diff --git a/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala b/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala new file mode 100755 index 000000000..1a3330560 --- /dev/null +++ b/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala @@ -0,0 +1,147 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.queue.extensions.gatk.RodBind +import org.broadinstitute.sting.queue.extensions.gatk._ + +class StandardVariantEvalation extends QScript { + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName = "intervals", doc="intervals", required=false) + val TARGET_INTERVAL: String = "20"; + + val DATA_DIR = "data/" // todo -- make into an argument + + val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); + + val JOB_QUEUE = "hour" + + // todo -- add arguments to include other files for SNPs and indels + + val VARIANT_TYPES: List[String] = List("indels", "snps") + val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( + "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), + "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) + ) + + class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { + val originalFile = new File(DATA_DIR + filename) + val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile + val sitesFile = swapExt(file, ".vcf", ".sites.vcf") + } + + class Eval(val name: String, val evalType: String, val file: File) {} + + var COMPS: List[Comp] = Nil + def addComp(comp: Comp) { COMPS = comp :: COMPS } + + var EVALS: List[Eval] = Nil + def addEval(eval: Eval) { EVALS = eval :: EVALS } + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.intervalsString = List(TARGET_INTERVAL); + this.reference_sequence = referenceFile; + this.jobQueue = JOB_QUEUE; + this.memoryLimit = Some(2) + } + + def script = { + // + // Standard evaluation files for indels + // + addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) + addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf")) + addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true)) + addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) + addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) + + // + // INDEL call sets + // + addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf"))) + addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf"))) + addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf"))) + + // + // Standard evaluation files for SNPs + // + addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) + addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf")) + addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true)) + + // + // SNP call sets + // + addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))) + + // + // create hom-var versions of key files + // + for ( comp <- COMPS ) + if ( comp.MakeHomVar ) + add(new SelectHomVars(comp.originalFile, comp.file)) + + for ( comp <- COMPS ) + add(new JustSites(comp.file, comp.sitesFile)) + + // + // Loop over evaluation types + // + for ( evalType <- VARIANT_TYPES ) { + var evalsOfType = EVALS.filter(_.evalType == evalType) + val compsOfType = COMPS.filter(_.evalType == evalType) + + if ( evalsOfType.size > 1 ) { + val union: File = new File("union.%s.vcf".format(evalType)) + add(new MyCombine(evalsOfType.map(_.file), union)); + evalsOfType = new Eval("union", evalType, union) :: evalsOfType + } + + val VE = new MyEval() + VE.VT = VARIANT_TYPE_VT(evalType) + VE.o = new File(evalType + ".eval") + + // add evals + for ( calls <- evalsOfType ) + VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) + + // add comps + //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) + for ( comp <- compsOfType ) + VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) + + add(VE) + } + } + + + class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("variant", "VCF", vcf) + this.o = out + this.select ++= List("\"AC == 2\"") + } + + class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { + for ( vcf <- vcfs ) + this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.o = out + } + + class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { + def commandLine = "cut -f 1-8 %s > %s".format(in, out) + this.jobQueue = JOB_QUEUE; + } + + class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { + this.noST = true + this.evalModule :+= "ValidationReport" + } +} +