diff --git a/scala/qscript/core/StandardVariantEvaluation.scala b/scala/qscript/core/StandardVariantEvaluation.scala new file mode 100755 index 000000000..e316b7ad5 --- /dev/null +++ b/scala/qscript/core/StandardVariantEvaluation.scala @@ -0,0 +1,191 @@ +package core + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk.RodBind +import org.broadinstitute.sting.queue.extensions.gatk._ + +class StandardVariantEvaluation extends QScript { + // todo -- update to released version when things stabilize + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName = "intervals", doc="intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false) + val TARGET_INTERVAL: String = "20" + + @Argument(shortName = "includeUnion", doc="If provided, we'll create a union of the evaluation data sets for evaluation", required=false) + val CREATE_UNION: Boolean = false + + @Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false) + val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/" + + val COMPS_DIR = DATA_DIR + "/comps/" + val EVALS_DIR = DATA_DIR + "/evals/" + + @Argument(shortName = "moreSNPsToEval", doc="Path to additional SNP call sets for evaluation", required=false) + val moreSNPsToEval: List[File] = Nil + + @Argument(shortName = "moreIndelsToEval", doc="Path to additional Indel call sets for evaluation", required=false) + val moreIndelsToEval: List[File] = Nil + + val VARIANT_TYPES: List[String] = List("indels", "snps") + val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( + "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), + "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) + ) + + val SITES_DIR: String = "sitesFiles" + + // path to b37 DBSNP + @Argument(shortName = "dbsnp", doc="Path to DBSNP **VCF** for evaluation", required=false) + val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") + //val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); + + class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { + val originalFile = new File(COMPS_DIR + filename) + val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile + val sitesFile = new File(SITES_DIR + "/" + swapExt(file, ".vcf", ".sites.vcf").getName) + } + + class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) { + val file: File = if ( overrideFile != null ) overrideFile else new File(EVALS_DIR + "/" + filename) + } + + var COMPS: List[Comp] = Nil + def addComp(comp: Comp) { COMPS = comp :: COMPS } + + var EVALS: List[Eval] = Nil + def addEval(eval: Eval) { EVALS = eval :: EVALS } + def addEvalFromCMD(file: File, t: String) { addEval(new Eval(file.getName, t, null, file)) } + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.intervalsString = List(TARGET_INTERVAL); + this.reference_sequence = referenceFile; + this.memoryLimit = Some(2) + } + + def initializeStandardDataFiles() = { + // + // Standard evaluation files for indels + // + addComp(new Comp("NA12878.homvar.GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) + addComp(new Comp("CG.38samples", "indels", "CG.Indels.leftAligned.b37.vcf")) + addComp(new Comp("NA12878.homvar.CG", "indels", "NA12878.CG.b37.indels.vcf", true)) + addComp(new Comp("g1k.pilot1.validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) + addComp(new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) + + // + // INDEL call sets + // + addEval(new Eval("dindel", "indels", "20110208.chr20.dindel2.EUR.sites.vcf")) + addEval(new Eval("si", "indels", "20101123.chr20.si.v2.EUR.sites.vcf")) + addEval(new Eval("gatk", "indels", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf")) + + // + // Standard evaluation files for SNPs + // + addComp(new Comp("NA12878.homvar.GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) + addComp(new Comp("CG.38samples", "snps", "CG.38samples.b37.vcf")) + addComp(new Comp("NA12878.homvar.CG", "snps", "NA12878.CG.b37.snps.vcf", true)) + addComp(new Comp("HapMap3.3", "snps", "hapmap3.3.sites_r27_nr.b37_fwd.vcf")) + addComp(new Comp("OMNI.2.5M", "snps", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf")) + addComp(new Comp("g1k.pilot1.validation", "snps", "1000G.snp.validation.b37.vcf")) + + // + // SNP call sets + // + addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")) + // todo -- are there other good call sets for evaluation? + // todo -- add hg19 na12878 64x + } + + def script = { + val sitesDir = new File(SITES_DIR) + if ( ! sitesDir.exists ) sitesDir.mkdirs() + + initializeStandardDataFiles(); + + // add additional files for evaluation, if necessary + moreSNPsToEval.foreach(addEvalFromCMD(_, "snps")) + moreIndelsToEval.foreach(addEvalFromCMD(_, "indels")) + + // + // create hom-var versions of key files + // + for ( comp <- COMPS ) + if ( comp.MakeHomVar ) + add(new SelectHomVars(comp.originalFile, comp.file)) + + for ( comp <- COMPS ) + add(new JustSites(comp.file, comp.sitesFile)) + + // + // Loop over evaluation types + // + for ( evalType <- VARIANT_TYPES ) { + var evalsOfType = EVALS.filter(_.evalType == evalType) + val compsOfType = COMPS.filter(_.evalType == evalType) + + // if desired and possible, create a union.X.vcf file + if ( CREATE_UNION && evalsOfType.size > 1 ) { + val union: File = new File("union.%s.vcf".format(evalType)) + add(new MyCombine(evalsOfType.map(_.file), union)); + evalsOfType = new Eval("union", evalType, null, union) :: evalsOfType + } + + // our root VE + val VE = new MyEval() + VE.VT = VARIANT_TYPE_VT(evalType) + VE.o = new File(evalType + ".eval") + + // add evals + for ( calls <- evalsOfType ) + VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) + + // add comps + //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) + for ( comp <- compsOfType ) + VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) + + add(VE) + } + } + + /** + * Select homozygous non-reference sites from a single deep data set + */ + class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("variant", "VCF", vcf) + this.o = out + this.select ++= List("\"AC == 2\"") + } + + /** + * A simple union + */ + class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { + for ( vcf <- vcfs ) + this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.o = out + } + + /** + * A command line (cut) that removes all genotyping information from a file + */ + class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { + def commandLine = "cut -f 1-8 %s > %s".format(in, out) + } + + /** + * Base class for VariantEval used here + */ + class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { + this.noST = true + this.evalModule :+= "ValidationReport" + } +} + diff --git a/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala b/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala deleted file mode 100755 index 1a3330560..000000000 --- a/scala/qscript/oneoffs/depristo/StandardVariantEvalation.scala +++ /dev/null @@ -1,147 +0,0 @@ -package oneoffs.depristo - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction -import org.broadinstitute.sting.queue.extensions.gatk.RodBind -import org.broadinstitute.sting.queue.extensions.gatk._ - -class StandardVariantEvalation extends QScript { - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="ref", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "intervals", doc="intervals", required=false) - val TARGET_INTERVAL: String = "20"; - - val DATA_DIR = "data/" // todo -- make into an argument - - val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); - - val JOB_QUEUE = "hour" - - // todo -- add arguments to include other files for SNPs and indels - - val VARIANT_TYPES: List[String] = List("indels", "snps") - val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( - "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), - "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) - ) - - class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { - val originalFile = new File(DATA_DIR + filename) - val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile - val sitesFile = swapExt(file, ".vcf", ".sites.vcf") - } - - class Eval(val name: String, val evalType: String, val file: File) {} - - var COMPS: List[Comp] = Nil - def addComp(comp: Comp) { COMPS = comp :: COMPS } - - var EVALS: List[Eval] = Nil - def addEval(eval: Eval) { EVALS = eval :: EVALS } - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.intervalsString = List(TARGET_INTERVAL); - this.reference_sequence = referenceFile; - this.jobQueue = JOB_QUEUE; - this.memoryLimit = Some(2) - } - - def script = { - // - // Standard evaluation files for indels - // - addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) - addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf")) - addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true)) - addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) - addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) - - // - // INDEL call sets - // - addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf"))) - addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf"))) - addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf"))) - - // - // Standard evaluation files for SNPs - // - addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) - addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf")) - addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true)) - - // - // SNP call sets - // - addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))) - - // - // create hom-var versions of key files - // - for ( comp <- COMPS ) - if ( comp.MakeHomVar ) - add(new SelectHomVars(comp.originalFile, comp.file)) - - for ( comp <- COMPS ) - add(new JustSites(comp.file, comp.sitesFile)) - - // - // Loop over evaluation types - // - for ( evalType <- VARIANT_TYPES ) { - var evalsOfType = EVALS.filter(_.evalType == evalType) - val compsOfType = COMPS.filter(_.evalType == evalType) - - if ( evalsOfType.size > 1 ) { - val union: File = new File("union.%s.vcf".format(evalType)) - add(new MyCombine(evalsOfType.map(_.file), union)); - evalsOfType = new Eval("union", evalType, union) :: evalsOfType - } - - val VE = new MyEval() - VE.VT = VARIANT_TYPE_VT(evalType) - VE.o = new File(evalType + ".eval") - - // add evals - for ( calls <- evalsOfType ) - VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) - - // add comps - //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) - for ( comp <- compsOfType ) - VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) - - add(VE) - } - } - - - class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", vcf) - this.o = out - this.select ++= List("\"AC == 2\"") - } - - class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { - for ( vcf <- vcfs ) - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) - this.o = out - } - - class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { - def commandLine = "cut -f 1-8 %s > %s".format(in, out) - this.jobQueue = JOB_QUEUE; - } - - class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { - this.noST = true - this.evalModule :+= "ValidationReport" - } -} - diff --git a/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala b/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala index 6d4836fe8..33d7f16ea 100755 --- a/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala +++ b/scala/qscript/oneoffs/depristo/manySampleUGPerformance.scala @@ -9,8 +9,18 @@ class ManySampleUGPerformanceTesting extends QScript { @Argument(shortName = "R", doc="ref", required=false) var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - val TARGET_INTERVAL = "my.intervals" - val FULL_BAM_LIST = new File("allPopulations_phase1_release.no_solid.list") + @Argument(shortName = "bams", doc="BAMs", required=true) + val FULL_BAM_LIST: File = null; + + @Argument(shortName = "intervals", doc="intervals", required=true) + val TARGET_INTERVAL: String = null; + + @Argument(shortName = "preMerge", doc="preMerge", required=false) + val PRE_MERGE: Boolean = false; + + @Argument(shortName = "dcov", doc="dcov", required=false) + val DCOV: Int = 50; + val MERGED_DIR = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/manySampleUGPerformance/") trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { @@ -24,26 +34,37 @@ class ManySampleUGPerformanceTesting extends QScript { } def script = { - for (nSamples <- List(1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900)) { + for (nSamples <- List(1, 2, 4, 8, 16, 32, 60)) { +// for (nSamples <- List(1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900)) { // for (nSamples <- List(10)) { val sublist = new SliceList(nSamples) val mergeSublist = new MergeBAMs(sublist.list) + val name: String = if ( PRE_MERGE ) "pre_merge" else "dynamic_merge" + val bams: File = if ( PRE_MERGE ) mergeSublist.o else sublist.list + add(sublist) - add(mergeSublist) - add(new Index(mergeSublist.o) ) + if ( PRE_MERGE ) { + add(mergeSublist) + add(new Index(mergeSublist.o) ) + } // SNP calling //add(new Call(sublist.list, nSamples, "dynamic_merge")) - add(new Call(mergeSublist.o, nSamples, "pre_merge")) + val gt = new Call(bams, nSamples, name); + gt.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.N2_GOLD_STANDARD) + add(gt) + + val gtLinear = new Call(bams, nSamples, name + "_linear"); + gtLinear.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL) + add(gtLinear) // SNP calling -- no annotations - //add(new Call(sublist.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; }) - add(new Call(mergeSublist.o, nSamples, "pre_merge_no_annotations") { this.G :+= "none"; }) + //add(new Call(bams.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; }) // CountLoci //add(new MyCountLoci(sublist.list, nSamples, "dynamic_merge")) - add(new MyCountLoci(mergeSublist.o, nSamples, "pre_merge")) + add(new MyCountLoci(bams, nSamples, name)) } } @@ -63,20 +84,21 @@ class ManySampleUGPerformanceTesting extends QScript { @Output(doc="foo") var outVCF: File = new File("%s.%d.%s.vcf".format(bamList.getName, n, name)) this.input_file :+= bamList this.stand_call_conf = Option(10.0) - this.dcov = Option(50); + this.dcov = Option(DCOV); this.o = outVCF } class MyCountLoci(@Input(doc="foo") bamList: File, n: Int, name: String) extends CountLoci with UNIVERSAL_GATK_ARGS { @Output(doc="foo") var outFile: File = new File("%s.%d.%s.txt".format(bamList.getName, n, name)) this.input_file :+= bamList - this.dcov = Option(50); + this.dcov = Option(DCOV); this.o = outFile } class SliceList(n: Int) extends CommandLineFunction { @Output(doc="foo") var list: File = new File("bams.%d.list".format(n)) def commandLine = "head -n %d %s > %s".format(n, FULL_BAM_LIST, list) + this.jobQueue = "gsa"; } }