diff --git a/scala/qscript/playground/WholeGenomePipeline.scala b/scala/qscript/playground/WholeGenomePipeline.scala new file mode 100644 index 000000000..69b0003cc --- /dev/null +++ b/scala/qscript/playground/WholeGenomePipeline.scala @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils + +class WholeGenomePipeline extends QScript { + @Input(doc="Bam file list", shortName = "I", required=true) + var bamList: File = _ + + @Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true) + var cleanerTmpDir: String = _ + + @Argument(doc="Flag for running the whole genome (wg), chromosome 20 (chr20) or just the centromere of chromosome 1 (cent1). The default is cent1.", shortName="runType", required=false) + var runType = "cent1" + + @Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false) + var chunkSize = 3000000 + + val reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta" + val dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.vcf" + val hapmap = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + val omni = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + trait CommandLineGATKArgs extends CommandLineGATK { + this.reference_sequence = reference + this.intervalsString = runIntervals + this.memoryLimit = 2 + } + + case class ChrRange(chr: String, start: Long, stop: Long) { + def toInterval = "%s:%d-%d".format(chr, start, stop) + override def toString = toInterval + } + + val chr1Centromere = new ChrRange("1", 121429168, 121529169) + + var runIntervals = List.empty[String] + + def script() { + + var ranges = Traversable.empty[ChrRange] + val chrs = IntervalUtils.getContigSizes(reference) + runType match { + case "wg" => + // use all chromosomes from 1 to their length + ranges = chrs.map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } + runIntervals = Nil + case "chr20" => + // use chromosome 20 from 1 to its length + ranges = chrs.filterKeys(_ == "20").map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } + runIntervals = List(ranges.head.toInterval) + case "cent1" => + // use chromosome 1 centromere + ranges = List(chr1Centromere) + runIntervals = List(ranges.head.toInterval) + } + + val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_)) + val projectBase = project + "." + runType + + var chunkVcfs = List.empty[File] + for (range <- ranges) { + val chr = range.chr + val chrStart = range.start + val chrStop = range.stop + + var start = chrStart + var chunkNumber = 1 + + while (start <= chrStop) { + val stop = (start + chunkSize - 1) min chrStop + var interval = "%s:%d-%d".format(chr, start, stop) + val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber + val tmpBase: String = cleanerTmpDir + "/" + chunkBase + + val target = new RealignerTargetCreator with CommandLineGATKArgs + target.intervalsString :+= interval + target.input_file :+= bamList + target.mismatchFraction = 0.0 + target.maxIntervalSize = 700 + target.out = tmpBase + ".target.intervals" + target.jobOutputFile = chunkBase + ".target.out" + target.isIntermediate = true + add(target) + + val clean = new IndelRealigner with CommandLineGATKArgs + clean.intervalsString :+= interval + clean.input_file :+= bamList + clean.targetIntervals = target.out + clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) + clean.doNotUseSW = true + clean.simplifyBAM = true + clean.bam_compression = 1 + clean.out = tmpBase + ".cleaned.bam" + clean.jobOutputFile = chunkBase + ".clean.out" + clean.jobPriority = 51 + clean.isIntermediate = true + add(clean) + + val call = new UnifiedGenotyper with CommandLineGATKArgs + call.intervalsString :+= interval + call.input_file :+= clean.out + call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) + call.downsample_to_coverage = 50 + call.standard_min_confidence_threshold_for_calling = 4.0 + call.standard_min_confidence_threshold_for_emitting = 4.0 + call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY + call.exactCalculation = org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL + call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH + call.GSA_PRODUCTION_ONLY = true + call.out = chunkBase + ".vcf" + call.jobOutputFile = chunkBase + ".call.out" + call.jobPriority = 52 + add(call) + + chunkVcfs :+= call.out + start += chunkSize + chunkNumber += 1 + } + } + + val combineVCFs = new CombineVariants with CommandLineGATKArgs + combineVCFs.rodBind = chunkVcfs.zipWithIndex.map { case (vcf, index) => RodBind("input"+index, "VCF", vcf) } + combineVCFs.rod_priority_list = chunkVcfs.zipWithIndex.map { case (vcf, index) => "input"+index }.mkString(",") + combineVCFs.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION + combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE + combineVCFs.out = projectBase + ".merged.vcf" + combineVCFs.jobOutputFile = projectBase + ".combine.out" + combineVCFs.assumeIdenticalSamples = true + add(combineVCFs) + + val sv = new SelectVariants with CommandLineGATKArgs + sv.selectIndels = true + sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out) + sv.out = swapExt(combineVCFs.out, ".merged.vcf", ".indels.vcf") + sv.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".sv.out") + add(sv) + + val filter = new VariantFiltration with CommandLineGATKArgs + filter.variantVCF = sv.out + filter.filterName :+= "HARD_TO_VALIDATE" + filter.filterExpression :+= "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"" + filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf") + filter.jobOutputFile = swapExt(sv.jobOutputFile, ".sv.out", ".filter.out") + add(filter) + + val recombine = new CombineVariants with CommandLineGATKArgs + recombine.rodBind :+= RodBind("indels", "VCF", sv.out) + recombine.rodBind :+= RodBind("all", "VCF", combineVCFs.out) + recombine.rod_priority_list = "indels,all" + recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE + recombine.out = swapExt(combineVCFs.out, ".vcf", ".filtered.vcf") + recombine.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".recombine.out") + add(recombine) + + val vr = new VariantRecalibrator with CommandLineGATKArgs + vr.rodBind :+= RodBind("input", "VCF", recombine.out) + vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "training=true, prior=3.0") + vr.rodBind :+= RodBind("1kg", "VCF", omni, "training=true, prior=3.0") + vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true") + vr.use_annotation = List("QD", "SB", "HaplotypeScore", "HRun") + vr.trustAllPolymorphic = true + vr.TStranche = List("0.1","0.5","0.75","1.0","1.25","1.35","1.5","1.7","1.8","1.9","2.0","2.1","2.2","2.3","2.5","3.0","5.0","8.0","10.0") + vr.tranches_file = swapExt(filter.out, ".merged.filtered.vcf", ".tranches") + vr.recal_file = swapExt(filter.out, ".merged.filtered.vcf", "") + vr.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".cr.out") + add(vr) + + val ar = new ApplyRecalibration with CommandLineGATKArgs + ar.tranches_file = vr.tranches_file + ar.recal_file = vr.recal_file + ar.ignore_filter :+= "HARD_TO_VALIDATE" + ar.out = swapExt(filter.out, ".vcf", ".recalibrated.vcf") + ar.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".ar.out") + add(ar) + + val stdEval = new VariantEval with CommandLineGATKArgs + stdEval.tranchesFile = vr.tranches_file + stdEval.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) + stdEval.rodBind :+= RodBind("eval", "VCF", ar.out) + stdEval.doNotUseAllStandardStratifications = true + stdEval.doNotUseAllStandardModules = true + stdEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") + stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty") + stdEval.out = swapExt(ar.out, ".vcf", ".eval") + stdEval.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".eval.out") + add(stdEval) + } +} diff --git a/scala/qscript/playground/wgs.q b/scala/qscript/playground/wgs.q deleted file mode 100644 index 7b974e581..000000000 --- a/scala/qscript/playground/wgs.q +++ /dev/null @@ -1,209 +0,0 @@ -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.extensions.samtools._ -import org.broadinstitute.sting.queue.{QException, QScript} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType - -class WGSpipeline extends QScript { - qscript => - - @Input(doc="File with list of Bams", shortName = "bams", required = true) - var bamList: File = _ - - @Input(doc="path to GATK jar", shortName="gatk", required=true) - var gatkJar: File = _ - - @Input(doc="Project Name", shortName = "P", required = true) - var project: String =_ - - @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true) - var outputTmpDir: String = _ - - @Input(doc="reference file", shortName = "R", required = true) - var reference: File = _ - - @Input(doc="dbSNP", shortName = "D", required = true) - var dbSNP: File = _ - - @Input(doc="gsa-pipeline directory key", shortName = "dirKey", required = true) - var dirkey: String =_ - - @Input(doc="version id in format ### (ie 001)", shortName = "v", required = true) - var version: String =_ - - @Input(doc="Flag for running the entire genome. Otherwise a QC run of chr 20 is all that is run.", shortName = "freeze", required = false) - var freeze: Boolean = false - - - private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf" - - val hapmap = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" - val g1k = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf" - val omni = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" - - private var pipeline: Pipeline = _ - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference - this.memoryLimit = Some(3) - } - - def script = { - var chr: Int = 20 - var runType: String = "QC" - if (qscript.freeze) { - var chr: Int =20 - var runType = "freeze" - } //TODO: Impliment whole genome once infrastructure for waiting for tmp space is available. - // for now this will only run on chromosome 20 because otherwise we might run into space errors. - else{ - var chr: Int = 20 - var runType = "QC" - } - val dirbase = qscript.dirkey + "/" + runType + "_v" +qscript.version - val base = dirbase + "/Chunks/" + qscript.project + "." + runType - val projectBase = dirbase + "/" + qscript.project + "." + runType - val basesPerJob: Int = 3000000 - val lastBase: Int = 63025520 - var start: Int = 1 - var stop: Int = start - 1 + basesPerJob - if( stop > lastBase ) { stop = lastBase } - var jobNumber: Int = 1 - var mergeList: List[List[_]] = Nil - while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) { - var interval = "%d:%d-%d".format(chr, start, stop) - val baseTmpName: String = qscript.outputTmpDir + "/" +base + "chunk_" +jobNumber - val baseJobName: String = qscript.project + "_chunk_" + jobNumber - // 1.) Create cleaning targets - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.memoryLimit = Some(3) - target.input_file :+= qscript.bamList - target.intervalsString :+= interval - target.out = new File(baseTmpName + ".target.intervals") - target.mismatchFraction = Some(0.0) - target.maxIntervalSize = Some(700) - target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - target.jobName = baseJobName + ".target" - target.jobOutputFile = new File(baseTmpName + ".target.out") - target.isIntermediate = true - // 2.) Clean without SW - val clean = new IndelRealigner with CommandLineGATKArgs - val cleanedBam = new File(baseTmpName + ".cleaned.bam") - clean.memoryLimit = Some(4) - clean.input_file :+= qscript.bamList - clean.intervalsString :+= interval - clean.targetIntervals = target.out - clean.jobOutputFile = new File (baseTmpName + ".clean.out" ) - clean.out = cleanedBam - clean.doNotUseSW = true - clean.simplifyBAM = true - clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls) - clean.jobName = baseJobName + ".clean" - clean.isIntermediate = true - clean.compress = Some(0) - clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - val call = new UnifiedGenotyper with CommandLineGATKArgs - call.out = new File("/humgen/gsa-pipeline/"+base+ "chunk_" +jobNumber + ".vcf") - call.dcov = Some( 50 ) - call.stand_call_conf = Some( 4.0 ) - call.stand_emit_conf = Some( 4.0 ) - call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY - call.jobName = baseJobName +".call" - call.exactCalculation = org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL - call.jobOutputFile = new File ("/humgen/gsa-pipeline/"+base+ "chunk_" +jobNumber + ".call.out" ) - call.input_file = List(clean.out) - call.intervalsString :+= interval - call.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP ) - - mergeList:+= List(baseJobName, call.out) - add(target, clean, call) - - start += basesPerJob - stop += basesPerJob - if( stop > lastBase ) { stop = lastBase } - jobNumber += 1 - } - - val combineVCFs = new CombineVariants with CommandLineGATKArgs - combineVCFs.priority = "" - for (c <- mergeList ){ - combineVCFs.rodBind :+= RodBind(c(0).toString, "VCF", c(1).toString) - combineVCFs.priority += c(0).toString +"," - } - combineVCFs.variantMergeOptions = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION - combineVCFs.jobName = qscript.project + ".combine" - combineVCFs.out = new File("/humgen/gsa-pipeline/" + projectBase + ".merged.vcf") - combineVCFs.jobOutputFile = new File ("/humgen/gsa-pipeline/" +projectBase + ".combine.out") - combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE - combineVCFs.intervalsString :+= "20:1-63025520" ///TODO impliment only using the chr 20 or nothing if freeze - combineVCFs.assumeIdenticalSamples = true - - - val sv = new SelectVariants with CommandLineGATKArgs - sv.selectIndels = true - sv.out = swapExt(combineVCFs.out, "merged.vcf", "indelsOnly.vcf") - sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out) - sv.jobName = qscript.project + ".SV" - sv.jobOutputFile = swapExt(combineVCFs.jobOutputFile, "combine.out", "sv.out") - - val filter = new VariantFiltration with CommandLineGATKArgs - filter.intervalsString :+= "20:1-63025520" ///TODO impliment only using the chr 20 or nothing if freeze - filter.variantVCF = sv.out - filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf") - filter.filterName ++= List("HARD_TO_VALIDATE") //ToDO check to make sure that this is what Guillermo recomends - filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - filter.jobName = qscript.project + ".VF" - filter.jobOutputFile = swapExt(sv.jobOutputFile, ".sv.out", ".filter.out") - - - val recombine = new CombineVariants with CommandLineGATKArgs - recombine.rodBind :+= RodBind("indels", "VCF", sv.out) - recombine.rodBind :+= RodBind("all", "VCF", combineVCFs.out) - recombine.priority = "indels,all" - recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE - recombine.out = swapExt(combineVCFs.out, "vcf", "filtered.vcf") - recombine.jobName = qscript.project + ".recombine" - recombine.jobOutputFile = swapExt(combineVCFs.jobOutputFile, "combine.out", "recombine.out") - - val cr = new VariantRecalibrator with CommandLineGATKArgs - cr.rodBind :+= RodBind("hapmap","VCF", qscript.hapmap, "training=true, prior=3.0") - cr.rodBind :+= RodBind("1kg", "VCF", qscript.omni, "training=true, prior=3.0") - cr.rodBind :+= RodBind("input", "VCF", recombine.out) - cr.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP, "known=true") - cr.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") - cr.jobName = qscript.project + ".cr" - cr.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".cr.out") - cr.intervalsString :+= "20" ///TODO impliment only using the chr 20 or nothing if freeze - cr.allPoly = true - cr.tranches_file =swapExt(filter.out, "merged.filtered.vcf", "tranches") - cr.recal_file = swapExt(filter.out, "merged.filtered.vcf", "") - cr.tranche ++= List("0.1","0.5","0.75","1.0","1.25","1.35","1.5","1.7","1.8","1.9","2.0","2.1","2.2","2.3","2.5","3.0","5.0","8.0","10.0") - - val ar = new ApplyRecalibration with CommandLineGATKArgs - ar.tranches_file = cr.tranches_file - ar.recal_file = cr.recal_file - ar.ignore_filter++= List("HARD_TO_VALIDATE") - ar.out = swapExt(filter.out, "vcf", "recalibrated.vcf") - ar.jobName = qscript.project + ".ar" - ar.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".ar.out") - - val stdEval = new VariantEval with CommandLineGATKArgs - stdEval.jobName = qscript.project+".eval" - stdEval.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".eval.out") - stdEval.noST = true - stdEval.noEV = true - stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty") - stdEval.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) - stdEval.rodBind :+= RodBind("eval", "VCF", ar.out) - stdEval.out = swapExt(ar.out, ".vcf", ".eval") - stdEval.tranchesFile = cr.tranches_file - - add(combineVCFs, filter, sv, recombine, cr, ar, stdEval) - } -}