From 95fc6c0a835898a4183880f5dc71b39759c98877 Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 13 May 2011 14:38:56 +0000 Subject: [PATCH] Changed VR tranches from old 0.1-10 to new 100 to 90. Using hapmap training and truth based on wiki. Explicitly setting the ts_filter_level even though 99.0 is the default. Recal file path now ends with with .recal. Added ar's vcf input. Omni rod name now omni instead of 1kg. The VR RodBind tags had spaces in them. Was passing both the full intervals and the chunk intervals to chunk jobs. Switched back to chr20 for default since the VR crashes on small intervals sets with "MESSAGE: Matrix is singular." Log files names based on the file paths + .out. Added eval statifications by sample based on the Hybrid Selection / Whole Exome pipeline. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5800 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/WholeGenomePipeline.scala | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/scala/qscript/playground/WholeGenomePipeline.scala b/scala/qscript/playground/WholeGenomePipeline.scala index 69b0003cc..4fc2ffcf1 100644 --- a/scala/qscript/playground/WholeGenomePipeline.scala +++ b/scala/qscript/playground/WholeGenomePipeline.scala @@ -34,8 +34,8 @@ class WholeGenomePipeline extends QScript { @Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true) var cleanerTmpDir: String = _ - @Argument(doc="Flag for running the whole genome (wg), chromosome 20 (chr20) or just the centromere of chromosome 1 (cent1). The default is cent1.", shortName="runType", required=false) - var runType = "cent1" + @Argument(doc="Flag for running the whole genome (wg) or chromosome 20 (chr20). The default is chr20.", shortName="runType", required=false) + var runType = "chr20" @Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false) var chunkSize = 3000000 @@ -56,27 +56,32 @@ class WholeGenomePipeline extends QScript { override def toString = toInterval } - val chr1Centromere = new ChrRange("1", 121429168, 121529169) - var runIntervals = List.empty[String] def script() { + val rangeMap = Map( + "cent1" -> new ChrRange("1", 121429168, 121529168), + "cent16" -> new ChrRange("16", 40844464, 40944464), + "chr20" -> new ChrRange("20", 1, 63025520), + "chr20_100k" -> new ChrRange("20", 100001, 200000)) + var ranges = Traversable.empty[ChrRange] val chrs = IntervalUtils.getContigSizes(reference) - runType match { - case "wg" => + + runType = runType.toLowerCase + if (runType == "wg") { // use all chromosomes from 1 to their length ranges = chrs.map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } runIntervals = Nil - case "chr20" => - // use chromosome 20 from 1 to its length - ranges = chrs.filterKeys(_ == "20").map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } - runIntervals = List(ranges.head.toInterval) - case "cent1" => - // use chromosome 1 centromere - ranges = List(chr1Centromere) - runIntervals = List(ranges.head.toInterval) + } else { + rangeMap.get(runType) match { + case Some(range) => + ranges = List(range) + runIntervals = List(range.toInterval) + case None => + throw new RuntimeException("Inalid runType: " + runType + ". Must be one of: " + rangeMap.keys.mkString(", ") + ", or wg") + } } val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_)) @@ -93,22 +98,24 @@ class WholeGenomePipeline extends QScript { while (start <= chrStop) { val stop = (start + chunkSize - 1) min chrStop - var interval = "%s:%d-%d".format(chr, start, stop) + val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber val tmpBase: String = cleanerTmpDir + "/" + chunkBase + val chunkInterval = "%s:%d-%d".format(chr, start, stop) + val target = new RealignerTargetCreator with CommandLineGATKArgs - target.intervalsString :+= interval + target.intervalsString = List(chunkInterval) target.input_file :+= bamList target.mismatchFraction = 0.0 target.maxIntervalSize = 700 target.out = tmpBase + ".target.intervals" - target.jobOutputFile = chunkBase + ".target.out" + target.jobOutputFile = chunkBase + ".target.intervals.out" target.isIntermediate = true add(target) val clean = new IndelRealigner with CommandLineGATKArgs - clean.intervalsString :+= interval + clean.intervalsString = List(chunkInterval) clean.input_file :+= bamList clean.targetIntervals = target.out clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) @@ -116,13 +123,13 @@ class WholeGenomePipeline extends QScript { clean.simplifyBAM = true clean.bam_compression = 1 clean.out = tmpBase + ".cleaned.bam" - clean.jobOutputFile = chunkBase + ".clean.out" + clean.jobOutputFile = chunkBase + ".cleaned.bam.out" clean.jobPriority = 51 clean.isIntermediate = true add(clean) val call = new UnifiedGenotyper with CommandLineGATKArgs - call.intervalsString :+= interval + call.intervalsString = List(chunkInterval) call.input_file :+= clean.out call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) call.downsample_to_coverage = 50 @@ -133,7 +140,7 @@ class WholeGenomePipeline extends QScript { call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH call.GSA_PRODUCTION_ONLY = true call.out = chunkBase + ".vcf" - call.jobOutputFile = chunkBase + ".call.out" + call.jobOutputFile = call.out + ".out" call.jobPriority = 52 add(call) @@ -149,15 +156,15 @@ class WholeGenomePipeline extends QScript { combineVCFs.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE combineVCFs.out = projectBase + ".merged.vcf" - combineVCFs.jobOutputFile = projectBase + ".combine.out" + combineVCFs.jobOutputFile = combineVCFs.out + ".out" combineVCFs.assumeIdenticalSamples = true add(combineVCFs) val sv = new SelectVariants with CommandLineGATKArgs sv.selectIndels = true sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out) - sv.out = swapExt(combineVCFs.out, ".merged.vcf", ".indels.vcf") - sv.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".sv.out") + sv.out = projectBase + ".indels.vcf" + sv.jobOutputFile = sv.out + ".out" add(sv) val filter = new VariantFiltration with CommandLineGATKArgs @@ -165,7 +172,7 @@ class WholeGenomePipeline extends QScript { filter.filterName :+= "HARD_TO_VALIDATE" filter.filterExpression :+= "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"" filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf") - filter.jobOutputFile = swapExt(sv.jobOutputFile, ".sv.out", ".filter.out") + filter.jobOutputFile = filter.out + ".out" add(filter) val recombine = new CombineVariants with CommandLineGATKArgs @@ -174,28 +181,30 @@ class WholeGenomePipeline extends QScript { recombine.rod_priority_list = "indels,all" recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE recombine.out = swapExt(combineVCFs.out, ".vcf", ".filtered.vcf") - recombine.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".recombine.out") + recombine.jobOutputFile = recombine.out + ".out" add(recombine) val vr = new VariantRecalibrator with CommandLineGATKArgs vr.rodBind :+= RodBind("input", "VCF", recombine.out) - vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "training=true, prior=3.0") - vr.rodBind :+= RodBind("1kg", "VCF", omni, "training=true, prior=3.0") - vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true") + vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0") + vr.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=false,prior=12.0") + vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true,training=false,truth=false,prior=10.0") vr.use_annotation = List("QD", "SB", "HaplotypeScore", "HRun") vr.trustAllPolymorphic = true - vr.TStranche = List("0.1","0.5","0.75","1.0","1.25","1.35","1.5","1.7","1.8","1.9","2.0","2.1","2.2","2.3","2.5","3.0","5.0","8.0","10.0") - vr.tranches_file = swapExt(filter.out, ".merged.filtered.vcf", ".tranches") - vr.recal_file = swapExt(filter.out, ".merged.filtered.vcf", "") - vr.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".cr.out") + vr.TStranche = List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") + vr.tranches_file = swapExt(filter.out, ".vcf", ".tranches") + vr.recal_file = swapExt(filter.out, ".vcf", ".recal") + vr.jobOutputFile = vr.recal_file + ".out" add(vr) val ar = new ApplyRecalibration with CommandLineGATKArgs + ar.rodBind :+= RodBind("input", "VCF", recombine.out) ar.tranches_file = vr.tranches_file ar.recal_file = vr.recal_file + ar.ts_filter_level = 99.0 ar.ignore_filter :+= "HARD_TO_VALIDATE" - ar.out = swapExt(filter.out, ".vcf", ".recalibrated.vcf") - ar.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".ar.out") + ar.out = swapExt(recombine.out, ".vcf", ".recalibrated.vcf") + ar.jobOutputFile = ar.out + ".out" add(ar) val stdEval = new VariantEval with CommandLineGATKArgs @@ -205,9 +214,9 @@ class WholeGenomePipeline extends QScript { stdEval.doNotUseAllStandardStratifications = true stdEval.doNotUseAllStandardModules = true stdEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") - stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty") + stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample") stdEval.out = swapExt(ar.out, ".vcf", ".eval") - stdEval.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".eval.out") + stdEval.jobOutputFile = stdEval.out + ".out" add(stdEval) } }