Changed VR tranches from old 0.1-10 to new 100 to 90.

Using hapmap training and truth based on wiki.
Explicitly setting the ts_filter_level even though 99.0 is the default.
Recal file path now ends with with .recal.
Added ar's vcf input.
Omni rod name now omni instead of 1kg.
The VR RodBind tags had spaces in them.
Was passing both the full intervals and the chunk intervals to chunk jobs.
Switched back to chr20 for default since the VR crashes on small intervals sets with "MESSAGE: Matrix is singular."
Log files names based on the file paths + .out.
Added eval statifications by sample based on the Hybrid Selection / Whole Exome pipeline.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5800 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-05-13 14:38:56 +00:00
parent 08c13f3944
commit 95fc6c0a83
1 changed files with 46 additions and 37 deletions

View File

@ -34,8 +34,8 @@ class WholeGenomePipeline extends QScript {
@Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true) @Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true)
var cleanerTmpDir: String = _ var cleanerTmpDir: String = _
@Argument(doc="Flag for running the whole genome (wg), chromosome 20 (chr20) or just the centromere of chromosome 1 (cent1). The default is cent1.", shortName="runType", required=false) @Argument(doc="Flag for running the whole genome (wg) or chromosome 20 (chr20). The default is chr20.", shortName="runType", required=false)
var runType = "cent1" var runType = "chr20"
@Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false) @Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false)
var chunkSize = 3000000 var chunkSize = 3000000
@ -56,27 +56,32 @@ class WholeGenomePipeline extends QScript {
override def toString = toInterval override def toString = toInterval
} }
val chr1Centromere = new ChrRange("1", 121429168, 121529169)
var runIntervals = List.empty[String] var runIntervals = List.empty[String]
def script() { def script() {
val rangeMap = Map(
"cent1" -> new ChrRange("1", 121429168, 121529168),
"cent16" -> new ChrRange("16", 40844464, 40944464),
"chr20" -> new ChrRange("20", 1, 63025520),
"chr20_100k" -> new ChrRange("20", 100001, 200000))
var ranges = Traversable.empty[ChrRange] var ranges = Traversable.empty[ChrRange]
val chrs = IntervalUtils.getContigSizes(reference) val chrs = IntervalUtils.getContigSizes(reference)
runType match {
case "wg" => runType = runType.toLowerCase
if (runType == "wg") {
// use all chromosomes from 1 to their length // use all chromosomes from 1 to their length
ranges = chrs.map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } ranges = chrs.map { case (chr, len) => new ChrRange(chr, 1, len.longValue) }
runIntervals = Nil runIntervals = Nil
case "chr20" => } else {
// use chromosome 20 from 1 to its length rangeMap.get(runType) match {
ranges = chrs.filterKeys(_ == "20").map { case (chr, len) => new ChrRange(chr, 1, len.longValue) } case Some(range) =>
runIntervals = List(ranges.head.toInterval) ranges = List(range)
case "cent1" => runIntervals = List(range.toInterval)
// use chromosome 1 centromere case None =>
ranges = List(chr1Centromere) throw new RuntimeException("Inalid runType: " + runType + ". Must be one of: " + rangeMap.keys.mkString(", ") + ", or wg")
runIntervals = List(ranges.head.toInterval) }
} }
val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_)) val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_))
@ -93,22 +98,24 @@ class WholeGenomePipeline extends QScript {
while (start <= chrStop) { while (start <= chrStop) {
val stop = (start + chunkSize - 1) min chrStop val stop = (start + chunkSize - 1) min chrStop
var interval = "%s:%d-%d".format(chr, start, stop)
val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber
val tmpBase: String = cleanerTmpDir + "/" + chunkBase val tmpBase: String = cleanerTmpDir + "/" + chunkBase
val chunkInterval = "%s:%d-%d".format(chr, start, stop)
val target = new RealignerTargetCreator with CommandLineGATKArgs val target = new RealignerTargetCreator with CommandLineGATKArgs
target.intervalsString :+= interval target.intervalsString = List(chunkInterval)
target.input_file :+= bamList target.input_file :+= bamList
target.mismatchFraction = 0.0 target.mismatchFraction = 0.0
target.maxIntervalSize = 700 target.maxIntervalSize = 700
target.out = tmpBase + ".target.intervals" target.out = tmpBase + ".target.intervals"
target.jobOutputFile = chunkBase + ".target.out" target.jobOutputFile = chunkBase + ".target.intervals.out"
target.isIntermediate = true target.isIntermediate = true
add(target) add(target)
val clean = new IndelRealigner with CommandLineGATKArgs val clean = new IndelRealigner with CommandLineGATKArgs
clean.intervalsString :+= interval clean.intervalsString = List(chunkInterval)
clean.input_file :+= bamList clean.input_file :+= bamList
clean.targetIntervals = target.out clean.targetIntervals = target.out
clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
@ -116,13 +123,13 @@ class WholeGenomePipeline extends QScript {
clean.simplifyBAM = true clean.simplifyBAM = true
clean.bam_compression = 1 clean.bam_compression = 1
clean.out = tmpBase + ".cleaned.bam" clean.out = tmpBase + ".cleaned.bam"
clean.jobOutputFile = chunkBase + ".clean.out" clean.jobOutputFile = chunkBase + ".cleaned.bam.out"
clean.jobPriority = 51 clean.jobPriority = 51
clean.isIntermediate = true clean.isIntermediate = true
add(clean) add(clean)
val call = new UnifiedGenotyper with CommandLineGATKArgs val call = new UnifiedGenotyper with CommandLineGATKArgs
call.intervalsString :+= interval call.intervalsString = List(chunkInterval)
call.input_file :+= clean.out call.input_file :+= clean.out
call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp) call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
call.downsample_to_coverage = 50 call.downsample_to_coverage = 50
@ -133,7 +140,7 @@ class WholeGenomePipeline extends QScript {
call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH
call.GSA_PRODUCTION_ONLY = true call.GSA_PRODUCTION_ONLY = true
call.out = chunkBase + ".vcf" call.out = chunkBase + ".vcf"
call.jobOutputFile = chunkBase + ".call.out" call.jobOutputFile = call.out + ".out"
call.jobPriority = 52 call.jobPriority = 52
add(call) add(call)
@ -149,15 +156,15 @@ class WholeGenomePipeline extends QScript {
combineVCFs.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION combineVCFs.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE
combineVCFs.out = projectBase + ".merged.vcf" combineVCFs.out = projectBase + ".merged.vcf"
combineVCFs.jobOutputFile = projectBase + ".combine.out" combineVCFs.jobOutputFile = combineVCFs.out + ".out"
combineVCFs.assumeIdenticalSamples = true combineVCFs.assumeIdenticalSamples = true
add(combineVCFs) add(combineVCFs)
val sv = new SelectVariants with CommandLineGATKArgs val sv = new SelectVariants with CommandLineGATKArgs
sv.selectIndels = true sv.selectIndels = true
sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out) sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out)
sv.out = swapExt(combineVCFs.out, ".merged.vcf", ".indels.vcf") sv.out = projectBase + ".indels.vcf"
sv.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".sv.out") sv.jobOutputFile = sv.out + ".out"
add(sv) add(sv)
val filter = new VariantFiltration with CommandLineGATKArgs val filter = new VariantFiltration with CommandLineGATKArgs
@ -165,7 +172,7 @@ class WholeGenomePipeline extends QScript {
filter.filterName :+= "HARD_TO_VALIDATE" filter.filterName :+= "HARD_TO_VALIDATE"
filter.filterExpression :+= "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"" filter.filterExpression :+= "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\""
filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf") filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf")
filter.jobOutputFile = swapExt(sv.jobOutputFile, ".sv.out", ".filter.out") filter.jobOutputFile = filter.out + ".out"
add(filter) add(filter)
val recombine = new CombineVariants with CommandLineGATKArgs val recombine = new CombineVariants with CommandLineGATKArgs
@ -174,28 +181,30 @@ class WholeGenomePipeline extends QScript {
recombine.rod_priority_list = "indels,all" recombine.rod_priority_list = "indels,all"
recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE
recombine.out = swapExt(combineVCFs.out, ".vcf", ".filtered.vcf") recombine.out = swapExt(combineVCFs.out, ".vcf", ".filtered.vcf")
recombine.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".recombine.out") recombine.jobOutputFile = recombine.out + ".out"
add(recombine) add(recombine)
val vr = new VariantRecalibrator with CommandLineGATKArgs val vr = new VariantRecalibrator with CommandLineGATKArgs
vr.rodBind :+= RodBind("input", "VCF", recombine.out) vr.rodBind :+= RodBind("input", "VCF", recombine.out)
vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "training=true, prior=3.0") vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0")
vr.rodBind :+= RodBind("1kg", "VCF", omni, "training=true, prior=3.0") vr.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=false,prior=12.0")
vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true") vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true,training=false,truth=false,prior=10.0")
vr.use_annotation = List("QD", "SB", "HaplotypeScore", "HRun") vr.use_annotation = List("QD", "SB", "HaplotypeScore", "HRun")
vr.trustAllPolymorphic = true vr.trustAllPolymorphic = true
vr.TStranche = List("0.1","0.5","0.75","1.0","1.25","1.35","1.5","1.7","1.8","1.9","2.0","2.1","2.2","2.3","2.5","3.0","5.0","8.0","10.0") vr.TStranche = List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
vr.tranches_file = swapExt(filter.out, ".merged.filtered.vcf", ".tranches") vr.tranches_file = swapExt(filter.out, ".vcf", ".tranches")
vr.recal_file = swapExt(filter.out, ".merged.filtered.vcf", "") vr.recal_file = swapExt(filter.out, ".vcf", ".recal")
vr.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".cr.out") vr.jobOutputFile = vr.recal_file + ".out"
add(vr) add(vr)
val ar = new ApplyRecalibration with CommandLineGATKArgs val ar = new ApplyRecalibration with CommandLineGATKArgs
ar.rodBind :+= RodBind("input", "VCF", recombine.out)
ar.tranches_file = vr.tranches_file ar.tranches_file = vr.tranches_file
ar.recal_file = vr.recal_file ar.recal_file = vr.recal_file
ar.ts_filter_level = 99.0
ar.ignore_filter :+= "HARD_TO_VALIDATE" ar.ignore_filter :+= "HARD_TO_VALIDATE"
ar.out = swapExt(filter.out, ".vcf", ".recalibrated.vcf") ar.out = swapExt(recombine.out, ".vcf", ".recalibrated.vcf")
ar.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".ar.out") ar.jobOutputFile = ar.out + ".out"
add(ar) add(ar)
val stdEval = new VariantEval with CommandLineGATKArgs val stdEval = new VariantEval with CommandLineGATKArgs
@ -205,9 +214,9 @@ class WholeGenomePipeline extends QScript {
stdEval.doNotUseAllStandardStratifications = true stdEval.doNotUseAllStandardStratifications = true
stdEval.doNotUseAllStandardModules = true stdEval.doNotUseAllStandardModules = true
stdEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") stdEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty") stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample")
stdEval.out = swapExt(ar.out, ".vcf", ".eval") stdEval.out = swapExt(ar.out, ".vcf", ".eval")
stdEval.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".eval.out") stdEval.jobOutputFile = stdEval.out + ".out"
add(stdEval) add(stdEval)
} }
} }