Changed VR tranches from old 0.1-10 to new 100 to 90.

Using hapmap training and truth based on wiki.
Explicitly setting the ts_filter_level even though 99.0 is the default.
Recal file path now ends with with .recal.
Added ar's vcf input.
Omni rod name now omni instead of 1kg.
The VR RodBind tags had spaces in them.
Was passing both the full intervals and the chunk intervals to chunk jobs.
Switched back to chr20 for default since the VR crashes on small intervals sets with "MESSAGE: Matrix is singular."
Log files names based on the file paths + .out.
Added eval statifications by sample based on the Hybrid Selection / Whole Exome pipeline.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5800 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-05-13 14:38:56 +00:00
parent 08c13f3944
commit 95fc6c0a83
1 changed files with 46 additions and 37 deletions

View File

@ -34,8 +34,8 @@ class WholeGenomePipeline extends QScript {
@Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true)
var cleanerTmpDir: String = _
@Argument(doc="Flag for running the whole genome (wg), chromosome 20 (chr20) or just the centromere of chromosome 1 (cent1). The default is cent1.", shortName="runType", required=false)
var runType = "cent1"
@Argument(doc="Flag for running the whole genome (wg) or chromosome 20 (chr20). The default is chr20.", shortName="runType", required=false)
var runType = "chr20"
@Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false)
var chunkSize = 3000000
@ -56,27 +56,32 @@ class WholeGenomePipeline extends QScript {
override def toString = toInterval
}
val chr1Centromere = new ChrRange("1", 121429168, 121529169)
var runIntervals = List.empty[String]
def script() {
val rangeMap = Map(
"cent1" -> new ChrRange("1", 121429168, 121529168),
"cent16" -> new ChrRange("16", 40844464, 40944464),
"chr20" -> new ChrRange("20", 1, 63025520),
"chr20_100k" -> new ChrRange("20", 100001, 200000))
var ranges = Traversable.empty[ChrRange]
val chrs = IntervalUtils.getContigSizes(reference)
runType match {
case "wg" =>
runType = runType.toLowerCase
if (runType == "wg") {
// use all chromosomes from 1 to their length
ranges = chrs.map { case (chr, len) => new ChrRange(chr, 1, len.longValue) }
runIntervals = Nil
case "chr20" =>
// use chromosome 20 from 1 to its length
ranges = chrs.filterKeys(_ == "20").map { case (chr, len) => new ChrRange(chr, 1, len.longValue) }
runIntervals = List(ranges.head.toInterval)
case "cent1" =>
// use chromosome 1 centromere
ranges = List(chr1Centromere)
runIntervals = List(ranges.head.toInterval)
} else {
rangeMap.get(runType) match {
case Some(range) =>
ranges = List(range)
runIntervals = List(range.toInterval)
case None =>
throw new RuntimeException("Inalid runType: " + runType + ". Must be one of: " + rangeMap.keys.mkString(", ") + ", or wg")
}
}
val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_))
@ -93,22 +98,24 @@ class WholeGenomePipeline extends QScript {
while (start <= chrStop) {
val stop = (start + chunkSize - 1) min chrStop
var interval = "%s:%d-%d".format(chr, start, stop)
val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber
val tmpBase: String = cleanerTmpDir + "/" + chunkBase
val chunkInterval = "%s:%d-%d".format(chr, start, stop)
val target = new RealignerTargetCreator with CommandLineGATKArgs
target.intervalsString :+= interval
target.intervalsString = List(chunkInterval)
target.input_file :+= bamList
target.mismatchFraction = 0.0
target.maxIntervalSize = 700
target.out = tmpBase + ".target.intervals"
target.jobOutputFile = chunkBase + ".target.out"
target.jobOutputFile = chunkBase + ".target.intervals.out"
target.isIntermediate = true
add(target)
val clean = new IndelRealigner with CommandLineGATKArgs
clean.intervalsString :+= interval
clean.intervalsString = List(chunkInterval)
clean.input_file :+= bamList
clean.targetIntervals = target.out
clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
@ -116,13 +123,13 @@ class WholeGenomePipeline extends QScript {
clean.simplifyBAM = true
clean.bam_compression = 1
clean.out = tmpBase + ".cleaned.bam"
clean.jobOutputFile = chunkBase + ".clean.out"
clean.jobOutputFile = chunkBase + ".cleaned.bam.out"
clean.jobPriority = 51
clean.isIntermediate = true
add(clean)
val call = new UnifiedGenotyper with CommandLineGATKArgs
call.intervalsString :+= interval
call.intervalsString = List(chunkInterval)
call.input_file :+= clean.out
call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
call.downsample_to_coverage = 50
@ -133,7 +140,7 @@ class WholeGenomePipeline extends QScript {
call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH
call.GSA_PRODUCTION_ONLY = true
call.out = chunkBase + ".vcf"
call.jobOutputFile = chunkBase + ".call.out"
call.jobOutputFile = call.out + ".out"
call.jobPriority = 52
add(call)
@ -149,15 +156,15 @@ class WholeGenomePipeline extends QScript {
combineVCFs.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
combineVCFs.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE
combineVCFs.out = projectBase + ".merged.vcf"
combineVCFs.jobOutputFile = projectBase + ".combine.out"
combineVCFs.jobOutputFile = combineVCFs.out + ".out"
combineVCFs.assumeIdenticalSamples = true
add(combineVCFs)
val sv = new SelectVariants with CommandLineGATKArgs
sv.selectIndels = true
sv.rodBind :+= RodBind("variant", "VCF", combineVCFs.out)
sv.out = swapExt(combineVCFs.out, ".merged.vcf", ".indels.vcf")
sv.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".sv.out")
sv.out = projectBase + ".indels.vcf"
sv.jobOutputFile = sv.out + ".out"
add(sv)
val filter = new VariantFiltration with CommandLineGATKArgs
@ -165,7 +172,7 @@ class WholeGenomePipeline extends QScript {
filter.filterName :+= "HARD_TO_VALIDATE"
filter.filterExpression :+= "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\""
filter.out = swapExt(sv.out, ".vcf", ".filtered.vcf")
filter.jobOutputFile = swapExt(sv.jobOutputFile, ".sv.out", ".filter.out")
filter.jobOutputFile = filter.out + ".out"
add(filter)
val recombine = new CombineVariants with CommandLineGATKArgs
@ -174,28 +181,30 @@ class WholeGenomePipeline extends QScript {
recombine.rod_priority_list = "indels,all"
recombine.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.PRIORITIZE
recombine.out = swapExt(combineVCFs.out, ".vcf", ".filtered.vcf")
recombine.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".recombine.out")
recombine.jobOutputFile = recombine.out + ".out"
add(recombine)
val vr = new VariantRecalibrator with CommandLineGATKArgs
vr.rodBind :+= RodBind("input", "VCF", recombine.out)
vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "training=true, prior=3.0")
vr.rodBind :+= RodBind("1kg", "VCF", omni, "training=true, prior=3.0")
vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true")
vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0")
vr.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=false,prior=12.0")
vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true,training=false,truth=false,prior=10.0")
vr.use_annotation = List("QD", "SB", "HaplotypeScore", "HRun")
vr.trustAllPolymorphic = true
vr.TStranche = List("0.1","0.5","0.75","1.0","1.25","1.35","1.5","1.7","1.8","1.9","2.0","2.1","2.2","2.3","2.5","3.0","5.0","8.0","10.0")
vr.tranches_file = swapExt(filter.out, ".merged.filtered.vcf", ".tranches")
vr.recal_file = swapExt(filter.out, ".merged.filtered.vcf", "")
vr.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".cr.out")
vr.TStranche = List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
vr.tranches_file = swapExt(filter.out, ".vcf", ".tranches")
vr.recal_file = swapExt(filter.out, ".vcf", ".recal")
vr.jobOutputFile = vr.recal_file + ".out"
add(vr)
val ar = new ApplyRecalibration with CommandLineGATKArgs
ar.rodBind :+= RodBind("input", "VCF", recombine.out)
ar.tranches_file = vr.tranches_file
ar.recal_file = vr.recal_file
ar.ts_filter_level = 99.0
ar.ignore_filter :+= "HARD_TO_VALIDATE"
ar.out = swapExt(filter.out, ".vcf", ".recalibrated.vcf")
ar.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".ar.out")
ar.out = swapExt(recombine.out, ".vcf", ".recalibrated.vcf")
ar.jobOutputFile = ar.out + ".out"
add(ar)
val stdEval = new VariantEval with CommandLineGATKArgs
@ -205,9 +214,9 @@ class WholeGenomePipeline extends QScript {
stdEval.doNotUseAllStandardStratifications = true
stdEval.doNotUseAllStandardModules = true
stdEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty")
stdEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample")
stdEval.out = swapExt(ar.out, ".vcf", ".eval")
stdEval.jobOutputFile = swapExt(combineVCFs.jobOutputFile, ".combine.out", ".eval.out")
stdEval.jobOutputFile = stdEval.out + ".out"
add(stdEval)
}
}