243 lines
10 KiB
Scala
243 lines
10 KiB
Scala
/*
|
|
* Copyright (c) 2011, The Broad Institute
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person
|
|
* obtaining a copy of this software and associated documentation
|
|
* files (the "Software"), to deal in the Software without
|
|
* restriction, including without limitation the rights to use,
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following
|
|
* conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
import org.broadinstitute.sting.queue.QScript
|
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
|
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
|
|
|
class WholeGenomePipeline extends QScript {
|
|
@Input(doc="Bam file list", shortName = "I", required=true)
|
|
var bamList: File = _
|
|
|
|
@Input(doc="Exclude intervals list", shortName = "XL", required=false)
|
|
var excludeIntervals: List[File] = Nil
|
|
|
|
@Argument(doc="path to tmp space for storing intermediate bam files", shortName="cleanerTmpDir", required=true)
|
|
var cleanerTmpDir: String = _
|
|
|
|
@Argument(doc="Flag for running the whole genome (wg) or chromosome 20 (chr20). The default is chr20.", shortName="runType", required=false)
|
|
var runType = "chr20"
|
|
|
|
@Argument(doc="Chunk size. Defaults to 3,000,000", shortName="chunk", required=false)
|
|
var chunkSize = 3000000
|
|
|
|
val resources = "/humgen/gsa-pipeline/resources/5777/b37/"
|
|
val reference = resources + "human_g1k_v37.fasta"
|
|
val dbsnp = resources + "dbsnp_132.b37.vcf"
|
|
val indels = resources + "1000G_indels_for_realignment.b37.vcf"
|
|
val omni = resources + "1000G_omni2.5.b37.sites.vcf"
|
|
val hapmap = resources + "hapmap_3.3.b37.sites.vcf"
|
|
|
|
trait CommandLineGATKArgs extends CommandLineGATK {
|
|
this.reference_sequence = reference
|
|
this.intervalsString = runIntervals
|
|
this.memoryLimit = 2
|
|
}
|
|
|
|
case class Interval(chr: String, start: Long, stop: Long) {
|
|
override def toString = "%s:%d-%d".format(chr, start, stop)
|
|
}
|
|
|
|
var runIntervals = List.empty[String]
|
|
|
|
def script() {
|
|
|
|
var intervals = Traversable.empty[Interval]
|
|
|
|
runType = runType.toLowerCase
|
|
if (runType == "wg") {
|
|
val contigs = (1 to 22).map(_.toString) ++ List("X", "Y", "MT")
|
|
val sizes = IntervalUtils.getContigSizes(reference)
|
|
intervals = contigs.map(chr => new Interval(chr, 1, sizes.get(chr).longValue))
|
|
runIntervals = Nil
|
|
} else {
|
|
val locs = Map(
|
|
"cent1" -> new Interval("1", 121429168, 121529168),
|
|
"cent16" -> new Interval("16", 40844464, 40944464),
|
|
"chr20" -> new Interval("20", 1, 63025520),
|
|
"chr20_100k" -> new Interval("20", 100001, 200000))
|
|
|
|
locs.get(runType) match {
|
|
case Some(range) =>
|
|
intervals = List(range)
|
|
runIntervals = List(range.toString)
|
|
case None =>
|
|
throw new RuntimeException("Invalid runType: " + runType + ". Must be one of: " + locs.keys.mkString(", ") + ", or wg")
|
|
}
|
|
}
|
|
|
|
val project = Array(".bams.list", ".bam.list", ".list").foldLeft(bamList.getName)(_.stripSuffix(_))
|
|
val projectBase = project + "." + runType
|
|
|
|
var chunkVcfs = List.empty[File]
|
|
for (interval <- intervals) {
|
|
val chr = interval.chr
|
|
val chrStart = interval.start
|
|
val chrStop = interval.stop
|
|
|
|
var start = chrStart
|
|
var chunkNumber = 1
|
|
|
|
while (start <= chrStop) {
|
|
val stop = (start + chunkSize - 1) min chrStop
|
|
|
|
val chunkBase: String = "chunks/" + project + "." + runType + "_chunk_" + chr + "_" + chunkNumber
|
|
val tmpBase: String = cleanerTmpDir + "/" + chunkBase
|
|
|
|
val chunkInterval = List("%s:%d-%d".format(chr, start, stop))
|
|
|
|
val target = new RealignerTargetCreator with CommandLineGATKArgs
|
|
target.input_file :+= bamList
|
|
target.intervalsString = chunkInterval
|
|
target.excludeIntervals = excludeIntervals
|
|
target.mismatchFraction = 0.0
|
|
target.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
|
|
target.rodBind :+= RodBind("indels", "VCF", indels)
|
|
target.out = tmpBase + ".target.intervals"
|
|
target.jobOutputFile = chunkBase + ".target.intervals.out"
|
|
target.isIntermediate = true
|
|
add(target)
|
|
|
|
val clean = new IndelRealigner with CommandLineGATKArgs
|
|
clean.input_file :+= bamList
|
|
clean.intervalsString = chunkInterval
|
|
clean.excludeIntervals = excludeIntervals
|
|
clean.targetIntervals = target.out
|
|
clean.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
|
|
clean.rodBind :+= RodBind("indels", "VCF", indels)
|
|
clean.doNotUseSW = true
|
|
clean.simplifyBAM = true
|
|
clean.bam_compression = 1
|
|
clean.out = tmpBase + ".cleaned.bam"
|
|
clean.jobOutputFile = chunkBase + ".cleaned.bam.out"
|
|
clean.isIntermediate = true
|
|
add(clean)
|
|
|
|
val call = new UnifiedGenotyper with CommandLineGATKArgs
|
|
call.input_file :+= clean.out
|
|
call.intervalsString = chunkInterval
|
|
call.excludeIntervals = excludeIntervals
|
|
call.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
|
|
call.downsample_to_coverage = 50
|
|
call.standard_min_confidence_threshold_for_calling = 4.0
|
|
call.standard_min_confidence_threshold_for_emitting = 4.0
|
|
call.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY
|
|
call.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH
|
|
call.GSA_PRODUCTION_ONLY = true
|
|
call.out = chunkBase + ".vcf"
|
|
call.jobOutputFile = call.out + ".out"
|
|
add(call)
|
|
|
|
chunkVcfs :+= call.out
|
|
start += chunkSize
|
|
chunkNumber += 1
|
|
}
|
|
}
|
|
|
|
val combineChunks = new CombineVariants with CommandLineGATKArgs
|
|
combineChunks.rodBind = chunkVcfs.zipWithIndex.map { case (vcf, index) => RodBind("input"+index, "VCF", vcf) }
|
|
combineChunks.rod_priority_list = chunkVcfs.zipWithIndex.map { case (vcf, index) => "input"+index }.mkString(",")
|
|
combineChunks.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
|
|
combineChunks.assumeIdenticalSamples = true
|
|
combineChunks.out = projectBase + ".unfiltered.vcf"
|
|
combineChunks.jobOutputFile = combineChunks.out + ".out"
|
|
add(combineChunks)
|
|
|
|
val selectSNPs = new SelectVariants with CommandLineGATKArgs
|
|
selectSNPs.selectSNPs = true
|
|
selectSNPs.rodBind :+= RodBind("variant", "VCF", combineChunks.out)
|
|
selectSNPs.out = projectBase + ".snps.unrecalibrated.vcf"
|
|
selectSNPs.jobOutputFile = selectSNPs.out + ".out"
|
|
add(selectSNPs)
|
|
|
|
val selectIndels = new SelectVariants with CommandLineGATKArgs
|
|
selectIndels.selectIndels = true
|
|
selectIndels.rodBind :+= RodBind("variant", "VCF", combineChunks.out)
|
|
selectIndels.out = projectBase + ".indels.unfiltered.vcf"
|
|
selectIndels.jobOutputFile = selectIndels.out + ".out"
|
|
add(selectIndels)
|
|
|
|
val filterIndels = new VariantFiltration with CommandLineGATKArgs
|
|
filterIndels.variantVCF = selectIndels.out
|
|
filterIndels.filterName = List("Indel_QUAL", "Indel_SB", "Indel_QD", "Indel_HRun", "Indel_HaplotypeScore")
|
|
filterIndels.filterExpression = List("\"QUAL<30.0\"", "\"SB>-1.0\"", "\"QD<2.0\"", "\"HRun>15\"", "\"HaplotypeScore>20.0\"")
|
|
filterIndels.out = projectBase + ".indels.filtered.vcf"
|
|
filterIndels.jobOutputFile = filterIndels.out + ".out"
|
|
add(filterIndels)
|
|
|
|
val combineSNPsIndels = new CombineVariants with CommandLineGATKArgs
|
|
combineSNPsIndels.rodBind :+= RodBind("indels", "VCF", selectIndels.out)
|
|
combineSNPsIndels.rodBind :+= RodBind("snps", "VCF", selectSNPs.out)
|
|
combineSNPsIndels.rod_priority_list = "indels,snps"
|
|
combineSNPsIndels.filteredRecordsMergeType = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
|
|
combineSNPsIndels.assumeIdenticalSamples = true
|
|
combineSNPsIndels.out = projectBase + ".unrecalibrated.vcf"
|
|
combineSNPsIndels.jobOutputFile = combineSNPsIndels.out + ".out"
|
|
add(combineSNPsIndels)
|
|
|
|
val vr = new VariantRecalibrator with CommandLineGATKArgs
|
|
vr.rodBind :+= RodBind("input", "VCF", combineSNPsIndels.out)
|
|
vr.rodBind :+= RodBind("hapmap", "VCF", hapmap, "known=false,training=true,truth=true,prior=15.0")
|
|
vr.rodBind :+= RodBind("omni", "VCF", omni, "known=false,training=true,truth=false,prior=12.0")
|
|
vr.rodBind :+= RodBind("dbsnp", "VCF", dbsnp, "known=true,training=false,truth=false,prior=8.0")
|
|
vr.trustAllPolymorphic = true
|
|
vr.use_annotation = List("QD", "HaplotypeScore", "HRun", "MQRankSum", "ReadPosRankSum")
|
|
vr.TStranche = List(
|
|
"100.0", "99.9", "99.5", "99.3",
|
|
"99.0", "98.9", "98.8",
|
|
"98.5", "98.4", "98.3", "98.2", "98.1",
|
|
"98.0",
|
|
"97.0",
|
|
"95.0",
|
|
"90.0")
|
|
vr.tranches_file = projectBase + ".tranches"
|
|
vr.recal_file = projectBase + ".recal"
|
|
vr.jobOutputFile = vr.recal_file + ".out"
|
|
add(vr)
|
|
|
|
for (tranche <- vr.TStranche) {
|
|
val ar = new ApplyRecalibration with CommandLineGATKArgs
|
|
ar.rodBind :+= RodBind("input", "VCF", combineSNPsIndels.out)
|
|
ar.tranches_file = vr.tranches_file
|
|
ar.recal_file = vr.recal_file
|
|
ar.ts_filter_level = tranche.toDouble
|
|
ar.out = projectBase + ".recalibrated." + tranche + ".vcf"
|
|
ar.jobOutputFile = ar.out + ".out"
|
|
add(ar)
|
|
|
|
val eval = new VariantEval with CommandLineGATKArgs
|
|
eval.tranchesFile = vr.tranches_file
|
|
eval.rodBind :+= RodBind("eval", "VCF", ar.out)
|
|
eval.rodBind :+= RodBind("dbsnp", "VCF", dbsnp)
|
|
eval.doNotUseAllStandardStratifications = true
|
|
eval.doNotUseAllStandardModules = true
|
|
eval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
|
|
eval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "FunctionalClass", "Sample")
|
|
eval.out = swapExt(ar.out, ".vcf", ".eval")
|
|
eval.jobOutputFile = eval.out + ".out"
|
|
add(eval)
|
|
}
|
|
}
|
|
}
|