220 lines
11 KiB
Plaintext
Executable File
220 lines
11 KiB
Plaintext
Executable File
import org.broadinstitute.sting.commandline.ArgumentSource
|
|
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
|
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
|
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
|
|
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
|
|
import org.broadinstitute.sting.queue.QScript
|
|
import collection.JavaConversions._
|
|
import org.broadinstitute.sting.utils.text.XReadLines
|
|
|
|
class FullCallingPipeline extends QScript {
|
|
qscript =>
|
|
|
|
@Input(doc="path to GATK jar", shortName="G")
|
|
var gatkJar: File = _
|
|
|
|
@Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels). By default is set to 20.", shortName="varScatter", required=false)
|
|
var num_var_scatter_jobs = 20
|
|
|
|
@Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false)
|
|
var expandIntervals = 50
|
|
|
|
private var pipeline: Pipeline = _
|
|
|
|
private final val picardFixMatesClass = "net.sf.picard.sam.FixMateInformation"
|
|
|
|
|
|
val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList
|
|
val DBSNP : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
|
val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
|
|
val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list")
|
|
val BASE : String = "exon_vqsr"
|
|
|
|
val handFiltered : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf")
|
|
|
|
|
|
trait CommandLineGATKArgs extends CommandLineGATK {
|
|
this.intervals :+= INTS
|
|
this.jarFile = qscript.gatkJar
|
|
this.reference_sequence = REF
|
|
this.memoryLimit = Some(4)
|
|
}
|
|
// ------------ SETUP THE PIPELINE ----------- //
|
|
|
|
|
|
def script = {
|
|
|
|
endToEnd(BASE,"cleaned")
|
|
}
|
|
|
|
def endToEnd(base: String, bamType: String) = {
|
|
val bamFiles = BAM_FILES
|
|
|
|
val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS")
|
|
ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")
|
|
|
|
if (qscript.expandIntervals > 0) {
|
|
//add(ei)
|
|
}
|
|
|
|
trait ExpandedIntervals extends CommandLineGATK {
|
|
if (qscript.expandIntervals > 0) {
|
|
this.intervals :+= ei.outList
|
|
}
|
|
}
|
|
|
|
// Call indels
|
|
val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
|
|
indels.analysisName = base + "_indels"
|
|
indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out")
|
|
indels.memoryLimit = Some(6)
|
|
indels.downsample_to_coverage = Some(600)
|
|
indels.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL
|
|
indels.input_file = bamFiles
|
|
indels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
|
|
indels.out = new File("IndelCalls", base+".indels.vcf")
|
|
|
|
indels.scatterCount = qscript.num_var_scatter_jobs
|
|
indels.setupScatterFunction = {
|
|
case scatter: ScatterFunction =>
|
|
scatter.commandDirectory = new File("IndelCalls/ScatterGather")
|
|
scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
|
|
}
|
|
indels.setupCloneFunction = {
|
|
case (clone: CloneFunction, index: Int) =>
|
|
clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
|
|
clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
|
|
}
|
|
indels.setupGatherFunction = {
|
|
case (gather: GatherFunction, source: ArgumentSource) =>
|
|
gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
|
|
gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
|
|
}
|
|
|
|
// Filter indels
|
|
val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
|
|
filteredIndels.analysisName = base + "_filteredIndels"
|
|
filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out")
|
|
filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter")
|
|
filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"")
|
|
filteredIndels.variantVCF = indels.out
|
|
filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf")
|
|
|
|
// Call snps
|
|
val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
|
|
snps.analysisName = base+"_snps"
|
|
snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
|
|
snps.memoryLimit = Some(6)
|
|
snps.downsample_to_coverage = Some(600)
|
|
snps.input_file = bamFiles
|
|
snps.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
|
|
snps.out = new File("SnpCalls", base+".snps.vcf")
|
|
|
|
snps.scatterCount = qscript.num_var_scatter_jobs
|
|
snps.setupScatterFunction = {
|
|
case scatter: ScatterFunction =>
|
|
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
|
|
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
|
|
}
|
|
snps.setupCloneFunction = {
|
|
case (clone: CloneFunction, index: Int) =>
|
|
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
|
|
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
|
|
}
|
|
snps.setupGatherFunction = {
|
|
case (gather: GatherFunction, source: ArgumentSource) =>
|
|
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
|
|
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
|
|
}
|
|
|
|
// Filter snps at indels
|
|
val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
|
|
filteredSNPs.analysisName = base+"_filteredSNPs"
|
|
filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out")
|
|
filteredSNPs.clusterWindowSize = Some(10)
|
|
filteredSNPs.clusterSize = Some(3)
|
|
filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out)
|
|
filteredSNPs.variantVCF = snps.out
|
|
filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf")
|
|
|
|
// Mako de Clusters
|
|
val cr = new ContrastiveRecalibrator with CommandLineGATKArgs with ExpandedIntervals
|
|
cr.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
|
|
cr.rodBind :+= new RodBind("dbsnp","vcf",DBSNP,"known=true,training=false,truth=false,prior=8.0")
|
|
cr.rodBind :+= new RodBind("hapmap","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"),"known=false,training=true,truth=true,prior=15.0")
|
|
cr.rodBind :+= new RodBind("omni","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"),"known=false,training=true,truth=true,prior=12.0")
|
|
cr.allPoly = true
|
|
cr.use_annotation ++= List("HaplotypeScore","SB","QD","HRun")
|
|
cr.tranches_file = new File(base+".tranche")
|
|
cr.recal_file = new File(base+".contrastive.recal.table")
|
|
cr.tranche ++= List("99.9","99.5","99.25","98.0","97.75","97.65","97.5","97.3","97.2","97.1","98.0","97.5","97.0","96.75","96.5","96.0","95.5","95.0","94.75","94.5","94.25","94.0",
|
|
"93.75","93.5","93.25","93.0","92.75","92.5","92.25","92.0","91.0","90.0")
|
|
cr.analysisName = base+"_ContrastiveRecalibrator"
|
|
cr.memoryLimit = Some(32)
|
|
cr.num_threads = Some(6)
|
|
|
|
|
|
// Apply the Recalibration
|
|
val ar = new ApplyRecalibration with CommandLineGATKArgs with ExpandedIntervals
|
|
ar.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
|
|
ar.tranches_file = cr.tranches_file
|
|
ar.recal_file = cr.recal_file
|
|
ar.ts_filter_level = Some(91.75)
|
|
ar.out = new File(base+"_contrastive_recal.91.75.vcf")
|
|
ar.memoryLimit = Some(6)
|
|
|
|
// Variant eval the standard region
|
|
val stdEval = new VariantEval with CommandLineGATKArgs
|
|
stdEval.analysisName = base+"_VariantEval"
|
|
stdEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.std.out")
|
|
stdEval.noST = true
|
|
stdEval.noEV = true
|
|
stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
|
|
stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
|
|
stdEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
|
|
stdEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
|
|
stdEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
|
|
stdEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
|
|
stdEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
|
|
stdEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
|
|
stdEval.out = swapExt(ar.out, ".vcf", ".eval")
|
|
stdEval.num_threads = Some(6)
|
|
|
|
// Variant eval the flanking region
|
|
val flanksEval = new VariantEval with CommandLineGATKArgs
|
|
flanksEval.analysisName = base+"_VariantEval"
|
|
flanksEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.flanks.out")
|
|
flanksEval.intervals = List(ei.outList)
|
|
flanksEval.noST = true
|
|
flanksEval.noEV = true
|
|
flanksEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
|
|
flanksEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
|
|
flanksEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
|
|
flanksEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
|
|
flanksEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
|
|
flanksEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
|
|
flanksEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
|
|
flanksEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
|
|
flanksEval.out = swapExt(ar.out, ".vcf", ".flanks.eval")
|
|
flanksEval.num_threads = Some(6)
|
|
|
|
// Make the bam list
|
|
val listOfBams = new File("Resources", base +".BamFiles.list")
|
|
|
|
val writeBamList = new ListWriterFunction
|
|
writeBamList.analysisName = base + "_BamList"
|
|
writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out")
|
|
writeBamList.inputFiles = bamFiles
|
|
writeBamList.listFile = listOfBams
|
|
|
|
//add(indels, filteredIndels, snps, filteredSNPs, stdEval, writeBamList,cr,ar)
|
|
add(ar,stdEval)
|
|
|
|
if (qscript.expandIntervals > 0) {
|
|
add(flanksEval)
|
|
}
|
|
|
|
}
|
|
}
|