This is a draft of the improved and prettified pipeline. It may not yet compile, but Kiran is taking over adding a few more things as I finish up other tasks.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5248 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d185c2961f
commit
d2efea6003
|
|
@ -19,7 +19,6 @@ class FullCallingPipeline extends QScript {
|
|||
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
|
||||
var trigger: File = _
|
||||
|
||||
// TODO: Fix command lines that pass -refseqTable
|
||||
@Input(doc="path to refseqTable (for GenomicAnnotator) if not present in the YAML", shortName="refseqTable", required=false)
|
||||
var refseqTable: File = _
|
||||
|
||||
|
|
@ -35,11 +34,8 @@ class FullCallingPipeline extends QScript {
|
|||
@Input(doc="level of parallelism for IndelRealigner. By default is set to 1.", shortName="cleanerScatter", required=false)
|
||||
var num_cleaner_scatter_jobs = 1
|
||||
|
||||
@Input(doc="level of parallelism for UnifiedGenotyper. By default is set to 20.", shortName="snpScatter", required=false)
|
||||
var num_snp_scatter_jobs = 20
|
||||
|
||||
//@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
|
||||
//var num_indel_scatter_jobs = 5
|
||||
@Input(doc="level of parallelism for UnifiedGenotyper (both for Snps and Indels). By default is set to 20.", shortName="varScatter", required=false)
|
||||
var num_var_scatter_jobs = 20
|
||||
|
||||
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
|
||||
var skip_cleaning = false
|
||||
|
|
@ -47,12 +43,6 @@ class FullCallingPipeline extends QScript {
|
|||
@Input(doc="ADPR script", shortName ="tearScript", required=false)
|
||||
var tearScript: File = _
|
||||
|
||||
//@Input(doc="Sequencing maching name (for use by adpr)")
|
||||
//var machine: String = _
|
||||
|
||||
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
|
||||
//var protocol: String = _
|
||||
|
||||
// TODO: Fix command lines that pass -bigMemQueue
|
||||
@Argument(doc="Unused", shortName="bigMemQueue", required=false)
|
||||
var big_mem_queue: String = _
|
||||
|
|
@ -79,20 +69,17 @@ class FullCallingPipeline extends QScript {
|
|||
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
|
||||
|
||||
val projectBase: String = qscript.pipeline.getProject.getName
|
||||
// TODO: Fix command lines that pass -refseqTable
|
||||
|
||||
if (qscript.refseqTable != null)
|
||||
qscript.pipeline.getProject.setRefseqTable(qscript.refseqTable)
|
||||
if (qscript.skip_cleaning) {
|
||||
//endToEnd(projectBase + ".uncleaned", "recalibrated", adprRscript, seq, expKind)
|
||||
|
||||
|
||||
endToEnd(projectBase + ".uncleaned", "recalibrated")
|
||||
} else {
|
||||
|
||||
// there are commands that use all the bam files
|
||||
val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
|
||||
//val adprRScript = qscript.adprScript
|
||||
//val seq = qscript.machine
|
||||
//val expKind = qscript.protocol
|
||||
/
|
||||
|
||||
for ( sample <- recalibratedSamples ) {
|
||||
val sampleId = sample.getId
|
||||
|
|
@ -129,7 +116,6 @@ class FullCallingPipeline extends QScript {
|
|||
// if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates
|
||||
if (realigner.scatterCount > 1) {
|
||||
realigner.out = cleaned_bam
|
||||
// While gathering run fix mates.
|
||||
realigner.setupScatterFunction = {
|
||||
case scatter: ScatterFunction =>
|
||||
scatter.commandDirectory = new File("CleanedBams/IntermediateFiles/%s/ScatterGather".format(sampleId))
|
||||
|
|
@ -146,7 +132,6 @@ class FullCallingPipeline extends QScript {
|
|||
gather.jobOutputFile = new File(".queue/logs/Cleaning/%s/FixMates.out".format(sampleId))
|
||||
gather.memoryLimit = Some(6)
|
||||
gather.jarFile = qscript.picardFixMatesJar
|
||||
// Don't pass this AS=true to fix mates!
|
||||
gather.assumeSorted = None
|
||||
case (gather: GatherFunction, source: ArgumentSource) =>
|
||||
gather.commandDirectory = new File("CleanedBams/IntermediateFiles/%s/ScatterGather/Gather_%s".format(sampleId, source.field.getName))
|
||||
|
|
@ -160,7 +145,6 @@ class FullCallingPipeline extends QScript {
|
|||
|
||||
// Explicitly run fix mates if the function won't be scattered.
|
||||
val fixMates = new PicardBamJarFunction {
|
||||
// Declare inputs/outputs for dependency tracking.
|
||||
@Input(doc="unfixed bam") var unfixed: File = _
|
||||
@Output(doc="fixed bam") var fixed: File = _
|
||||
def inputBams = List(unfixed)
|
||||
|
|
@ -209,7 +193,7 @@ class FullCallingPipeline extends QScript {
|
|||
snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
|
||||
snps.memoryLimit = Some(6)
|
||||
|
||||
snps.scatterCount = qscript.num_snp_scatter_jobs
|
||||
snps.scatterCount = qscript.num_var_scatter_jobs
|
||||
snps.setupScatterFunction = {
|
||||
case scatter: ScatterFunction =>
|
||||
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
|
||||
|
|
@ -240,7 +224,7 @@ class FullCallingPipeline extends QScript {
|
|||
|
||||
|
||||
|
||||
indels.scatterCount = qscript.num_snp_scatter_jobs
|
||||
indels.scatterCount = qscript.num_var_scatter_jobs
|
||||
indels.setupScatterFunction = {
|
||||
case scatter: ScatterFunction =>
|
||||
scatter.commandDirectory = new File("IndelCalls/ScatterGather")
|
||||
|
|
@ -267,40 +251,41 @@ class FullCallingPipeline extends QScript {
|
|||
annotated.rodToIntervalTrackName = "variant"
|
||||
annotated.analysisName = base+"_GenomicAnnotator"
|
||||
|
||||
// 2.a filter on cluster and near indels
|
||||
val masker = new VariantFiltration with CommandLineGATKArgs
|
||||
masker.jobOutputFile = new File(".queue/logs/SNPCalling/Masker.out")
|
||||
masker.variantVCF = annotated.out
|
||||
masker.rodBind :+= RodBind("mask", "VCF", indels.out)
|
||||
masker.maskName = "NearIndel"
|
||||
masker.clusterWindowSize = Some(10)
|
||||
masker.clusterSize = Some(3)
|
||||
masker.out = swapExt("SnpCalls",annotated.out,".vcf",".indel.masked.vcf")
|
||||
masker.analysisName = base+"_Cluster_and_Indel_filter"
|
||||
|
||||
// 2.b hand filter with standard filter
|
||||
// 2. hand filter with standard filter; mask indels; filter snp clusters
|
||||
val handFilter = new VariantFiltration with CommandLineGATKArgs
|
||||
handFilter.jobOutputFile = new File(".queue/logs/SNPCalling/HandFilter.out")
|
||||
handFilter.variantVCF = masker.out
|
||||
handFilter.variantVCF = annotated.out
|
||||
handFilter.rodBind :+= RodBind("mask", "VCF", indels.out)
|
||||
//handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
|
||||
//handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","\"QD<5.0\"","\"HRun>=4\"")
|
||||
handFilter.maskname = "IndelSite"
|
||||
handFilter.clusterWindowSize = Some(10)
|
||||
handfilter.clusterSize = Some(3)
|
||||
handFilter.filterName ++= List("StrandBias","QualByDepth","HomopolymerRun")
|
||||
handFilter.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"")
|
||||
handFilter.out = swapExt("SnpCalls",annotated.out,".vcf",".handfiltered.vcf")
|
||||
handFilter.analysisName = base+"_HandFilter"
|
||||
|
||||
|
||||
// 4. Variant eval the cut and the hand-filtered vcf files
|
||||
val eval = new VariantEval with CommandLineGATKArgs
|
||||
eval.jobOutputFile = new File(".queue/logs/SNPCalling/VariantEval.out")
|
||||
// eval.rodBind :+= RodBind("evalOptimized", "VCF", cut.out)
|
||||
eval.rodBind :+= RodBind("eval", "VCF", handFilter.out)
|
||||
eval.evalModule ++= List("FunctionalClassBySample", "SimpleMetricsBySample", "CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
|
||||
eval.reportLocation = new File("SnpCalls", base+".eval")
|
||||
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
|
||||
eval.analysisName = base+"_VariantEval"
|
||||
eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
|
||||
// 3. Variant eval the cut and the hand-filtered vcf files
|
||||
val smallEval = new VariantEval with CommandLineGATKArgs
|
||||
smallEval.jobOutputFile = new File(".queue/logs/SNPCalling/VariantEval.out")
|
||||
smallEval.rodBind :+= RodBind("eval", "VCF", handFilter.out)
|
||||
smallEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
|
||||
smallEval.stratificationModule ++= List("EvalRod", "Novelty")
|
||||
smallEval.out = new File("SnpCalls", base+".gatkreport")
|
||||
smallEval.analysisName = base+"_VariantEval"
|
||||
smallEval.noST = true
|
||||
smallEval.noEV = true
|
||||
smallEval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
|
||||
|
||||
// 4. Combine indels and Snps into one VCF
|
||||
val combineAll = new CombineVariants with CommandLineGATKArgs
|
||||
combineAll.jobOutputFile = new File(".queue/logs/Overall/CombineVariants.out")
|
||||
combineAll.rod_priority_list = List("Indels", "SNPs")
|
||||
combineAll.rodBind :+= RodBind("SNPs", "VCF", handFilter.out)
|
||||
combineAll.rodBind :+= RodBind("Indels", "VCF", indels.out)
|
||||
combineAll.variantMergeOptions = Option(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
|
||||
combineAll.analysisName = base + "_CombineCallsets"
|
||||
combineAll.out = new File(base + "allVariants.vcf")
|
||||
|
||||
// 5. Make the bam list
|
||||
val listOfBams = new File(base +".BamFiles.list")
|
||||
|
|
@ -333,7 +318,7 @@ class FullCallingPipeline extends QScript {
|
|||
adpr.bamlist = listOfBams
|
||||
adpr.yaml = qscript.yamlFile.getAbsoluteFile
|
||||
adpr.script = qscript.tearScript
|
||||
adpr.evalroot = eval.reportLocation
|
||||
adpr.evalroot = eval.output
|
||||
adpr.jobOutputFile = new File(".queue/logs/SNPCalling/adpr.out")
|
||||
adpr.tearsheet = new File("SnpCalls", base + ".tearsheet.pdf")
|
||||
adpr.analysisName = base + "_ADPR"
|
||||
|
|
|
|||
Loading…
Reference in New Issue