Includes UG version of indel genotyping rather than IGV2

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5191 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
corin 2011-02-04 20:25:46 +00:00
parent bfc6ef1753
commit cd6ace1b47
1 changed files with 69 additions and 125 deletions

View File

@ -44,7 +44,7 @@ class fullCallingPipeline extends QScript {
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false
@Input(doc="ADPR script", shortName ="tearScript", required=true)
@Input(doc="ADPR script", shortName ="tearScript", required=false)
var tearScript: File = _
//@Input(doc="Sequencing machine name (for use by adpr)")
@ -60,6 +60,8 @@ class fullCallingPipeline extends QScript {
@Argument(doc="Job queue for short run jobs (<1hr)", shortName="shortJobQueue", required=false)
var short_job_queue: String = _
private var pipeline: Pipeline = _
private var dbsnpType: String = _
@ -208,92 +210,68 @@ class fullCallingPipeline extends QScript {
snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.out")
snps.analysisName = base+"_SNP_calls"
snps.input_file = bamFiles
//snps.annotation ++= List("AlleleBalance")
snps.input_file = bamFiles
snps.group :+= "Standard"
snps.out = new File("SnpCalls", base+".vcf")
//snps.standard_min_confidence_threshold_for_emitting = Some(10)
//snps.min_mapping_quality_score = Some(20)
//snps.min_base_quality_score = Some(20)
snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
//snps.annotation :+= "QualByDepthV2"
snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
snps.memoryLimit = Some(6)
//if (qscript.trigger != null) {
// snps.trigger_min_confidence_threshold_for_calling = Some(30)
// snps.rodBind :+= RodBind("trigger", "VCF", qscript.trigger)
// // TODO: triggers need to get a name for comp-ing them if not dbSNP?
// snps.rodBind :+= RodBind( "compTrigger", "VCF", qscript.trigger )
//}
// todo -- add generalized comp inputs
//if ( qscript.comp1KGCEU != null ) {
// snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
//}
snps.scatterCount = qscript.num_snp_scatter_jobs
snps.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
}
snps.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
}
snps.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
}
snps.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
}
snps.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
}
snps.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
}
// indel genotyper does one sample at a time
var indelCallFiles = List.empty[RodBind]
var indelGenotypers = List.empty[IndelGenotyperV2 with CommandLineGATKArgs]
var loopNo = 0
var priority = ""
for ( sample <- samples ) {
val sampleId = sample.getId
val bam = sample.getBamFiles.get(bamType)
val indels = new UnifiedGenotyper with CommandLineGATKArgs
indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.out")
indels.analysisName = base+"_Indel_calls"
indels.input_file = bamFiles
indels.input_file = bamFiles
indels.group :+= "Standard"
indels.out = new File("IndelCalls", base+".vcf")
indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
indels.memoryLimit = Some(6)
indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
var indel = new IndelGenotyperV2 with CommandLineGATKArgs
indel.jobOutputFile = new File(".queue/logs/IndelCalling/%s/IndelGenotyperV2.out".format(sampleId))
indel.window_size = Some(350)
indel.analysisName = "IndelGenotyper_"+sampleId
indel.input_file :+= bam
indel.out = swapExt("IndelCalls/IntermediateFiles/" + sampleId, bam,".bam",".indels.vcf")
indel.downsample_to_coverage = Some(qscript.downsampling_coverage)
indelCallFiles :+= RodBind("v"+loopNo.toString, "VCF", indel.out)
//indel.scatterCount = qscript.num_indel_scatter_jobs
indelGenotypers :+= indel
if ( loopNo == 0 ) {
priority = "v0"
} else {
priority += ",v"+loopNo.toString
}
loopNo += 1
}
val mergeIndels = new CombineVariants with CommandLineGATKArgs
mergeIndels.jobOutputFile = new File(".queue/logs/IndelCalling/CombineVariants.out")
mergeIndels.out = new TaggedFile("IndelCalls/" + qscript.pipeline.getProject.getName+".indels.vcf","vcf")
mergeIndels.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED)
mergeIndels.priority = priority
mergeIndels.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
mergeIndels.rodBind = indelCallFiles
mergeIndels.analysisName = base+"_MergeIndels"
mergeIndels.memoryLimit = Some(4)
indels.scatterCount = qscript.num_snp_scatter_jobs
indels.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("IndelCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
}
indels.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
}
indels.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
}
// 1b. genomically annotate SNPs -- no longer slow
val annotated = new GenomicAnnotator with CommandLineGATKArgs
annotated.jobOutputFile = new File(".queue/logs/SNPCalling/GenomicAnnotator.out")
annotated.rodBind :+= RodBind("variant", "VCF", snps.out)
annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable)
//annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
annotated.out = swapExt("SnpCalls",snps.out,".vcf",".annotated.vcf")
//annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
annotated.rodToIntervalTrackName = "variant"
annotated.analysisName = base+"_GenomicAnnotator"
@ -301,7 +279,7 @@ class fullCallingPipeline extends QScript {
val masker = new VariantFiltration with CommandLineGATKArgs
masker.jobOutputFile = new File(".queue/logs/SNPCalling/Masker.out")
masker.variantVCF = annotated.out
masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out)
masker.rodBind :+= RodBind("mask", "VCF", indels.out)
masker.maskName = "NearIndel"
masker.clusterWindowSize = Some(10)
masker.clusterSize = Some(3)
@ -312,7 +290,7 @@ class fullCallingPipeline extends QScript {
val handFilter = new VariantFiltration with CommandLineGATKArgs
handFilter.jobOutputFile = new File(".queue/logs/SNPCalling/HandFilter.out")
handFilter.variantVCF = masker.out
handFilter.rodBind :+= RodBind("mask", "VCF", mergeIndels.out)
handFilter.rodBind :+= RodBind("mask", "VCF", indels.out)
//handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
//handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","\"QD<5.0\"","\"HRun>=4\"")
handFilter.filterName ++= List("StrandBias","QualByDepth","HomopolymerRun")
@ -320,41 +298,6 @@ class fullCallingPipeline extends QScript {
handFilter.out = swapExt("SnpCalls",annotated.out,".vcf",".handfiltered.vcf")
handFilter.analysisName = base+"_HandFilter"
// 3.i generate gaussian clusters on the masked vcf
// todo -- args for annotations?
// todo -- args for resources (properties file)
// val clusters = new GenerateVariantClusters with CommandLineGATKArgs
// clusters.jobOutputFile = new File(".queue/logs/SNPCalling/Clusters.out")
// clusters.rodBind :+= RodBind("input", "VCF", masker.out)
// clusters.rodBind :+= RodBind("dbsnp", "ROD", qscript.pipeline.getProject.getDbsnpFile)
// val clusters_clusterFile = swapExt("SnpCalls/IntermediateFiles",snps.out,".vcf",".cluster")
// clusters.clusterFile = clusters_clusterFile
// clusters.memoryLimit = Some(4)
// clusters.jobQueue = qscript.big_mem_queue
// clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
// clusters.analysisName = base+"_Cluster"
// 3.ii apply gaussian clusters to the masked vcf
// val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
// recalibrate.jobOutputFile = new File(".queue/logs/SNPCalling/Recalibrator.out")
// recalibrate.clusterFile = clusters.clusterFile
// recalibrate.DBSNP = qscript.pipeline.getProject.getDbsnpFile
// recalibrate.rodBind :+= RodBind("input", "VCF", masker.out)
// recalibrate.out = swapExt("SnpCalls",masker.out,".vcf",".recalibrated.vcf")
// recalibrate.target_titv = qscript.target_titv
// recalibrate.tranches_file = swapExt("SnpCalls/IntermediateFiles", masker.out,".vcf",".recalibrate.tranches")
// recalibrate.analysisName = base+"_VariantRecalibrator"
// 3.iii apply variant cuts to the clusters
// val cut = new ApplyVariantCuts with CommandLineGATKArgs
// cut.jobOutputFile = new File(".queue/logs/SNPCalling/VariantCuts.out")
// cut.rodBind :+= RodBind("input", "VCF", recalibrate.out)
// cut.out = swapExt("SnpCalls",recalibrate.out,".vcf",".tranched.vcf")
//cut.tranches_file = recalibrate.tranches_file
// todo -- fdr inputs, etc
// cut.fdr_filter_level = Some(1)
// cut.analysisName = base+"_ApplyVariantCuts"
// 4. Variant eval the cut and the hand-filtered vcf files
val eval = new VariantEval with CommandLineGATKArgs
@ -371,7 +314,6 @@ class fullCallingPipeline extends QScript {
} else{
eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
}
add(snps)
// 5. Make the bam list
val listOfBams = new File(base +".BamFiles.list")
@ -384,35 +326,37 @@ class fullCallingPipeline extends QScript {
// 6. Run the ADPR and make pretty stuff
// Command-line function that invokes the ADPR R script through `Rscript`
// and declares the tearsheet file as its output.
class rCommand extends CommandLineFunction{
// R script to execute; substituted as the first argument after `Rscript`.
@Input(doc="R script")
var script: File = _
// Pipeline YAML file, passed to the script via `-yaml`.
@Input(doc="pipeline yaml")
var yaml: File = _
// File listing the BAMs, passed to the script via `-bamlist`.
@Input(doc="list of bams")
var bamlist: File =_
// Root location of the eval output, passed to the script via `-evalroot`.
@Input(doc="Eval files root")
var evalroot: File =_
// Destination of the generated tearsheet, passed to the script via `-tearout`.
@Output(doc="tearsheet loc")
var tearsheet: File =_
// Builds the shell command by substituting each File with %s (its string form).
// NOTE(review): the paths are not quoted, so a path containing whitespace would
// presumably be split by the shell -- confirm whether that can occur here.
def commandLine = "Rscript %s -yaml %s -bamlist %s -evalroot %s -tearout %s"
.format(script, yaml, bamlist, evalroot, tearsheet)
}
add(snps, indels, annotated,masker,handFilter,eval,writeBamList)
if (qscript.tearScript != null){
// Command-line function that invokes the ADPR R script through `Rscript`
// and declares the tearsheet file as its output. In this version the class is
// declared inside the `qscript.tearScript != null` guard.
class rCommand extends CommandLineFunction{
// R script to execute; substituted as the first argument after `Rscript`.
@Input(doc="R script")
var script: File = _
// Pipeline YAML file, passed to the script via `-yaml`.
@Input(doc="pipeline yaml")
var yaml: File = _
// File listing the BAMs, passed to the script via `-bamlist`.
@Input(doc="list of bams")
var bamlist: File =_
// Root location of the eval output, passed to the script via `-evalroot`.
@Input(doc="Eval files root")
var evalroot: File =_
// Destination of the generated tearsheet, passed to the script via `-tearout`.
@Output(doc="tearsheet loc")
var tearsheet: File =_
// Builds the shell command by substituting each File with %s (its string form).
// NOTE(review): the paths are not quoted, so a path containing whitespace would
// presumably be split by the shell -- confirm whether that can occur here.
def commandLine = "Rscript %s -yaml %s -bamlist %s -evalroot %s -tearout %s".format(script, yaml, bamlist, evalroot, tearsheet)
}
val adpr = new rCommand
val adpr = new rCommand
adpr.bamlist = listOfBams
adpr.yaml = qscript.yamlFile.getAbsoluteFile
adpr.script = tearScript
adpr.script = qscript.tearScript
adpr.evalroot = eval.reportLocation
adpr.jobOutputFile = new File(".queue/logs/SNPCalling/adpr.out")
adpr.tearsheet = new File("SnpCalls", base + ".tearsheet.pdf")
adpr.analysisName = base + "_ADPR"
for ( igv2 <- indelGenotypers ) {
add(igv2)
add(adpr)
}
add(mergeIndels,annotated,masker,handFilter,eval,writeBamList,adpr)
}
}