Includes UG version of indel genotyping rather than IGV2

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5191 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
corin 2011-02-04 20:25:46 +00:00
parent bfc6ef1753
commit cd6ace1b47
1 changed files with 69 additions and 125 deletions

View File

@ -44,7 +44,7 @@ class fullCallingPipeline extends QScript {
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false) @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false var skip_cleaning = false
@Input(doc="ADPR script", shortName ="tearScript", required=true) @Input(doc="ADPR script", shortName ="tearScript", required=false)
var tearScript: File = _ var tearScript: File = _
//@Input(doc="Sequencing maching name (for use by adpr)") //@Input(doc="Sequencing maching name (for use by adpr)")
@ -60,6 +60,8 @@ class fullCallingPipeline extends QScript {
@Argument(doc="Job queue for short run jobs (<1hr)", shortName="shortJobQueue", required=false) @Argument(doc="Job queue for short run jobs (<1hr)", shortName="shortJobQueue", required=false)
var short_job_queue: String = _ var short_job_queue: String = _
private var pipeline: Pipeline = _ private var pipeline: Pipeline = _
private var dbsnpType: String = _ private var dbsnpType: String = _
@ -208,92 +210,68 @@ class fullCallingPipeline extends QScript {
snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.out") snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.out")
snps.analysisName = base+"_SNP_calls" snps.analysisName = base+"_SNP_calls"
snps.input_file = bamFiles snps.input_file = bamFiles
//snps.annotation ++= List("AlleleBalance")
snps.input_file = bamFiles snps.input_file = bamFiles
snps.group :+= "Standard" snps.group :+= "Standard"
snps.out = new File("SnpCalls", base+".vcf") snps.out = new File("SnpCalls", base+".vcf")
//snps.standard_min_confidence_threshold_for_emitting = Some(10)
//snps.min_mapping_quality_score = Some(20)
//snps.min_base_quality_score = Some(20)
snps.downsample_to_coverage = Some(qscript.downsampling_coverage) snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
//snps.annotation :+= "QualByDepthV2"
snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
snps.memoryLimit = Some(6) snps.memoryLimit = Some(6)
//if (qscript.trigger != null) {
// snps.trigger_min_confidence_threshold_for_calling = Some(30)
// snps.rodBind :+= RodBind("trigger", "VCF", qscript.trigger)
// // TODO: triggers need to get a name for comp-ing them if not dbSNP?
// snps.rodBind :+= RodBind( "compTrigger", "VCF", qscript.trigger )
//}
// todo -- add generalize comp inputs
//if ( qscript.comp1KGCEU != null ) {
// snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
//}
snps.scatterCount = qscript.num_snp_scatter_jobs snps.scatterCount = qscript.num_snp_scatter_jobs
snps.setupScatterFunction = { snps.setupScatterFunction = {
case scatter: ScatterFunction => case scatter: ScatterFunction =>
scatter.commandDirectory = new File("SnpCalls/ScatterGather") scatter.commandDirectory = new File("SnpCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out") scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
} }
snps.setupCloneFunction = { snps.setupCloneFunction = {
case (clone: CloneFunction, index: Int) => case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index)) clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index)) clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
} }
snps.setupGatherFunction = { snps.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) => case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName)) gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
} }
// indel genotyper does one sample at a time val indels = new UnifiedGenotyper with CommandLineGATKArgs
var indelCallFiles = List.empty[RodBind] indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.out")
var indelGenotypers = List.empty[IndelGenotyperV2 with CommandLineGATKArgs] indels.analysisName = base+"_Indel_calls"
var loopNo = 0 indels.input_file = bamFiles
var priority = "" indels.input_file = bamFiles
for ( sample <- samples ) { indels.group :+= "Standard"
val sampleId = sample.getId indels.out = new File("IndelCalls", base+".vcf")
val bam = sample.getBamFiles.get(bamType) indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
indels.memoryLimit = Some(6)
indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
var indel = new IndelGenotyperV2 with CommandLineGATKArgs
indel.jobOutputFile = new File(".queue/logs/IndelCalling/%s/IndelGenotyperV2.out".format(sampleId))
indel.window_size = Some(350)
indel.analysisName = "IndelGenotyper_"+sampleId
indel.input_file :+= bam
indel.out = swapExt("IndelCalls/IntermediateFiles/" + sampleId, bam,".bam",".indels.vcf")
indel.downsample_to_coverage = Some(qscript.downsampling_coverage)
indelCallFiles :+= RodBind("v"+loopNo.toString, "VCF", indel.out)
//indel.scatterCount = qscript.num_indel_scatter_jobs
indelGenotypers :+= indel
if ( loopNo == 0 ) { indels.scatterCount = qscript.num_snp_scatter_jobs
priority = "v0" indels.setupScatterFunction = {
} else { case scatter: ScatterFunction =>
priority += ",v"+loopNo.toString scatter.commandDirectory = new File("IndelCalls/ScatterGather")
} scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
loopNo += 1 }
} indels.setupCloneFunction = {
val mergeIndels = new CombineVariants with CommandLineGATKArgs case (clone: CloneFunction, index: Int) =>
mergeIndels.jobOutputFile = new File(".queue/logs/IndelCalling/CombineVariants.out") clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
mergeIndels.out = new TaggedFile("IndelCalls/" + qscript.pipeline.getProject.getName+".indels.vcf","vcf") clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
mergeIndels.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED) }
mergeIndels.priority = priority indels.setupGatherFunction = {
mergeIndels.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION) case (gather: GatherFunction, source: ArgumentSource) =>
mergeIndels.rodBind = indelCallFiles gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
mergeIndels.analysisName = base+"_MergeIndels" gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
mergeIndels.memoryLimit = Some(4) }
// 1b. genomically annotate SNPs -- no longer slow // 1b. genomically annotate SNPs -- no longer slow
val annotated = new GenomicAnnotator with CommandLineGATKArgs val annotated = new GenomicAnnotator with CommandLineGATKArgs
annotated.jobOutputFile = new File(".queue/logs/SNPCalling/GenomicAnnotator.out") annotated.jobOutputFile = new File(".queue/logs/SNPCalling/GenomicAnnotator.out")
annotated.rodBind :+= RodBind("variant", "VCF", snps.out) annotated.rodBind :+= RodBind("variant", "VCF", snps.out)
annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable) annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable)
//annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
annotated.out = swapExt("SnpCalls",snps.out,".vcf",".annotated.vcf") annotated.out = swapExt("SnpCalls",snps.out,".vcf",".annotated.vcf")
//annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
annotated.rodToIntervalTrackName = "variant" annotated.rodToIntervalTrackName = "variant"
annotated.analysisName = base+"_GenomicAnnotator" annotated.analysisName = base+"_GenomicAnnotator"
@ -301,7 +279,7 @@ class fullCallingPipeline extends QScript {
val masker = new VariantFiltration with CommandLineGATKArgs val masker = new VariantFiltration with CommandLineGATKArgs
masker.jobOutputFile = new File(".queue/logs/SNPCalling/Masker.out") masker.jobOutputFile = new File(".queue/logs/SNPCalling/Masker.out")
masker.variantVCF = annotated.out masker.variantVCF = annotated.out
masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out) masker.rodBind :+= RodBind("mask", "VCF", indels.out)
masker.maskName = "NearIndel" masker.maskName = "NearIndel"
masker.clusterWindowSize = Some(10) masker.clusterWindowSize = Some(10)
masker.clusterSize = Some(3) masker.clusterSize = Some(3)
@ -312,7 +290,7 @@ class fullCallingPipeline extends QScript {
val handFilter = new VariantFiltration with CommandLineGATKArgs val handFilter = new VariantFiltration with CommandLineGATKArgs
handFilter.jobOutputFile = new File(".queue/logs/SNPCalling/HandFilter.out") handFilter.jobOutputFile = new File(".queue/logs/SNPCalling/HandFilter.out")
handFilter.variantVCF = masker.out handFilter.variantVCF = masker.out
handFilter.rodBind :+= RodBind("mask", "VCF", mergeIndels.out) handFilter.rodBind :+= RodBind("mask", "VCF", indels.out)
//handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun") //handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
//handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","\"QD<5.0\"","\"HRun>=4\"") //handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","\"QD<5.0\"","\"HRun>=4\"")
handFilter.filterName ++= List("StrandBias","QualByDepth","HomopolymerRun") handFilter.filterName ++= List("StrandBias","QualByDepth","HomopolymerRun")
@ -320,41 +298,6 @@ class fullCallingPipeline extends QScript {
handFilter.out = swapExt("SnpCalls",annotated.out,".vcf",".handfiltered.vcf") handFilter.out = swapExt("SnpCalls",annotated.out,".vcf",".handfiltered.vcf")
handFilter.analysisName = base+"_HandFilter" handFilter.analysisName = base+"_HandFilter"
// 3.i generate gaussian clusters on the masked vcf
// todo -- args for annotations?
// todo -- args for resources (properties file)
// val clusters = new GenerateVariantClusters with CommandLineGATKArgs
// clusters.jobOutputFile = new File(".queue/logs/SNPCalling/Clusters.out")
// clusters.rodBind :+= RodBind("input", "VCF", masker.out)
// clusters.rodBind :+= RodBind("dbsnp", "ROD", qscript.pipeline.getProject.getDbsnpFile)
// val clusters_clusterFile = swapExt("SnpCalls/IntermediateFiles",snps.out,".vcf",".cluster")
// clusters.clusterFile = clusters_clusterFile
// clusters.memoryLimit = Some(4)
// clusters.jobQueue = qscript.big_mem_queue
// clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
// clusters.analysisName = base+"_Cluster"
// 3.ii apply gaussian clusters to the masked vcf
// val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
// recalibrate.jobOutputFile = new File(".queue/logs/SNPCalling/Recalibrator.out")
// recalibrate.clusterFile = clusters.clusterFile
// recalibrate.DBSNP = qscript.pipeline.getProject.getDbsnpFile
// recalibrate.rodBind :+= RodBind("input", "VCF", masker.out)
// recalibrate.out = swapExt("SnpCalls",masker.out,".vcf",".recalibrated.vcf")
// recalibrate.target_titv = qscript.target_titv
// recalibrate.tranches_file = swapExt("SnpCalls/IntermediateFiles", masker.out,".vcf",".recalibrate.tranches")
// recalibrate.analysisName = base+"_VariantRecalibrator"
// 3.iii apply variant cuts to the clusters
// val cut = new ApplyVariantCuts with CommandLineGATKArgs
// cut.jobOutputFile = new File(".queue/logs/SNPCalling/VariantCuts.out")
// cut.rodBind :+= RodBind("input", "VCF", recalibrate.out)
// cut.out = swapExt("SnpCalls",recalibrate.out,".vcf",".tranched.vcf")
//cut.tranches_file = recalibrate.tranches_file
// todo -- fdr inputs, etc
// cut.fdr_filter_level = Some(1)
// cut.analysisName = base+"_ApplyVariantCuts"
// 4. Variant eval the cut and the hand-filtered vcf files // 4. Variant eval the cut and the hand-filtered vcf files
val eval = new VariantEval with CommandLineGATKArgs val eval = new VariantEval with CommandLineGATKArgs
@ -371,7 +314,6 @@ class fullCallingPipeline extends QScript {
} else{ } else{
eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
} }
add(snps)
// 5. Make the bam list // 5. Make the bam list
val listOfBams = new File(base +".BamFiles.list") val listOfBams = new File(base +".BamFiles.list")
@ -384,35 +326,37 @@ class fullCallingPipeline extends QScript {
// 6. Run the ADPR and make pretty stuff // 6. Run the ADPR and make pretty stuff
class rCommand extends CommandLineFunction{ add(snps, indels, annotated,masker,handFilter,eval,writeBamList)
@Input(doc="R script") if (qscript.tearScript != null){
var script: File = _ class rCommand extends CommandLineFunction{
@Input(doc="pipeline yaml") @Input(doc="R script")
var yaml: File = _ var script: File = _
@Input(doc="list of bams") @Input(doc="pipeline yaml")
var bamlist: File =_ var yaml: File = _
@Input(doc="Eval files root") @Input(doc="list of bams")
var evalroot: File =_ var bamlist: File =_
@Output(doc="tearsheet loc") @Input(doc="Eval files root")
var tearsheet: File =_ var evalroot: File =_
def commandLine = "Rscript %s -yaml %s -bamlist %s -evalroot %s -tearout %s" @Output(doc="tearsheet loc")
.format(script, yaml, bamlist, evalroot, tearsheet) var tearsheet: File =_
} def commandLine = "Rscript %s -yaml %s -bamlist %s -evalroot %s -tearout %s".format(script, yaml, bamlist, evalroot, tearsheet)
}
val adpr = new rCommand val adpr = new rCommand
adpr.bamlist = listOfBams adpr.bamlist = listOfBams
adpr.yaml = qscript.yamlFile.getAbsoluteFile adpr.yaml = qscript.yamlFile.getAbsoluteFile
adpr.script = tearScript adpr.script = qscript.tearScript
adpr.evalroot = eval.reportLocation adpr.evalroot = eval.reportLocation
adpr.jobOutputFile = new File(".queue/logs/SNPCalling/adpr.out") adpr.jobOutputFile = new File(".queue/logs/SNPCalling/adpr.out")
adpr.tearsheet = new File("SnpCalls", base + ".tearsheet.pdf") adpr.tearsheet = new File("SnpCalls", base + ".tearsheet.pdf")
adpr.analysisName = base + "_ADPR" adpr.analysisName = base + "_ADPR"
for ( igv2 <- indelGenotypers ) { add(adpr)
add(igv2)
} }
add(mergeIndels,annotated,masker,handFilter,eval,writeBamList,adpr)
} }
} }