Reformatting and tweaks to the end-to-end pipeline
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4066 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
63ada20da5
commit
0028b884d8
|
|
@ -18,6 +18,9 @@ class fullCallingPipeline extends QScript {
|
||||||
@Input(doc="trigger", shortName="trigger", required=false)
|
@Input(doc="trigger", shortName="trigger", required=false)
|
||||||
var trigger: File = _
|
var trigger: File = _
|
||||||
|
|
||||||
|
@Input(doc="compCEU",shortName="ceu",required=false)
|
||||||
|
var comp1KGCEU: File = _
|
||||||
|
|
||||||
@Input(doc="refseqTable", shortName="refseqTable")
|
@Input(doc="refseqTable", shortName="refseqTable")
|
||||||
var refseqTable: File = _
|
var refseqTable: File = _
|
||||||
|
|
||||||
|
|
@ -36,23 +39,30 @@ class fullCallingPipeline extends QScript {
|
||||||
@Input(doc="gatk jar")
|
@Input(doc="gatk jar")
|
||||||
var gatkJar: File = _
|
var gatkJar: File = _
|
||||||
|
|
||||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
@Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false)
|
||||||
|
var snpsInCluster = 4
|
||||||
|
|
||||||
|
@Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
|
||||||
|
var snpClusterWindow = 7
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||||
this.intervals = qscript.intervals
|
this.intervals = qscript.intervals
|
||||||
this.jarFile = qscript.gatkJar
|
this.jarFile = qscript.gatkJar
|
||||||
}
|
}
|
||||||
|
|
||||||
// ------------ SETUP THE PIPELINE ----------- //
|
// ------------ SETUP THE PIPELINE ----------- //
|
||||||
|
|
||||||
// todo -- the unclean and clean pipelines are the same, so the code can be condensed significantly
|
|
||||||
|
|
||||||
def script = {
|
def script = {
|
||||||
val projectBase: String = qscript.project
|
val projectBase: String = qscript.project
|
||||||
val cleanedBase: String = projectBase + ".cleaned"
|
val cleanedBase: String = projectBase + ".cleaned"
|
||||||
val uncleanedBase: String = projectBase + ".uncleaned"
|
val uncleanedBase: String = projectBase + ".uncleaned"
|
||||||
// there are commands that use all the bam files
|
// there are commands that use all the bam files
|
||||||
var cleanBamFiles = List.empty[File]
|
var cleanBamFiles = List.empty[File]
|
||||||
|
|
||||||
for ( bam <- bamFiles ) {
|
for ( bam <- bamFiles ) {
|
||||||
|
|
||||||
// put unclean bams in unclean genotypers
|
// put unclean bams in unclean genotypers
|
||||||
|
|
||||||
|
|
@ -70,58 +80,64 @@ for ( bam <- bamFiles ) {
|
||||||
val realigner = new IndelRealigner with CommandLineGATKArgs
|
val realigner = new IndelRealigner with CommandLineGATKArgs
|
||||||
realigner.input_file = targetCreator.input_file
|
realigner.input_file = targetCreator.input_file
|
||||||
realigner.intervals = qscript.contigIntervals
|
realigner.intervals = qscript.contigIntervals
|
||||||
//realigner.targetIntervals = targetCreator.out
|
|
||||||
realigner.targetIntervals = targetCreator.out.getAbsolutePath
|
realigner.targetIntervals = targetCreator.out.getAbsolutePath
|
||||||
realigner.scatterCount = qscript.numContigs
|
realigner.scatterCount = qscript.numContigs
|
||||||
realigner.out = cleaned_bam
|
realigner.out = cleaned_bam
|
||||||
realigner.scatterClass = classOf[ContigScatterFunction]
|
realigner.scatterClass = classOf[ContigScatterFunction]
|
||||||
realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar }
|
realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar }
|
||||||
realigner.jobQueue = "long"
|
realigner.jobQueue = "week"
|
||||||
|
|
||||||
// put clean bams in clean genotypers
|
// put clean bams in clean genotypers
|
||||||
|
|
||||||
cleanBamFiles :+= realigner.out
|
cleanBamFiles :+= realigner.out
|
||||||
|
|
||||||
add(targetCreator,realigner)
|
add(targetCreator,realigner)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// actually make calls
|
||||||
endToEnd(uncleanedBase,bamFiles)
|
endToEnd(uncleanedBase,bamFiles)
|
||||||
endToEnd(cleanedBase,cleanBamFiles)
|
endToEnd(cleanedBase,cleanBamFiles)
|
||||||
}
|
}
|
||||||
|
|
||||||
def endToEnd(base: String, bamFiles: List[File]) = {
|
def endToEnd(base: String, bamFiles: List[File]) = {
|
||||||
|
|
||||||
// step through the un-indel-cleaned graph:
|
// step through the un-indel-cleaned graph:
|
||||||
// 1a. call snps and indels
|
// 1a. call snps and indels
|
||||||
val snps = new UnifiedGenotyper with CommandLineGATKArgs
|
val snps = new UnifiedGenotyper with CommandLineGATKArgs
|
||||||
snps.input_file = bamFiles
|
snps.input_file = bamFiles
|
||||||
snps.group :+= "Standard"
|
snps.group :+= "Standard"
|
||||||
snps.annotation :+= "MyHamplotypeScore"
|
|
||||||
snps.variants_out = new File(base+".vcf")
|
snps.variants_out = new File(base+".vcf")
|
||||||
snps.standard_min_confidence_threshold_for_emitting = Some(10)
|
snps.standard_min_confidence_threshold_for_emitting = Some(10)
|
||||||
snps.min_mapping_quality_score = Some(20)
|
snps.min_mapping_quality_score = Some(20)
|
||||||
snps.min_base_quality_score = Some(20)
|
snps.min_base_quality_score = Some(20)
|
||||||
snps.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
|
snps.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
|
||||||
snps.downsample_to_coverage = Some(200)
|
snps.downsample_to_coverage = Some(200)
|
||||||
// todo -- add input for comps, triggers, etc
|
snps.annotation :+= "QualByDepthV2"
|
||||||
|
|
||||||
if (qscript.trigger != null) {
|
if (qscript.trigger != null) {
|
||||||
snps.trigger_min_confidence_threshold_for_calling = Some(30)
|
snps.trigger_min_confidence_threshold_for_calling = Some(30)
|
||||||
snps.rodBind :+= RodBind("trigger", "VCF", qscript.trigger)
|
snps.rodBind :+= RodBind("trigger", "VCF", qscript.trigger)
|
||||||
|
// TODO: triggers need to get a name for comp-ing them if not dbSNP?
|
||||||
|
snps.rodBind :+= RodBind( "compTrigger", "VCF", qscript.trigger )
|
||||||
}
|
}
|
||||||
// todo -- hack -- get this from the command line, or properties
|
|
||||||
snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/CEU.low_coverage.2010_07.sites.hg18.vcf.gz") )
|
// todo -- add generalize comp inputs
|
||||||
|
if ( qscript.comp1KGCEU != null ) {
|
||||||
|
snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
|
||||||
|
}
|
||||||
|
|
||||||
|
snps.scatterCount = 50
|
||||||
|
|
||||||
|
|
||||||
// TODO: what is the 1KG_ALL track? -- same as trigger -- just makes sure the field is propagated :: chartl
|
|
||||||
snps.rodBind :+= RodBind( "comp1KG_ALL", "VCF", qscript.trigger )
|
|
||||||
|
|
||||||
|
|
||||||
snps.scatterCount = 100
|
|
||||||
val indels = new UnifiedGenotyper with CommandLineGATKArgs
|
val indels = new UnifiedGenotyper with CommandLineGATKArgs
|
||||||
indels.input_file = bamFiles
|
indels.input_file = bamFiles
|
||||||
|
indels.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
|
||||||
|
indels.downsample_to_coverage = Some(200)
|
||||||
indels.variants_out = new File(base+".indels.vcf")
|
indels.variants_out = new File(base+".indels.vcf")
|
||||||
indels.genotype_model = Some(Model.INDELS)
|
indels.genotype_model = Some(Model.INDELS)
|
||||||
indels.scatterCount = 100
|
indels.scatterCount = 50
|
||||||
// todo -- add inputs for the indel genotyper
|
|
||||||
|
|
||||||
// 1b. genomically annotate SNPs -- slow, but scatter it
|
// 1b. genomically annotate SNPs -- slow, but scatter it
|
||||||
val annotated = new GenomicAnnotator with CommandLineGATKArgs
|
val annotated = new GenomicAnnotator with CommandLineGATKArgs
|
||||||
annotated.rodBind :+= RodBind("variant", "VCF", snps.variants_out)
|
annotated.rodBind :+= RodBind("variant", "VCF", snps.variants_out)
|
||||||
|
|
@ -131,15 +147,18 @@ def endToEnd(base: String, bamFiles: List[File]) = {
|
||||||
annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
|
annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
|
||||||
annotated.rodToIntervalTrackName = "variant"
|
annotated.rodToIntervalTrackName = "variant"
|
||||||
annotated.scatterCount = 100
|
annotated.scatterCount = 100
|
||||||
|
|
||||||
|
|
||||||
// 2.a filter on cluster and near indels
|
// 2.a filter on cluster and near indels
|
||||||
val masker = new VariantFiltration with CommandLineGATKArgs
|
val masker = new VariantFiltration with CommandLineGATKArgs
|
||||||
masker.rodBind :+= RodBind("variant", "VCF", annotated.vcfOutput)
|
masker.rodBind :+= RodBind("variant", "VCF", annotated.vcfOutput)
|
||||||
masker.rodBind :+= RodBind("mask", "VCF", indels.variants_out)
|
masker.rodBind :+= RodBind("mask", "VCF", indels.variants_out)
|
||||||
masker.maskName = "NearIndel"
|
masker.maskName = "NearIndel"
|
||||||
masker.clusterWindowSize = Some(20)
|
masker.clusterWindowSize = Some(qscript.snpClusterWindow)
|
||||||
masker.clusterSize = Some(7)
|
masker.clusterSize = Some(qscript.snpsInCluster)
|
||||||
masker.out = swapExt(annotated.vcfOutput,".vcf",".indel.masked.vcf")
|
masker.out = swapExt(annotated.vcfOutput,".vcf",".indel.masked.vcf")
|
||||||
// todo -- snp cluster args?
|
|
||||||
|
|
||||||
// 2.b hand filter with standard filter
|
// 2.b hand filter with standard filter
|
||||||
val handFilter = new VariantFiltration with CommandLineGATKArgs
|
val handFilter = new VariantFiltration with CommandLineGATKArgs
|
||||||
handFilter.rodBind :+= RodBind("variant", "VCF", annotated.vcfOutput)
|
handFilter.rodBind :+= RodBind("variant", "VCF", annotated.vcfOutput)
|
||||||
|
|
@ -147,25 +166,31 @@ def endToEnd(base: String, bamFiles: List[File]) = {
|
||||||
handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
|
handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
|
||||||
handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","QD<5","\"HRun>=4\"")
|
handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","QD<5","\"HRun>=4\"")
|
||||||
handFilter.out = swapExt(annotated.vcfOutput,".vcf",".handfiltered.vcf")
|
handFilter.out = swapExt(annotated.vcfOutput,".vcf",".handfiltered.vcf")
|
||||||
|
|
||||||
|
|
||||||
// 3.i generate gaussian clusters on the masked vcf
|
// 3.i generate gaussian clusters on the masked vcf
|
||||||
|
// todo -- args for annotations?
|
||||||
|
// todo -- args for resources (properties file)
|
||||||
val clusters = new GenerateVariantClusters with CommandLineGATKArgs
|
val clusters = new GenerateVariantClusters with CommandLineGATKArgs
|
||||||
clusters.rodBind :+= RodBind("input", "VCF", masker.out)
|
clusters.rodBind :+= RodBind("input", "VCF", masker.out)
|
||||||
//clusters.clusterFile = swapExt(snps.variants_out,".vcf",".cluster")
|
|
||||||
val clusters_clusterFile = swapExt(snps.variants_out,".vcf",".cluster")
|
val clusters_clusterFile = swapExt(snps.variants_out,".vcf",".cluster")
|
||||||
clusters.clusterFile = clusters_clusterFile.getAbsolutePath
|
clusters.clusterFile = clusters_clusterFile.getAbsolutePath
|
||||||
clusters.memoryLimit = Some(8)
|
clusters.memoryLimit = Some(8)
|
||||||
clusters.jobQueue = "hugemem"
|
clusters.jobQueue = "hugemem"
|
||||||
// todo -- args for annotations?
|
|
||||||
// todo -- args for resources (properties file)
|
clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
|
||||||
clusters.use_annotation ++= List("QD", "SB", "MyHaplotypeScore", "HRun")
|
|
||||||
clusters.path_to_resources = "/humgen/gsa-scr1/chartl/sting/R"
|
clusters.path_to_resources = "/humgen/gsa-scr1/chartl/sting/R"
|
||||||
|
|
||||||
|
|
||||||
// 3.ii apply gaussian clusters to the masked vcf
|
// 3.ii apply gaussian clusters to the masked vcf
|
||||||
val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
|
val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
|
||||||
recalibrate.clusterFile = clusters.clusterFile
|
recalibrate.clusterFile = clusters.clusterFile
|
||||||
recalibrate.rodBind :+= RodBind("input", "VCF", masker.out)
|
recalibrate.rodBind :+= RodBind("input", "VCF", masker.out)
|
||||||
recalibrate.out = swapExt(masker.out,".vcf",".optimized.vcf")
|
recalibrate.out = swapExt(masker.out,".vcf",".optimized.vcf")
|
||||||
// todo -- inputs for Ti/Tv expectation and other things
|
// todo -- inputs for Ti/Tv expectation and other things -- command line
|
||||||
recalibrate.target_titv = Some(2.1)
|
recalibrate.target_titv = Some(2.1)
|
||||||
|
|
||||||
|
|
||||||
// 3.iii apply variant cuts to the clusters
|
// 3.iii apply variant cuts to the clusters
|
||||||
val cut = new ApplyVariantCuts with CommandLineGATKArgs
|
val cut = new ApplyVariantCuts with CommandLineGATKArgs
|
||||||
cut.rodBind :+= RodBind("input", "VCF", recalibrate.out)
|
cut.rodBind :+= RodBind("input", "VCF", recalibrate.out)
|
||||||
|
|
@ -177,15 +202,16 @@ def endToEnd(base: String, bamFiles: List[File]) = {
|
||||||
cut.tranchesFile = cut_tranchesFile.getAbsolutePath
|
cut.tranchesFile = cut_tranchesFile.getAbsolutePath
|
||||||
// todo -- fdr inputs, etc
|
// todo -- fdr inputs, etc
|
||||||
cut.fdr_filter_level = Some(10)
|
cut.fdr_filter_level = Some(10)
|
||||||
|
|
||||||
|
|
||||||
// 4. Variant eval the cut and the hand-filtered vcf files
|
// 4. Variant eval the cut and the hand-filtered vcf files
|
||||||
val eval = new VariantEval with CommandLineGATKArgs
|
val eval = new VariantEval with CommandLineGATKArgs
|
||||||
eval.rodBind :+= RodBind("evalOptimized", "VCF", cut_outputVCFFile)
|
eval.rodBind :+= RodBind("evalOptimized", "VCF", cut_outputVCFFile)
|
||||||
eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out)
|
eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out)
|
||||||
// todo -- make comp tracks command-line arguments or properties
|
|
||||||
eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
|
eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
|
||||||
eval.out = new File(base+".eval")
|
eval.out = new File(base+".eval")
|
||||||
|
|
||||||
add(snps,indels,annotated,masker,handFilter,clusters,recalibrate,cut,eval)
|
add(snps,indels,annotated,masker,handFilter,clusters,recalibrate,cut,eval)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue