2010-08-10 00:42:48 +08:00
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
2010-07-30 04:48:35 +08:00
import org.broadinstitute.sting.queue.QScript
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
class fullCallingPipeline extends QScript {
qscript =>
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Argument(doc="contigIntervals", shortName="contigIntervals")
var contigIntervals: File = _
2010-07-30 02:37:20 +08:00
2010-08-10 00:42:48 +08:00
@Argument(doc="numContigs", shortName="numContigs")
var numContigs: Int = _
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Argument(doc="project", shortName="project")
var project: String = _
2010-07-30 02:37:20 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="trigger", shortName="trigger", required=false)
var trigger: File = _
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
@Input(doc="compCEU",shortName="ceu",required=false)
var comp1KGCEU: File = _
2010-08-10 00:42:48 +08:00
@Input(doc="refseqTable", shortName="refseqTable")
var refseqTable: File = _
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="dbsnpTable", shortName="dbsnpTable")
var dbsnpTable: File = _
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="Picard FixMateInformation.jar. At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar. Outside the broad see http://picard.sourceforge.net/")
var picardFixMatesJar: File = _
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="intervals")
var intervals: File = _
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="bam files", shortName="I")
var bamFiles: List[File] = Nil
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
@Input(doc="gatk jar")
var gatkJar: File = _
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
@Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false)
var snpsInCluster = 4
@Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
var snpClusterWindow = 7
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = qscript.intervals
this.jarFile = qscript.gatkJar
}
// ------------ SETUP THE PIPELINE ----------- //
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
def script = {
val projectBase: String = qscript.project
val cleanedBase: String = projectBase + ".cleaned"
val uncleanedBase: String = projectBase + ".uncleaned"
2010-08-20 04:29:48 +08:00
// there are commands that use all the bam files
2010-08-10 00:42:48 +08:00
var cleanBamFiles = List.empty[File]
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
for ( bam <- bamFiles ) {
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
// put unclean bams in unclean genotypers
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
// in advance, create the extension files
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
val indel_targets = swapExt(bam,"bam","realigner_targets.interval_list")
val cleaned_bam = swapExt(bam,"bam","cleaned.bam") // note-- the scatter is in the definition itself
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
// create the cleaning commands
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
val targetCreator = new RealignerTargetCreator with CommandLineGATKArgs
targetCreator.input_file :+= bam
targetCreator.out = indel_targets
2010-08-10 00:42:48 +08:00
2010-08-20 04:29:48 +08:00
val realigner = new IndelRealigner with CommandLineGATKArgs
realigner.input_file = targetCreator.input_file
realigner.intervals = qscript.contigIntervals
2010-08-26 02:52:44 +08:00
realigner.targetIntervals = new java.io.File(targetCreator.out.getAbsolutePath)
2010-08-20 04:29:48 +08:00
realigner.scatterCount = qscript.numContigs
realigner.out = cleaned_bam
realigner.scatterClass = classOf[ContigScatterFunction]
realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar }
realigner.jobQueue = "week"
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
// put clean bams in clean genotypers
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
cleanBamFiles :+= realigner.out
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
add(targetCreator,realigner)
}
// actually make calls
2010-08-10 00:42:48 +08:00
endToEnd(uncleanedBase,bamFiles)
endToEnd(cleanedBase,cleanBamFiles)
}
2010-07-29 21:17:14 +08:00
2010-08-20 04:29:48 +08:00
def endToEnd(base: String, bamFiles: List[File]) = {
// step through the un-indel-cleaned graph:
// 1a. call snps and indels
val snps = new UnifiedGenotyper with CommandLineGATKArgs
snps.input_file = bamFiles
snps.group :+= "Standard"
2010-08-26 02:52:44 +08:00
snps.variants_out = base+".vcf"
2010-08-20 04:29:48 +08:00
snps.standard_min_confidence_threshold_for_emitting = Some(10)
snps.min_mapping_quality_score = Some(20)
snps.min_base_quality_score = Some(20)
snps.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
snps.downsample_to_coverage = Some(200)
snps.annotation :+= "QualByDepthV2"
if (qscript.trigger != null) {
snps.trigger_min_confidence_threshold_for_calling = Some(30)
snps.rodBind :+= RodBind("trigger", "VCF", qscript.trigger)
// TODO: triggers need to get a name for comp-ing them if not dbSNP?
snps.rodBind :+= RodBind( "compTrigger", "VCF", qscript.trigger )
}
// todo -- add generalize comp inputs
if ( qscript.comp1KGCEU != null ) {
snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
}
snps.scatterCount = 50
val indels = new UnifiedGenotyper with CommandLineGATKArgs
indels.input_file = bamFiles
indels.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
indels.downsample_to_coverage = Some(200)
2010-08-26 02:52:44 +08:00
indels.variants_out = base+".indels.vcf"
2010-08-20 04:29:48 +08:00
indels.genotype_model = Some(Model.INDELS)
indels.scatterCount = 50
// 1b. genomically annotate SNPs -- slow, but scatter it
val annotated = new GenomicAnnotator with CommandLineGATKArgs
2010-08-26 02:52:44 +08:00
annotated.rodBind :+= RodBind("variant", "VCF", new File(snps.variants_out))
2010-08-20 04:29:48 +08:00
annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.refseqTable)
annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
2010-08-26 02:52:44 +08:00
annotated.vcfOutput = swapExt(new File(snps.variants_out),".vcf",".annotated.vcf").getAbsolutePath
2010-08-20 04:29:48 +08:00
annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
annotated.rodToIntervalTrackName = "variant"
annotated.scatterCount = 100
// 2.a filter on cluster and near indels
val masker = new VariantFiltration with CommandLineGATKArgs
2010-08-26 02:52:44 +08:00
masker.rodBind :+= RodBind("variant", "VCF", new File(annotated.vcfOutput))
masker.rodBind :+= RodBind("mask", "VCF", new File(indels.variants_out))
2010-08-20 04:29:48 +08:00
masker.maskName = "NearIndel"
masker.clusterWindowSize = Some(qscript.snpClusterWindow)
masker.clusterSize = Some(qscript.snpsInCluster)
2010-08-26 02:52:44 +08:00
masker.out = swapExt(new File(annotated.vcfOutput),".vcf",".indel.masked.vcf")
2010-08-20 04:29:48 +08:00
// 2.b hand filter with standard filter
val handFilter = new VariantFiltration with CommandLineGATKArgs
2010-08-26 02:52:44 +08:00
handFilter.rodBind :+= RodBind("variant", "VCF", new File(annotated.vcfOutput))
handFilter.rodBind :+= RodBind("mask", "VCF", new File(indels.variants_out))
2010-08-20 04:29:48 +08:00
handFilter.filterName ++= List("StrandBias","AlleleBalance","QualByDepth","HomopolymerRun")
handFilter.filterExpression ++= List("\"SB>=0.10\"","\"AB>=0.75\"","QD<5","\"HRun>=4\"")
2010-08-26 02:52:44 +08:00
handFilter.out = swapExt(new File(annotated.vcfOutput),".vcf",".handfiltered.vcf")
2010-08-20 04:29:48 +08:00
// 3.i generate gaussian clusters on the masked vcf
// todo -- args for annotations?
// todo -- args for resources (properties file)
val clusters = new GenerateVariantClusters with CommandLineGATKArgs
clusters.rodBind :+= RodBind("input", "VCF", masker.out)
2010-08-26 02:52:44 +08:00
val clusters_clusterFile = swapExt(new File(snps.variants_out),".vcf",".cluster").getAbsolutePath
2010-08-20 04:29:48 +08:00
clusters.memoryLimit = Some(8)
clusters.jobQueue = "hugemem"
clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
2010-08-26 02:52:44 +08:00
// clusters.path_to_resources = "/humgen/gsa-scr1/chartl/sting/R"
// clusters.path_to_Rscript = "/broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.7.2/bin/Rscript"
2010-08-20 04:29:48 +08:00
// 3.ii apply gaussian clusters to the masked vcf
val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
recalibrate.clusterFile = clusters.clusterFile
recalibrate.rodBind :+= RodBind("input", "VCF", masker.out)
recalibrate.out = swapExt(masker.out,".vcf",".optimized.vcf")
// todo -- inputs for Ti/Tv expectation and other things -- command line
2010-08-26 02:52:44 +08:00
recalibrate.target_titv = 2.1
2010-08-20 04:29:48 +08:00
// 3.iii apply variant cuts to the clusters
val cut = new ApplyVariantCuts with CommandLineGATKArgs
cut.rodBind :+= RodBind("input", "VCF", recalibrate.out)
//cut.outputVCFFile = swapExt(recalibrate.out,".vcf",".tranched.vcf")
//cut.tranchesFile = swapExt(recalibrate.out,".vcf",".tranch")
2010-08-26 02:52:44 +08:00
val cut_outputVCFFile = swapExt(recalibrate.out,".vcf",".tranched.vcf").getAbsolutePath
val cut_tranchesFile = swapExt(recalibrate.out,".vcf",".tranch").getAbsolutePath
2010-08-20 04:29:48 +08:00
// todo -- fdr inputs, etc
cut.fdr_filter_level = Some(10)
// 4. Variant eval the cut and the hand-filtered vcf files
val eval = new VariantEval with CommandLineGATKArgs
2010-08-26 02:52:44 +08:00
eval.rodBind :+= RodBind("evalOptimized", "VCF", new File(cut_outputVCFFile))
2010-08-20 04:29:48 +08:00
eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out)
eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
eval.out = new File(base+".eval")
add(snps,indels,annotated,masker,handFilter,clusters,recalibrate,cut,eval)
}
2010-07-29 21:17:14 +08:00
2010-08-10 00:42:48 +08:00
}