From 3ec0e09edd6e6be2a3fdb519b20bbe84c9a72eb9 Mon Sep 17 00:00:00 2001 From: corin Date: Fri, 24 Sep 2010 19:28:43 +0000 Subject: [PATCH] ADPR is now included in the full calling pipeline. The most up to date version of the ADPR is about to be committed and should be used with the script for now. The qscript now calls for two additional strings as inputs: the sequencing machines used and the sequencing protocol. In order for ADPR to finish successfully, a squid file for both the lane and sample level data needs to be produced, reformatted and named _lanes.txt or _samps.txt, respectively. These files need to be in the working directory. When database access is ready, this and the protocol and sequencer parameters of the r script will go away. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4345 348d0f76-0448-11de-a6fe-93d51630548a --- scala/qscript/fullCallingPipeline.q | 52 +++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/scala/qscript/fullCallingPipeline.q b/scala/qscript/fullCallingPipeline.q index c269a7af7..f5321e824 100755 --- a/scala/qscript/fullCallingPipeline.q +++ b/scala/qscript/fullCallingPipeline.q @@ -56,6 +56,15 @@ class fullCallingPipeline extends QScript { @Input(doc="Number of jobs to scatter indel genotyper",shortName="indelScatter",required=false) var num_indel_scatter_jobs = 5 + @Input(doc="ADPR script") + var adprScript: File = _ + + @Input(doc="Sequencing maching name (for use by adpr)") + var machine: String = _ + + @Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection") + var protocol: String = _ + private var pipeline: Pipeline = _ trait CommandLineGATKArgs extends CommandLineGATK { @@ -65,6 +74,8 @@ class fullCallingPipeline extends QScript { } + + // ------------ SETUP THE PIPELINE ----------- // @@ -76,7 +87,9 @@ class fullCallingPipeline extends QScript { // there are commands that use all the bam files val recalibratedSamples = qscript.pipeline.getSamples .filter(_.getBamFiles.contains("recalibrated")) - + val adprRScript = qscript.adprScript + val seq = qscript.machine + val expKind = qscript.protocol for ( sample <- recalibratedSamples ) { // put unclean bams in unclean genotypers @@ -166,12 +179,12 @@ class fullCallingPipeline extends QScript { .toList // actually make calls - endToEnd(uncleanedBase,recalibratedBamFiles) + endToEnd(uncleanedBase,recalibratedBamFiles, adprRscript, seq, expKind) // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES - endToEnd(cleanedBase,cleanBamFiles) + endToEnd(cleanedBase,cleanBamFiles, adprRscript, seq, expKind) } - def endToEnd(base: String, bamFiles: List[File]) = { + def endToEnd(base: String, bamFiles: List[File], adprthing: File, seqinfo: String, exptype: String) = { // step through the un-indel-cleaned graph: // 1a. call snps and indels @@ -306,16 +319,43 @@ class fullCallingPipeline extends QScript { eval.rodBind :+= RodBind("evalOptimized", "VCF", cut.out) eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out) eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator") - eval.out = new File(base+".eval") + eval.reportLocation = new File(base+".eval") + eval.reportType = "R" eval.analysisName = base+"_VariantEval" add(snps) + // 5. Run the ADPR and make pretty stuff + + val adpr = new CommandLineFunction{ + @Input(doc="Dependent files") var dependents: File = _ + @Output(doc="Automated Data processing report") var out: File = _ + var setname: String + var protocol: String + var sequencer: String + var scriptloc: File + def commandLine = "Rscript %s %s %s %s" + .format(scriptloc, setname, protocol, sequencer) + } + + adpr.setname = base + adpr.scriptloc = adprthing + adpr.sequencer = seqinfo + adpr.protocol = exptype + adpr.dependents = eval.reportLocation + adpr.out = new File(base + "_adpr.pdf") + adpr.analysisName = base + "_ADPR" + //In order for ADPR to finish successfully, a squid file for both the lane and sample level data needs to be + // produced, reformatted and named _lanes.txt or _samps.txt, respectively. These files + // to be in the working directory. When database access is ready, this and the protocol and sequencer parameters of + //the r script will go away. + + for ( igv2 <- indelGenotypers ) { add(igv2) } - add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval) + add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval,adpr) }