From 9820a12fa533caaf81b2a441537e453e85c1442d Mon Sep 17 00:00:00 2001
From: kiran <kiran@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 26 Sep 2010 16:55:58 +0000
Subject: [PATCH] Removed unnecessary dbSNP big-table dependency.  Ti/Tv is now
 required.  Consistent downsampling level for all programs.  Spelling
 corrections.  VariantEval now generates R-style output.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4355 348d0f76-0448-11de-a6fe-93d51630548a
---
 scala/qscript/fullCallingPipeline.q | 97 +++++++++++++----------------
 1 file changed, 44 insertions(+), 53 deletions(-)

diff --git a/scala/qscript/fullCallingPipeline.q b/scala/qscript/fullCallingPipeline.q
index 8e210ae0d..ecd0fd335 100755
--- a/scala/qscript/fullCallingPipeline.q
+++ b/scala/qscript/fullCallingPipeline.q
@@ -7,6 +7,7 @@ import org.broadinstitute.sting.queue.extensions.samtools._
 import org.broadinstitute.sting.queue.{QException, QScript}
 import collection.JavaConversions._
 import org.broadinstitute.sting.utils.yaml.YamlUtils
+import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
 
 class fullCallingPipeline extends QScript {
   qscript =>
@@ -29,33 +30,33 @@ class fullCallingPipeline extends QScript {
   @Input(doc="refseqTable", shortName="refseqTable")
   var refseqTable: File = _
 
-  @Input(doc="dbsnpTable", shortName="dbsnpTable")
-  var dbsnpTable: File = _
-
-  @Input(doc="Picard FixMateInformation.jar.  At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar.  Outside the broad see http://picard.sourceforge.net/")
+  @Input(doc="Picard FixMateInformation.jar.  At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar.  Outside the Broad see http://picard.sourceforge.net/")
   var picardFixMatesJar: File = _
 
   @Input(doc="gatk jar")
   var gatkJar: File = _
 
   @Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false)
-  var snpsInCluster = 4
+  var snpsInCluster = 3
 
   @Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
-  var snpClusterWindow = 7
+  var snpClusterWindow = 10
 
-  @Input(doc="target titv for recalibration",shortName="titv",required=false)
-  var target_titv = 2.1
+  @Input(doc="target titv for recalibration",shortName="titv",required=true)
+  var target_titv: Float = _
 
   @Input(doc="downsampling coverage",shortName="dcov",required=false)
-  var downsampling_coverage = 200
+  var downsampling_coverage = 300
 
-  @Input(doc="Number of jobs to scatter unifeid genotyper",shortName="snpScatter",required=false)
+  @Input(doc="Number of jobs to scatter UnifiedGenotyper",shortName="snpScatter",required=false)
   var num_snp_scatter_jobs = 50
 
-  @Input(doc="Number of jobs to scatter indel genotyper",shortName="indelScatter",required=false)
+  @Input(doc="Number of jobs to scatter IndelGenotyperV2",shortName="indelScatter",required=false)
   var num_indel_scatter_jobs = 5
 
+  @Input(doc="Indel-clean BAM files",shortName="clean",required=false)
+  var clean_bam_files = 1
+
   @Input(doc="ADPR script")
   var adprScript: File = _
 
@@ -71,11 +72,10 @@ class fullCallingPipeline extends QScript {
     this.intervals = qscript.pipeline.getProject.getIntervalList
     this.jarFile = qscript.gatkJar
     this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
+    this.memoryLimit = Some(2)
   }
 
 
-
-
   // ------------ SETUP THE PIPELINE ----------- //
 
 
@@ -84,27 +84,24 @@ class fullCallingPipeline extends QScript {
     val projectBase: String = qscript.pipeline.getProject.getName
     val cleanedBase: String = projectBase + ".cleaned"
     val uncleanedBase: String = projectBase + ".uncleaned"
+
     // there are commands that use all the bam files
-    val recalibratedSamples = qscript.pipeline.getSamples
-            .filter(_.getBamFiles.contains("recalibrated"))
+    val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
     val adprRScript = qscript.adprScript
     val seq = qscript.machine
     val expKind = qscript.protocol
+
     for ( sample <- recalibratedSamples ) {
-
-      // put unclean bams in unclean genotypers
-
-      // in advance, create the extension files
-
+      // put unclean bams in unclean genotypers in advance, create the extension files
       val bam = sample.getBamFiles.get("recalibrated")
-      if (!sample.getBamFiles.contains("cleaned"))
+      if (!sample.getBamFiles.contains("cleaned")) {
         sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam"))
-      val cleaned_bam = sample.getBamFiles.get("cleaned")
+      }
 
+      val cleaned_bam = sample.getBamFiles.get("cleaned")
       val indel_targets = swapExt(bam,"bam","realigner_targets.interval_list")
 
       // create the cleaning commands
-
       val targetCreator = new RealignerTargetCreator with CommandLineGATKArgs
       targetCreator.analysisName = "CreateTargets_"+bam.getName
       targetCreator.input_file :+= bam
@@ -126,8 +123,6 @@ class fullCallingPipeline extends QScript {
           def outputBam = fixed
         }
 
-
-
       // realigner.out = cleaned_bam
       // realigner.scatterClass = classOf[ContigScatterFunction]
       // realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar }
@@ -163,27 +158,33 @@ class fullCallingPipeline extends QScript {
       samtoolsindex.analysisName = "index_"+cleaned_bam.getName
 
       // COMMENT THIS NEXT BLOCK TO SKIP CLEANING
-      if ( realigner.scatterCount > 1 )
-          add(targetCreator,realigner,samtoolsindex)
-      else
-          add(targetCreator,realigner,fixMates,samtoolsindex)
+      if (qscript.clean_bam_files == 1) {
+        if ( realigner.scatterCount > 1 ) {
+            add(targetCreator,realigner,samtoolsindex)
+        } else {
+            add(targetCreator,realigner,fixMates,samtoolsindex)
+        }
+      }
     }
 
     val recalibratedBamFiles = recalibratedSamples
             .map(_.getBamFiles.get("recalibrated"))
             .toList
-    
+
     val cleanBamFiles = qscript.pipeline.getSamples
             .filter(_.getBamFiles.contains("cleaned"))
             .map(_.getBamFiles.get("cleaned"))
             .toList
 
     // actually make calls
-    //endToEnd(uncleanedBase,recalibratedBamFiles, adprRscript, seq, expKind)
-    endToEnd(uncleanedBase,recalibratedBamFiles, seq, expKind)
-    // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES
-    //endToEnd(cleanedBase,cleanBamFiles, adprRscript, seq, expKind)
-    endToEnd(cleanedBase,cleanBamFiles, seq, expKind)
+    if (qscript.clean_bam_files == 1) {
+      // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES
+      //endToEnd(cleanedBase, cleanBamFiles, adprRscript, seq, expKind)
+      endToEnd(cleanedBase, cleanBamFiles, seq, expKind)
+    } else {
+      //endToEnd(uncleanedBase, recalibratedBamFiles, adprRscript, seq, expKind)
+      endToEnd(uncleanedBase, recalibratedBamFiles, seq, expKind)
+    }
   }
 
   //def endToEnd(base: String, bamFiles: List[File], adprthing: File, seqinfo: String, exptype: String) = {
@@ -199,7 +200,7 @@ class fullCallingPipeline extends QScript {
     snps.standard_min_confidence_threshold_for_emitting = Some(10)
     snps.min_mapping_quality_score = Some(20)
     snps.min_base_quality_score = Some(20)
-    snps.downsample_to_coverage = Some(200)
+    snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
     //snps.annotation :+= "QualByDepthV2"
 
     if (qscript.trigger != null) {
@@ -216,7 +217,6 @@ class fullCallingPipeline extends QScript {
 
     snps.scatterCount = qscript.num_snp_scatter_jobs
 
-
     // indel genotyper does one sample at a time
     var indelCallFiles = List.empty[RodBind]
     var indelGenotypers = List.empty[IndelGenotyperV2 with CommandLineGATKArgs]
@@ -227,9 +227,9 @@ class fullCallingPipeline extends QScript {
       indel.analysisName = "IndelGenotyper_"+bam.getName
       indel.input_file :+= bam
       indel.out = swapExt(bam,".bam",".indels.vcf")
-      indel.downsample_to_coverage = Some(500)
+      indel.downsample_to_coverage = Some(qscript.downsampling_coverage)
       indelCallFiles :+= RodBind("v"+loopNo.toString, "VCF", indel.out)
-      indel.scatterCount = qscript.num_indel_scatter_jobs
+      //indel.scatterCount = qscript.num_indel_scatter_jobs
 
       indelGenotypers :+= indel
 
@@ -248,21 +248,19 @@ class fullCallingPipeline extends QScript {
     mergeIndels.rodBind = indelCallFiles
     mergeIndels.analysisName = base+"_MergeIndels"
 
-
     // 1b. genomically annotate SNPs -- no longer slow
     val annotated = new GenomicAnnotator with CommandLineGATKArgs
     annotated.rodBind :+= RodBind("variant", "VCF", snps.out)
     annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.refseqTable)
-    annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
+    //annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
     annotated.out = swapExt(snps.out,".vcf",".annotated.vcf")
-    annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
+    //annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
     annotated.rodToIntervalTrackName = "variant"
     annotated.analysisName = base+"_GenomicAnnotator"
 
-
     // 2.a filter on cluster and near indels
     val masker = new VariantFiltration with CommandLineGATKArgs
-    masker.variantVCF = annotated.out 
+    masker.variantVCF = annotated.out
     masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out)
     masker.maskName = "NearIndel"
     masker.clusterWindowSize = Some(qscript.snpClusterWindow)
@@ -270,7 +268,6 @@ class fullCallingPipeline extends QScript {
     masker.out = swapExt(annotated.out,".vcf",".indel.masked.vcf")
     masker.analysisName = base+"_Cluster_and_Indel_filter"
 
-
     // 2.b hand filter with standard filter
     val handFilter = new VariantFiltration with CommandLineGATKArgs
     handFilter.variantVCF = masker.out
@@ -280,7 +277,6 @@ class fullCallingPipeline extends QScript {
     handFilter.out = swapExt(annotated.out,".vcf",".handfiltered.vcf")
     handFilter.analysisName = base+"_HandFilter"
 
-
     // 3.i generate gaussian clusters on the masked vcf
     // todo -- args for annotations?
     // todo -- args for resources (properties file)
@@ -295,7 +291,6 @@ class fullCallingPipeline extends QScript {
     clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
     clusters.analysisName = base+"_Cluster"
 
-
     // 3.ii apply gaussian clusters to the masked vcf
     val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
     recalibrate.clusterFile = clusters.clusterFile
@@ -316,14 +311,13 @@ class fullCallingPipeline extends QScript {
     cut.fdr_filter_level = Some(1)
     cut.analysisName = base+"_ApplyVariantCuts"
 
-    
     // 4. Variant eval the cut and the hand-filtered vcf files
     val eval = new VariantEval with CommandLineGATKArgs
     eval.rodBind :+= RodBind("evalOptimized", "VCF", cut.out)
     eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out)
     eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
-    //eval.reportLocation = new File(base+".eval")
-    //eval.reportType = "R"
+    eval.reportLocation = new File(base+".eval")
+    eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
     eval.analysisName = base+"_VariantEval"
 
     add(snps)
@@ -353,14 +347,11 @@ class fullCallingPipeline extends QScript {
     // to be in the working directory. When database access is ready, this and the protocol and sequencer parameters of
     //the r script will go away.
 
-
     for ( igv2 <- indelGenotypers ) {
       add(igv2)
     }
 
 //    add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval,adpr)
     add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval)
-
   }
-
 }