From bcc09f5d8c19fd4b9a340e6205d916aeddaa6139 Mon Sep 17 00:00:00 2001
From: kiran <kiran@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 26 Sep 2010 22:50:30 +0000
Subject: [PATCH] Simplifications: removed command-line arguments to control
 SNP cluster filter parameters.  Infer the number of contigs to scatter indel
 cleaning from the contig list (which we should get rid of too).  Changed the
 PY argument to just Y for specifying the path to the YAML file.  Cleaned up
 command-line argument documentation.  See
 http://iwww.broadinstitute.org/gsa/wiki/index.php/Queue-based_pipeline for a
 list of remaining issues.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4356 348d0f76-0448-11de-a6fe-93d51630548a
---
 scala/qscript/fullCallingPipeline.q | 94 ++++++++++++++---------------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/scala/qscript/fullCallingPipeline.q b/scala/qscript/fullCallingPipeline.q
index ecd0fd335..d0f5e8e86 100755
--- a/scala/qscript/fullCallingPipeline.q
+++ b/scala/qscript/fullCallingPipeline.q
@@ -1,3 +1,4 @@
+import net.sf.picard.reference.FastaSequenceFile
 import org.broadinstitute.sting.datasources.pipeline.Pipeline
 import org.broadinstitute.sting.gatk.DownsampleType
 import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
@@ -12,59 +13,47 @@ import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
 class fullCallingPipeline extends QScript {
   qscript =>
 
-  @Argument(doc="contigIntervals", shortName="contigIntervals")
+  @Argument(doc="list of contigs in the reference over which indel-cleaning jobs should be scattered (ugly)", shortName="contigIntervals")
   var contigIntervals: File = _
 
-  @Argument(doc="numContigs", shortName="numContigs")
-  var numContigs: Int = _
+  @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
+  var yamlFile: File = _
 
-  @Argument(fullName="pipeline_yaml", shortName="PY", doc="Pipeline YAML file")
-  var pipelineYamlFile: File = _
-
-  @Input(doc="trigger", shortName="trigger", required=false)
+  @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
   var trigger: File = _
 
-  @Input(doc="compCEU",shortName="ceu",required=false)
-  var comp1KGCEU: File = _
-
-  @Input(doc="refseqTable", shortName="refseqTable")
+  @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
   var refseqTable: File = _
 
-  @Input(doc="Picard FixMateInformation.jar.  At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar.  Outside the Broad see http://picard.sourceforge.net/")
-  var picardFixMatesJar: File = _
+  @Input(doc="path to Picard FixMateInformation.jar.  See http://picard.sourceforge.net/ .", required=false)
+  var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
 
-  @Input(doc="gatk jar")
+  @Input(doc="path to GATK jar")
   var gatkJar: File = _
 
-  @Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false)
-  var snpsInCluster = 3
-
-  @Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
-  var snpClusterWindow = 10
-
-  @Input(doc="target titv for recalibration",shortName="titv",required=true)
+  @Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
   var target_titv: Float = _
 
-  @Input(doc="downsampling coverage",shortName="dcov",required=false)
+  @Input(doc="per-sample downsampling level",shortName="dcov",required=false)
   var downsampling_coverage = 300
 
-  @Input(doc="Number of jobs to scatter UnifiedGenotyper",shortName="snpScatter",required=false)
-  var num_snp_scatter_jobs = 50
+  @Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
+  var num_snp_scatter_jobs = 20
 
-  @Input(doc="Number of jobs to scatter IndelGenotyperV2",shortName="indelScatter",required=false)
+  @Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
   var num_indel_scatter_jobs = 5
 
-  @Input(doc="Indel-clean BAM files",shortName="clean",required=false)
-  var clean_bam_files = 1
+  @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
+  var skip_cleaning = false
 
-  @Input(doc="ADPR script")
-  var adprScript: File = _
+  //@Input(doc="ADPR script")
+  //var adprScript: File = _
 
-  @Input(doc="Sequencing maching name (for use by adpr)")
-  var machine: String = _
+  //@Input(doc="Sequencing maching name (for use by adpr)")
+  //var machine: String = _
 
-  @Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
-  var protocol: String = _
+  //@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
+  //var protocol: String = _
 
   private var pipeline: Pipeline = _
 
@@ -80,16 +69,23 @@ class fullCallingPipeline extends QScript {
 
 
   def script = {
-    pipeline = YamlUtils.load(classOf[Pipeline], qscript.pipelineYamlFile)
+    pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
+
     val projectBase: String = qscript.pipeline.getProject.getName
     val cleanedBase: String = projectBase + ".cleaned"
     val uncleanedBase: String = projectBase + ".uncleaned"
 
     // there are commands that use all the bam files
     val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
-    val adprRScript = qscript.adprScript
-    val seq = qscript.machine
-    val expKind = qscript.protocol
+    //val adprRScript = qscript.adprScript
+    //val seq = qscript.machine
+    //val expKind = qscript.protocol
+
+    // count number of contigs (needed for indel cleaning parallelism)
+    var contigCount = 0
+    for ( line <- scala.io.Source.fromFile(qscript.contigIntervals).getLines ) {
+      contigCount += 1
+    }
 
     for ( sample <- recalibratedSamples ) {
       // put unclean bams in unclean genotypers in advance, create the extension files
@@ -112,7 +108,7 @@ class fullCallingPipeline extends QScript {
       realigner.input_file = targetCreator.input_file
       realigner.intervals = qscript.contigIntervals
       realigner.targetIntervals = new java.io.File(targetCreator.out.getAbsolutePath)
-      realigner.scatterCount = qscript.numContigs
+      realigner.scatterCount = contigCount
 
       // may need to explicitly run fix mates
       var fixMates = new PicardBamJarFunction {
@@ -157,8 +153,7 @@ class fullCallingPipeline extends QScript {
       samtoolsindex.bamFile = cleaned_bam
       samtoolsindex.analysisName = "index_"+cleaned_bam.getName
 
-      // COMMENT THIS NEXT BLOCK TO SKIP CLEANING
-      if (qscript.clean_bam_files == 1) {
+      if (!qscript.skip_cleaning) {
         if ( realigner.scatterCount > 1 ) {
             add(targetCreator,realigner,samtoolsindex)
         } else {
@@ -177,18 +172,17 @@ class fullCallingPipeline extends QScript {
             .toList
 
     // actually make calls
-    if (qscript.clean_bam_files == 1) {
-      // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES
+    if (!qscript.skip_cleaning) {
       //endToEnd(cleanedBase, cleanBamFiles, adprRscript, seq, expKind)
-      endToEnd(cleanedBase, cleanBamFiles, seq, expKind)
+      endToEnd(cleanedBase, cleanBamFiles)
     } else {
       //endToEnd(uncleanedBase, recalibratedBamFiles, adprRscript, seq, expKind)
-      endToEnd(uncleanedBase, recalibratedBamFiles, seq, expKind)
+      endToEnd(uncleanedBase, recalibratedBamFiles)
     }
   }
 
   //def endToEnd(base: String, bamFiles: List[File], adprthing: File, seqinfo: String, exptype: String) = {
-  def endToEnd(base: String, bamFiles: List[File], seqinfo: String, exptype: String) = {
+  def endToEnd(base: String, bamFiles: List[File]) = {
 
     // step through the un-indel-cleaned graph:
     // 1a. call snps and indels
@@ -211,9 +205,9 @@ class fullCallingPipeline extends QScript {
     }
 
     // todo -- add generalize comp inputs
-    if ( qscript.comp1KGCEU != null ) {
-      snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
-    }
+    //if ( qscript.comp1KGCEU != null ) {
+    //  snps.rodBind :+= RodBind( "comp1KG_CEU", "VCF", qscript.comp1KGCEU )
+    //}
 
     snps.scatterCount = qscript.num_snp_scatter_jobs
 
@@ -263,8 +257,8 @@ class fullCallingPipeline extends QScript {
     masker.variantVCF = annotated.out
     masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out)
     masker.maskName = "NearIndel"
-    masker.clusterWindowSize = Some(qscript.snpClusterWindow)
-    masker.clusterSize = Some(qscript.snpsInCluster)
+    masker.clusterWindowSize = Some(10)
+    masker.clusterSize = Some(3)
     masker.out = swapExt(annotated.out,".vcf",".indel.masked.vcf")
     masker.analysisName = base+"_Cluster_and_Indel_filter"