Major refactoring of library and full calling pipeline (v2) structure.

Arguments to the full calling qscript (and indeed, any qscript that wants them) are now specified via the PipelineArgumentCollection

Libraries require a Pipeline object for instantiation -- eliminating their previous dependence on yaml files

Functions added to PipelineUtils to build out the proper Pipeline object from the PipelineArgumentCollection, which now contains 
additional arguments to specify pipeline properties (name, ref, bams, dbsnp, interval list); these arguments are mutually
exclusive with the yaml file.

Pipeline length reduced to a mere 62 lines.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4790 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-12-05 02:33:54 +00:00
parent bef48e7a42
commit 0944184832
6 changed files with 169 additions and 82 deletions

View File

@ -1,79 +1,31 @@
import org.broadinstitute.sting.commandline.ArgumentCollection
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK
import org.broadinstitute.sting.queue.pipeline.{BamProcessing,VariantCalling}
import org.broadinstitute.sting.queue.pipeline._
import org.broadinstitute.sting.queue.util.PipelineUtils
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
class fullCallingPipelineV2 extends QScript {
qscript =>
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false)
var cleaningJobs: Int = 1
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
var yamlFile: File = _
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
var trigger: File = _
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
var refseqTable: File = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to GATK jar", shortName="gatk")
var gatkJar: File = _
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
var target_titv: Float = _
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
var downsampling_coverage = 300
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
var num_snp_scatter_jobs = 20
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
var num_indel_scatter_jobs = 5
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false
//@Input(doc="ADPR script")
//var adprScript: File = _
//@Input(doc="Sequencing maching name (for use by adpr)")
//var machine: String = _
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
//var protocol: String = _
fcp =>
@ArgumentCollection var pipelineArgs = new PipelineArgumentCollection
private var pipeline: Pipeline = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals :+= qscript.pipeline.getProject.getIntervalList
this.jarFile = qscript.gatkJar
this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
this.memoryLimit = Some(4)
}
// ------------ SETUP THE PIPELINE ----------- //
def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
var callingLib: VariantCalling = new VariantCalling(qscript.yamlFile,qscript.gatkJar)
var cleaningLib: BamProcessing = new BamProcessing(qscript.yamlFile,qscript.gatkJar,qscript.picardFixMatesJar)
pipelineArgs.verifyArguments
pipeline = PipelineUtils.loadPipelineFromPAC(pipelineArgs)
var callingLib: VariantCalling = new VariantCalling(pipeline,fcp.pipelineArgs.gatkJar)
var cleaningLib: BamProcessing = new BamProcessing(pipeline,fcp.pipelineArgs.gatkJar,fcp.pipelineArgs.picardFixMatesJar)
val projectBase: String = qscript.pipeline.getProject.getName
val projectBase: String = fcp.pipeline.getProject.getName
val cleanedBase: String = projectBase + ".cleaned"
val uncleanedBase: String = projectBase + ".uncleaned"
// there are commands that use all the bam files
val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
val recalibratedSamples = fcp.pipeline.getSamples.filter( u => ( u.getBamFiles.contains("recalibrated") || u.getBamFiles.contains("cleaned") ) )
var bamsToClean: List[(File,File)] = Nil
var recalBams: List[File] = Nil
@ -90,11 +42,11 @@ class fullCallingPipelineV2 extends QScript {
cleanedBams :+= sample.getBamFiles.get("cleaned")
}
if ( !qscript.skip_cleaning ) {
addAll(cleaningLib.StandardIndelRealign(bamsToClean,qscript.cleaningJobs))
if ( !fcp.pipelineArgs.skip_cleaning ) {
addAll(cleaningLib.StandardIndelRealign(bamsToClean,fcp.pipelineArgs.cleaningJobs))
}
if (!qscript.skip_cleaning) {
if (!fcp.pipelineArgs.skip_cleaning ) {
endToEnd(cleanedBase, cleanedBams, callingLib)
} else {
endToEnd(uncleanedBase, recalBams, callingLib)
@ -107,12 +59,8 @@ class fullCallingPipelineV2 extends QScript {
var handfilt_vcf = new File(base+"_snps.handfiltered.annotated.vcf")
var indel_vcf = new File(base+"_indel_calls.vcf")
addAll(lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,qscript.target_titv,qscript.refseqTable))
addAll(lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,fcp.pipelineArgs.target_titv,fcp.pipelineArgs.refseqTable))
}
def addAll(clfs: List[CommandLineFunction]) = {
for ( c <- clfs ) {
add(c)
}
}
def addAll(clfs: List[CommandLineFunction]) = { clfs.foreach(c => add(c)) }
}

View File

@ -16,10 +16,13 @@ import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
class BamProcessing(yaml: File, gatkJar: File, fixMatesJar: File) {
class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
library =>
var attributes: Pipeline = YamlUtils.load(classOf[Pipeline],yaml)
var attributes : Pipeline = attribs
def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar)
trait StandardCommandLineGATK extends CommandLineGATK {
this.reference_sequence = library.attributes.getProject.getReferenceFile

View File

@ -0,0 +1,86 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.commandline.{Input, Argument, Hidden}
import java.io.File
/**
 * Command-line arguments shared by the full calling pipeline qscripts.
 *
 * A pipeline may be described either by a single YAML file (-Y) or by the
 * individual project-spec arguments (pName, pRef, pInt, pDB, pBams); the two
 * forms are mutually exclusive, which is enforced by verifyArguments.
 */
class PipelineArgumentCollection {

  @Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false)
  var cleaningJobs: Int = 1

  @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y", required=false)
  var yamlFile: File = _

  @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
  var trigger: File = _

  @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
  var refseqTable: File = _

  @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
  var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")

  @Input(doc="path to GATK jar", shortName="gatk")
  var gatkJar: File = _

  @Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
  var target_titv: Float = _

  @Input(doc="per-sample downsampling level",shortName="dcov",required=false)
  var downsampling_coverage = 300

  @Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
  var num_snp_scatter_jobs = 20

  @Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
  var num_indel_scatter_jobs = 5

  @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
  var skip_cleaning = false

  @Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 "+
             "cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unrecalibrated:/path/to/unrecal.bam)."+
             "Mutually exclusive with YAML",required=false, shortName="pBams")
  var projectBams: File = _

  @Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName")
  var projectName: String = _

  @Input(doc="The reference file. Mutually exclusive with YAML.", required=false, shortName="pRef")
  var projectRef: File = _

  @Input(doc="The project interval list. Mutually exclusive with YAML.", required=false, shortName="pInt")
  var projectIntervals: File = _

  @Input(doc="The project dbsnp. Mutually exclusive with YAML",required=false, shortName="pDB")
  var projectDBSNP: File = _

  //@Input(doc="ADPR script")
  //var adprScript: File = _
  //@Input(doc="Sequencing machine name (for use by adpr)")
  //var machine: String = _
  //@Input(doc="Sequencing experiment type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
  //var protocol: String = _

  /**
   * Validates the YAML-vs-project-spec mutual exclusion:
   *  - no YAML file => every one of pBams/pName/pRef/pInt/pDB must be set
   *  - YAML file    => none of them may be set
   * @throws IllegalArgumentException on any violation
   */
  def verifyArguments = {
    // Local helper so each required project-spec argument raises the same
    // (exact) message as before, parameterized only by the missing item.
    def requireSpec(arg: Any, what: String): Unit = {
      if ( arg == null )
        throw new IllegalArgumentException("No YAML file provided; and no project %s. Pipeline requires either YAML file, or full project specs.".format(what))
    }
    // if no yaml file we require all the mutually exclusive arguments
    if ( yamlFile == null ) {
      requireSpec(projectBams, "bam list")
      requireSpec(projectName, "name")
      requireSpec(projectRef, "reference")
      requireSpec(projectIntervals, "intervals")
      requireSpec(projectDBSNP, "DBSNP")
    } else {
      if ( projectBams != null || projectName != null || projectRef != null || projectIntervals != null || projectDBSNP != null )
        throw new IllegalArgumentException("YAML file provided, along with other project spec arguments. YAML file is mutually exclusive with project specs.")
    }
  }
}

View File

@ -15,22 +15,22 @@ class ProjectManagement(stingPath: String) {
var stingDirPath : String = stingPath
class PassFilterSites(vcf_files: List[File], out_list: File) extends CommandLineFunction {
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
@Output(doc="The file to write the site list to") var out_intervals = out_list
@Argument(doc="Path to the SortByRef script") var sortByRef: String = _
@Argument(doc="Path to the reference file on disk") var ref: File = _
def commandLine = {
"grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3}' | sort -n -k2,2 | uniq | perl %s - %s.fai | awk '{print $1\":\"$2}' > %s".format(
vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_list.getAbsolutePath
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
vcf_files(1).getAbsolutePath, out_list.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_list.getAbsolutePath
)
}
}
def MergeBatches( callVCFs: List[File], allBams: List[File], mergedVCF: File, ref: File) : List[CommandLineFunction] = {
var cmds : List[CommandLineFunction] = Nil
var pfSites : PassFilterSites = new PassFilterSites(callVCFs,swapExt(mergedVCF,".vcf",".pf.intervals.list"))
var pfSites : PassFilterAlleles = new PassFilterAlleles(callVCFs,swapExt(mergedVCF,".vcf",".pf.alleles.vcf"))
pfSites.sortByRef = pm.stingDirPath+"perl/sortByRef.pl"
pfSites.ref = ref
@ -46,12 +46,11 @@ class ProjectManagement(stingPath: String) {
}
def LikelihoodCalc( bam: File, ref: File, intervals: File ) : UGCalcLikelihoods = {
def LikelihoodCalc( bam: File, ref: File, alleleVCF: File ) : UGCalcLikelihoods = {
var calc = new UGCalcLikelihoods
calc.input_file :+= bam
calc.reference_sequence = ref
calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
calc.intervals :+= intervals
calc.downsample_to_coverage = Some(300)
calc.memoryLimit = Some(2)
calc.min_base_quality_score = Some(22)
@ -59,15 +58,18 @@ class ProjectManagement(stingPath: String) {
calc.genotype = true
calc.output_all_callable_bases = true
calc.out = swapExt(bam,".bam",".likelihoods.vcf")
calc.rodBind :+= new RodBind("allele","VCF",alleleVCF)
calc.BTI = "allele"
return calc
}
def VariantCallMerge( likelihoods: List[File], ref: File, intervals: File, output: File) : UGCallVariants = {
def VariantCallMerge( likelihoods: List[File], ref: File, intervalVCF: File, output: File) : UGCallVariants = {
var call = new UGCallVariants
call.reference_sequence = ref
call.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
call.intervals :+= intervals
call.rodBind :+= new RodBind("allele","vcf",intervalVCF)
call.BTI = "allele"
call.memoryLimit = Some(4)
call.out = output
call.rodBind ++= likelihoods.map( a => new RodBind("variant"+a.getName.replace(".vcf",""),"vcf",a) )

View File

@ -8,12 +8,14 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.queue.function.CommandLineFunction
class VariantCalling(yaml: File,gatkJar: File) {
class VariantCalling(attribs: Pipeline,gatkJar: File) {
vc =>
// load attributes
var attributes: Pipeline = YamlUtils.load(classOf[Pipeline], yaml)
var attributes = attribs
def this(yaml: File, gatkJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar)
/**
* Trait to propagate basic attributes throughout the library
*/

View File

@ -5,6 +5,10 @@ import java.io.File
import org.broadinstitute.sting.utils.GenomeLocParser
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.{PipelineSample, PipelineProject, Pipeline}
import org.broadinstitute.sting.utils.text.XReadLines
class PipelineUtils {
@ -52,4 +56,46 @@ object PipelineUtils{
splitContigs.map{case (size, contigs) => contigs}
}
/**
 * Builds the Pipeline described by the argument collection: the YAML file is
 * used when one was supplied, otherwise the Pipeline is assembled from the
 * individual project-spec arguments.
 */
def loadPipelineFromPAC(args: PipelineArgumentCollection) : Pipeline = {
  if ( args.yamlFile == null )
    loadPipelineFromSpec(args.projectName, args.projectRef, args.projectIntervals, args.projectDBSNP, args.projectBams)
  else
    YamlUtils.load(classOf[Pipeline], args.yamlFile)
}
/**
 * Assembles a Pipeline object directly from project properties, bypassing YAML.
 * Each line of pBamList describes one sample (parsed by bamSpecToSample).
 */
def loadPipelineFromSpec(name: String, ref: File, ivals: File, dbsnp: File, pBamList: File) : Pipeline = {
  // one sample spec per line of the bam list file
  val samples : List[PipelineSample] = ((new XReadLines(pBamList)).readLines).toList.map( bamSpecToSample )
  val project : PipelineProject = new PipelineProject
  project.setName(name)
  project.setReferenceFile(ref)
  project.setIntervalList(ivals)
  project.setDbsnpFile(dbsnp)
  val assembled : Pipeline = new Pipeline
  assembled.setProject(project)
  assembled.setSamples(samples)
  assembled
}
//todo -- find a better name for this function
/**
 * Parses one whitespace-separated sample spec line of the form
 *   sample_id k1:v1,k2:v2 bamTag1:/path/one.bam,bamTag2:/path/two.bam
 * into a PipelineSample. Values may themselves contain ':' (split limit 2),
 * and empty entries from stray commas are ignored in both the tag and bam
 * fields (previously only tags were filtered, so a trailing comma in the
 * bam field crashed the parse).
 */
def bamSpecToSample(spec: String) : PipelineSample = {
  val sam : PipelineSample = new PipelineSample
  val fields : Array[String] = spec.split("\\s")
  sam.setId(fields(0))
  // tags: comma-separated key:value pairs
  val tagMap : java.util.HashMap[String,String] = new java.util.HashMap[String,String]
  fields(1).split(",").filter( t => ! t.equals("") ).foreach { t =>
    val kv = t.split(":", 2)   // limit 2: values may contain ':'
    tagMap.put(kv(0), kv(1))
  }
  sam.setTags(tagMap)
  // bams: comma-separated tag:path pairs
  val bamMap : java.util.HashMap[String,File] = new java.util.HashMap[String,File]
  fields(2).split(",").filter( b => ! b.equals("") ).foreach { b =>
    val kv = b.split(":", 2)   // limit 2: paths may contain ':'
    bamMap.put(kv(0), new File(kv(1)))
  }
  sam.setBamFiles(bamMap)
  sam
}
}