data processing pipeline now has on the fly bam indexing (powered by Matt) some new parameters, Indel Cleaning with constrain movement and fixMates is gone.

setting up methods development pipeline for some cosmetic changes. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5277 348d0f76-0448-11de-a6fe-93d51630548a
2011-02-18 23:13:54 +00:00 · 2011-02-18 23:13:54 +00:00 · c61dd2f09f
parent 290afae047
commit c61dd2f09f
2 changed files with 67 additions and 59 deletions
--- a/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala
+++ b/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala
@ -1,7 +1,10 @@
+import org.broadinstitute.sting.commandline.ArgumentSource
 import org.broadinstitute.sting.gatk.CommandLineGATK
 import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
+import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
+

 class MethodsDevelopmentCallingPipeline extends QScript {
  qscript =>
@ -12,13 +15,13 @@ class MethodsDevelopmentCallingPipeline extends QScript {
  @Argument(shortName="outputDir", doc="output directory", required=true)
  var outputDir: String = "./"

-  @Argument(shortName="skipCalling", doc="If true, skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
+  @Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
  var skipCalling: Boolean = false

  @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
  var datasets: List[String] = Nil

-  @Argument(shortName="skipGoldStandard", doc="runs the pipeline with the goldstandard VCF files for comparison", required=false)
+  @Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false)
  var skipGoldStandard: Boolean = false

  @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
@ -30,7 +33,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
  @Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false)
  var eval: Boolean = false

-  @Argument(shortName="noCut", doc="adds the ApplyVariantCut walker to the pipeline", required=false)
+  @Argument(shortName="noCut", doc="removes the ApplyVariantCut walker from the pipeline", required=false)
  var noCut: Boolean = false

  @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false)
@ -183,10 +186,25 @@ class MethodsDevelopmentCallingPipeline extends QScript {
    this.out = t.rawVCF
    this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE})
    this.analysisName = t.name + "_UG"
-    if (t.dbsnpFile.endsWith(".rod"))
-      this.DBSNP = new File(t.dbsnpFile)
-    else if (t.dbsnpFile.endsWith(".vcf"))
-      this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
+    if (t.dbsnpFile.endsWith(".rod")) this.DBSNP = new File(t.dbsnpFile)
+    else if (t.dbsnpFile.endsWith(".vcf")) this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
+/*
+    this.setupScatterFunction = {
+      case scatter: ScatterFunction =>
+        scatter.commandDirectory = new File("UG/ScatterGather")
+        scatter.jobOutputFile = new File(".queue/UG/ScatterGather/Scatter.out")
+    }
+    this.setupCloneFunction = {
+      case (clone: CloneFunction, index: Int) =>
+        clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
+        clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
+    }
+    this.setupGatherFunction = {
+      case (gather: GatherFunction, source: ArgumentSource) =>
+        gather.commandDirectory = new File("UG/ScatterGather/Gather_%s".format(source.field.getName))
+        gather.jobOutputFile = new File(".queue/UG/ScatterGather/Gather_%s.out".format(source.field.getName))
+    }
+*/
  }

  // 2.) Filter SNPs
--- a/scala/qscript/oneoffs/carneiro/dataProcessing.scala
+++ b/scala/qscript/oneoffs/carneiro/dataProcessing.scala
@ -9,27 +9,31 @@ import scala.io.Source
 class dataProcessing extends QScript {
  qscript =>

-  @Input(doc="path to GATK jar", shortName="gatk", required=true)
+  @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
  var GATKjar: File = _

  @Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
  var ACJar: File = _

-  // todo -- we should support the standard GATK arguments -R, -D [dbsnp],
-  // todo -- and indel files.  Those should be defaulted to hg19 but providable on command line
-  @Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
+  @Input(doc="path to Picard's MarkDuplicates.jar", shortName="dedup", required=true)
+  var dedupJar: File = _
+
+  @Input(doc="path to R resources folder inside the Sting repository", shortName="r", required=true)
  var R: String = _

-  @Input(doc="path to Picard FixMateInformation.jar.  See http://picard.sourceforge.net/ .", shortName="fixmates", required=false)
-  var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
-
-  @Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
-  var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
-
-  @Input(doc="input BAM file", shortName="i", required=true)
+  @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
  var input: String = _

-  @Input(doc="final output BAM file base name", shortName="p", required=false)
+  @Input(doc="Reference fasta file", shortName="R", required=false)
+  var reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
+
+  @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)     // todo -- accept any format. Not only VCF.
+  val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
+
+  @Input(doc="extra VCF files to use as reference indels for Indel Realignment", shortName="indels", required=false)  //todo -- once vcfs are merged, this will become the only indel vcf to be used and the merged file will be the default.
+  val indels: File = _
+
+  @Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", shortName="p", required=false)
  var projectName: String = "combined"

  @Input(doc="output path", shortName="outputDir", required=false)
@ -43,11 +47,7 @@ class dataProcessing extends QScript {
  var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")


-  // Reference sequence, dbsnps and RODs used by the pipeline
-  val reference: File          = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
-  val dbSNP: File              = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
-
-  // TODO -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
+  // todo -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
  val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
  val dindelAFRCalls: String   = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
  val dindelASNCalls: String   = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
@ -60,7 +60,7 @@ class dataProcessing extends QScript {
  // General arguments to all programs
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.jarFile = qscript.GATKjar
-    this.reference_sequence = reference
+    this.reference_sequence = qscript.reference
    this.memoryLimit = Some(4)
    this.isIntermediate = true
  }
@ -70,16 +70,14 @@ class dataProcessing extends QScript {

    var perLaneBamList: List[String] = Nil
    var recalibratedBamList: List[File] = Nil
-    var recalibratedBamIndexList: List[File] = Nil

    // Populates the list of per lane bam files to process (single bam or list of bams).
-    if (input.endsWith("bam"))  {
+    if (input.endsWith("bam"))
      perLaneBamList :+= input
-    }
-    else {
+    else
      for (bam <- Source.fromFile(input).getLines())
        perLaneBamList :+= bam
-    }
+



@ -90,10 +88,9 @@ class dataProcessing extends QScript {
      val baseDir: String         = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)

      // BAM files generated by the pipeline
-      val cleanedBam: String      = baseName + ".cleaned.bam"
-      val fixedBam: String        = baseName + ".cleaned.fixed.bam"
-      val dedupedBam: String      = baseName + ".cleaned.fixed.dedup.bam"
-      val recalBam: String        = baseName + ".cleaned.fixed.dedup.recal.bam"
+      val cleanedBam: String      = baseName + ".clean.bam"
+      val dedupedBam: String      = baseName + ".clean.dedup.bam"
+      val recalBam: String        = baseName + ".clean.dedup.recal.bam"

      // Accessory files
      val targetIntervals: String = baseName + ".indel.intervals"
@ -104,19 +101,15 @@ class dataProcessing extends QScript {
      val postOutPath: String     = baseName + ".post"

      add(new target(perLaneBam, targetIntervals),
-          new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly), // todo -- use constrained movement mode to skip this
-          new fixMates(cleanedBam, fixedBam, intermediate), // todo -- use constrained movement mode to skip this
-          new dedup(fixedBam, dedupedBam, metricsFile),     // todo -- generate index on fly here
-          new index(dedupedBam), // todo -- remove for on the fly index
+          new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly, intermediate),
+          new dedup(cleanedBam, dedupedBam, metricsFile),
          new cov(dedupedBam, preRecalFile),
-          new recal(dedupedBam, preRecalFile, recalBam), // todo -- use GATK on the fly indexing?
-          new index(recalBam), // todo remove for on the fly indexing
+          new recal(dedupedBam, preRecalFile, recalBam),
          new cov(recalBam, postRecalFile),
          new analyzeCovariates(preRecalFile, preOutPath),
          new analyzeCovariates(postRecalFile, postOutPath))

      recalibratedBamList :+= new File(recalBam)
-      recalibratedBamIndexList :+= new File(recalBam + ".bai")  // to hold next process by this dependency
    }

    // Helpful variables
@ -125,16 +118,15 @@ class dataProcessing extends QScript {

    // BAM files generated by the pipeline
    val bamList: String         = outDir + outName + ".list"
-    val cleanedBam: String      = outDir + outName + ".cleaned.bam"
-    val fixedBam: String        = outDir + outName + ".final.bam"
+    val cleanedBam: String      = outDir + outName + ".clean.bam"
+    val fixedBam: String        = outDir + outName + ".processed.bam"

    // Accessory files
    val targetIntervals: String = outDir + outName + ".indel.intervals"

-    add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList),
-        new target(bamList, targetIntervals),
-        new clean(bamList, targetIntervals, cleanedBam, !knownsOnly), // todo -- use constrained movement mode to skip fix mates
-        new fixMates(cleanedBam, fixedBam, !intermediate)) // todo -- use constrained movement mode to skip this
+    add(new writeList(recalibratedBamList, bamList),
+        new target(bamList, targetIntervals),       // todo -- reuse previously generated intervals (see how to do that)
+        new clean(bamList, targetIntervals, cleanedBam, !knownsOnly, !intermediate))
  }

  class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
@ -146,12 +138,13 @@ class dataProcessing extends QScript {
      this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
      this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
      this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
+      if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
      this.jobName = inBams + ".tgt"
      if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
      else this.intervals :+= qscript.intervals
  }

-  class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs {
+  class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean, intermediate: Boolean) extends IndelRealigner with CommandLineGATKArgs {
    this.input_file :+= new File(inBams)
    this.targetIntervals = new File(tIntervals)
    this.out = new File(outBam)
@ -162,8 +155,11 @@ class dataProcessing extends QScript {
    this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
    this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
    this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
+    if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
    this.useOnlyKnownIndels = knownsOnly
    this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
+    this.constrainMovement = true
+    this.isIntermediate = intermediate
    this.jobName = inBams + ".clean"
    if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
    else this.intervals :+= qscript.intervals
@ -174,6 +170,7 @@ class dataProcessing extends QScript {
    @Output(doc="fixed bam") var fixed: File = new File(outBam)
    override def inputBams = List(cleaned)
    override def outputBam = fixed
+    override def commandLine = super.commandLine + " CREATE_INDEX=true"
    this.jarFile = qscript.fixMatesJar
    this.isIntermediate = intermediate
    this.memoryLimit = Some(6)
@ -185,21 +182,14 @@ class dataProcessing extends QScript {
    @Output(doc="deduped bam") var deduped: File = new File(outBam)
    override def inputBams = List(clean)
    override def outputBam = deduped
-    override def commandLine = super.commandLine + " M=" + metricsFile
+    override def commandLine = super.commandLine + " M=" + metricsFile + " CREATE_INDEX=true"
    sortOrder = null
    this.memoryLimit = Some(6)
    this.jarFile = qscript.dedupJar
+    this.isIntermediate = true
    this.jobName = inBam + ".dedup"
  }

-  // todo -- may we should use the picard version instead?  What about telling all of the picard tools to
-  // todo -- generate BAM indices on the fly?  That would be even better
-  class index (inBam: String) extends SamtoolsIndexFunction {
-    @Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
-    this.bamFile = new File(inBam)
-    this.analysisName = inBam + ".index"
-  }
-
  class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
    this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
@ -211,6 +201,7 @@ class dataProcessing extends QScript {
    this.input_file :+= new File (inBam)
    this.recal_file = new File(inRecalFile)
    this.out = new File(outBam)
+    this.index_output_bam_on_the_fly = Some(true)
  }

  class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
@ -220,8 +211,7 @@ class dataProcessing extends QScript {
    this.output_dir = outPath
  }

-  class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction {
-    @Input(doc="bam indexes") var indexes: List[File] = depIndices   // I need this dependency to hold creation of the list until all indices are done
+  class writeList(inBams: List[File], outBamList: String) extends ListWriterFunction {
    this.inputFiles = inBams
    this.listFile = new File(outBamList)
    this.jobName = "bamList"