now merging bam files per sample and processing according to cleaning options.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5477 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-18 21:31:29 +00:00 · 2011-03-18 21:31:29 +00:00 · c9442e4b21
parent 1c95208e26
commit c9442e4b21
1 changed files with 59 additions and 24 deletions
--- a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala
+++ b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala
@ -5,6 +5,9 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.queue.function.ListWriterFunction

+import net.sf.samtools.{SAMFileReader,SAMFileHeader,SAMReadGroupRecord}
+import collection.JavaConversions._
+

 class dataProcessingV2 extends QScript {
  qscript =>
@ -62,34 +65,66 @@ class dataProcessingV2 extends QScript {
    // Helpful variables
    val outName: String         = qscript.outputDir + qscript.projectName
    
-    // BAM files generated by the pipeline
-    val joinedBams = new File(outName + ".join.bam")
-    val cleanedBam = new File(outName + ".clean.bam")
-    val dedupedBam = new File(outName + ".clean.dedup.bam")
-    val recalBam   = new File(outName + ".clean.dedup.recal.bam")
-
-    // Accessory files
-    val targetIntervals = new File(outName + ".intervals")
-    val metricsFile     = new File(outName + ".metrics")
-    val preRecalFile    = new File(outName + ".pre_recal.csv")
-    val postRecalFile   = new File(outName + ".post_recal.csv")
-    val preOutPath      = new File(outName + ".pre")
-    val postOutPath     = new File(outName + ".post")


    //todo -- process bam headers to compile bamLists of samples.
+    var sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
+    for (bam <- scala.io.Source.fromFile(input).getLines) {
+      val bamFile = new File(bam)
+      val samReader = new SAMFileReader(bamFile)
+      val header = samReader.getFileHeader()
+      val readGroup = header.getReadGroups()
+      for (rg <- readGroup) {
+        val sample = rg.getSample()
+        if (!sampleTable.contains(sample))
+          sampleTable(sample) = List(bamFile)
+        else if ( !sampleTable(sample).contains(bamFile))
+          sampleTable(sample) :+= bamFile
+      }
+    }

+    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]

+    for ((sample, flist) <- sampleTable) {
+      val sampleFileName = new File(outName + "." + sample + ".bam")
+      sampleBamFiles(sample) = sampleFileName
+      add(joinBams(flist, sampleFileName))
+    }

-    add(joinBams(input, joinedBams),
-        target(joinedBams, targetIntervals),
-        clean(joinedBams, targetIntervals, cleanedBam),
-        dedup(cleanedBam, dedupedBam, metricsFile),
-        cov(dedupedBam, preRecalFile),
-        recal(dedupedBam, preRecalFile, recalBam),
-        cov(recalBam, postRecalFile),
-        analyzeCovariates(preRecalFile, preOutPath),
-        analyzeCovariates(postRecalFile, postOutPath))
+    println("\nFound the following samples (files created as necessary): ")
+    for ((sample, file) <- sampleBamFiles)
+      println("\t" + sample + " -> " + file)
+
+    val globalIntervals = new File(outName + ".intervals")
+    if (knownsOnly)
+      add(target(null, globalIntervals))
+
+    for ((sample, bam) <- sampleBamFiles) {
+
+      // BAM files generated by the pipeline
+      val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
+      val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
+      val recalBam   = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
+
+      // Accessory files
+      val targetIntervals = if (knownsOnly) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
+      val metricsFile     = swapExt(bam, ".bam", ".metrics")
+      val preRecalFile    = swapExt(bam, ".bam", ".pre_recal.csv")
+      val postRecalFile   = swapExt(bam, ".bam", ".post_recal.csv")
+      val preOutPath      = swapExt(bam, ".bam", ".pre")
+      val postOutPath     = swapExt(bam, ".bam", ".post")
+
+      if (!knownsOnly)
+        add(target(bam, targetIntervals))
+
+      add(clean(bam, targetIntervals, cleanedBam),
+          dedup(cleanedBam, dedupedBam, metricsFile),
+          cov(dedupedBam, preRecalFile),
+          recal(dedupedBam, preRecalFile, recalBam),
+          cov(recalBam, postRecalFile),
+          analyzeCovariates(preRecalFile, preOutPath),
+          analyzeCovariates(postRecalFile, postOutPath))
+    }
  }

  // General arguments to all programs
@ -100,11 +135,11 @@ class dataProcessingV2 extends QScript {
    this.isIntermediate = true
  }

-  case class joinBams (inBams: File, outBam: File) extends PicardBamJarFunction {
+  case class joinBams (inBams: List[File], outBam: File) extends PicardBamJarFunction {
    @Input(doc="input bam list") var join = inBams
    @Output(doc="joined bam") var joined = outBam
    @Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai")
-    override def inputBams = List(join)
+    override def inputBams = join
    override def outputBam = joined
    override def commandLine = super.commandLine + " CREATE_INDEX=true"
    this.memoryLimit = Some(6)