From c9442e4b218aea5fa52ced2fce3ea1056b18d2ed Mon Sep 17 00:00:00 2001
From: carneiro <carneiro@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 18 Mar 2011 21:31:29 +0000
Subject: [PATCH] now merging bam files per sample and processing according to
 cleaning options.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5477 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../oneoffs/carneiro/dataProcessingV2.scala   | 83 +++++++++++++------
 1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala
index 38b726b41..7d85152a7 100755
--- a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala
+++ b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala
@@ -5,6 +5,9 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.queue.function.ListWriterFunction
 
+import net.sf.samtools.{SAMFileReader,SAMFileHeader,SAMReadGroupRecord}
+import collection.JavaConversions._
+
 
 class dataProcessingV2 extends QScript {
   qscript =>
@@ -62,34 +65,66 @@ class dataProcessingV2 extends QScript {
     // Helpful variables
     val outName: String         = qscript.outputDir + qscript.projectName
     
-    // BAM files generated by the pipeline
-    val joinedBams = new File(outName + ".join.bam")
-    val cleanedBam = new File(outName + ".clean.bam")
-    val dedupedBam = new File(outName + ".clean.dedup.bam")
-    val recalBam   = new File(outName + ".clean.dedup.recal.bam")
-
-    // Accessory files
-    val targetIntervals = new File(outName + ".intervals")
-    val metricsFile     = new File(outName + ".metrics")
-    val preRecalFile    = new File(outName + ".pre_recal.csv")
-    val postRecalFile   = new File(outName + ".post_recal.csv")
-    val preOutPath      = new File(outName + ".pre")
-    val postOutPath     = new File(outName + ".post")
 
 
     //todo -- process bam headers to compile bamLists of samples.
+    var sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
+    for (bam <- scala.io.Source.fromFile(input).getLines) {
+      val bamFile = new File(bam)
+      val samReader = new SAMFileReader(bamFile)
+      val header = samReader.getFileHeader()
+      val readGroup = header.getReadGroups()
+      for (rg <- readGroup) {
+        val sample = rg.getSample()
+        if (!sampleTable.contains(sample))
+          sampleTable(sample) = List(bamFile)
+        else if ( !sampleTable(sample).contains(bamFile))
+          sampleTable(sample) :+= bamFile
+      }
+    }
 
+    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
 
+    for ((sample, flist) <- sampleTable) {
+      val sampleFileName = new File(outName + "." + sample + ".bam")
+      sampleBamFiles(sample) = sampleFileName
+      add(joinBams(flist, sampleFileName))
+    }
 
-    add(joinBams(input, joinedBams),
-        target(joinedBams, targetIntervals),
-        clean(joinedBams, targetIntervals, cleanedBam),
-        dedup(cleanedBam, dedupedBam, metricsFile),
-        cov(dedupedBam, preRecalFile),
-        recal(dedupedBam, preRecalFile, recalBam),
-        cov(recalBam, postRecalFile),
-        analyzeCovariates(preRecalFile, preOutPath),
-        analyzeCovariates(postRecalFile, postOutPath))
+    println("\nFound the following samples (files created as necessary): ")
+    for ((sample, file) <- sampleBamFiles)
+      println("\t" + sample + " -> " + file)
+
+    val globalIntervals = new File(outName + ".intervals")
+    if (knownsOnly)
+      add(target(null, globalIntervals))
+
+    for ((sample, bam) <- sampleBamFiles) {
+
+      // BAM files generated by the pipeline
+      val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
+      val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
+      val recalBam   = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
+
+      // Accessory files
+      val targetIntervals = if (knownsOnly) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
+      val metricsFile     = swapExt(bam, ".bam", ".metrics")
+      val preRecalFile    = swapExt(bam, ".bam", ".pre_recal.csv")
+      val postRecalFile   = swapExt(bam, ".bam", ".post_recal.csv")
+      val preOutPath      = swapExt(bam, ".bam", ".pre")
+      val postOutPath     = swapExt(bam, ".bam", ".post")
+
+      if (!knownsOnly)
+        add(target(bam, targetIntervals))
+
+      add(clean(bam, targetIntervals, cleanedBam),
+          dedup(cleanedBam, dedupedBam, metricsFile),
+          cov(dedupedBam, preRecalFile),
+          recal(dedupedBam, preRecalFile, recalBam),
+          cov(recalBam, postRecalFile),
+          analyzeCovariates(preRecalFile, preOutPath),
+          analyzeCovariates(postRecalFile, postOutPath))
+    }
   }
 
   // General arguments to all programs
@@ -100,11 +135,11 @@ class dataProcessingV2 extends QScript {
     this.isIntermediate = true
   }
 
-  case class joinBams (inBams: File, outBam: File) extends PicardBamJarFunction {
+  case class joinBams (inBams: List[File], outBam: File) extends PicardBamJarFunction {
     @Input(doc="input bam list") var join = inBams
     @Output(doc="joined bam") var joined = outBam
     @Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai")
-    override def inputBams = List(join)
+    override def inputBams = join
     override def outputBam = joined
     override def commandLine = super.commandLine + " CREATE_INDEX=true"
     this.memoryLimit = Some(6)