From c9442e4b218aea5fa52ced2fce3ea1056b18d2ed Mon Sep 17 00:00:00 2001 From: carneiro Date: Fri, 18 Mar 2011 21:31:29 +0000 Subject: [PATCH] now merging bam files per sample and processing according to cleaning options. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5477 348d0f76-0448-11de-a6fe-93d51630548a --- .../oneoffs/carneiro/dataProcessingV2.scala | 83 +++++++++++++------ 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala index 38b726b41..7d85152a7 100755 --- a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala +++ b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala @@ -5,6 +5,9 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction +import net.sf.samtools.{SAMFileReader,SAMFileHeader,SAMReadGroupRecord} +import collection.JavaConversions._ + class dataProcessingV2 extends QScript { qscript => @@ -62,34 +65,66 @@ class dataProcessingV2 extends QScript { // Helpful variables val outName: String = qscript.outputDir + qscript.projectName - // BAM files generated by the pipeline - val joinedBams = new File(outName + ".join.bam") - val cleanedBam = new File(outName + ".clean.bam") - val dedupedBam = new File(outName + ".clean.dedup.bam") - val recalBam = new File(outName + ".clean.dedup.recal.bam") - - // Accessory files - val targetIntervals = new File(outName + ".intervals") - val metricsFile = new File(outName + ".metrics") - val preRecalFile = new File(outName + ".pre_recal.csv") - val postRecalFile = new File(outName + ".post_recal.csv") - val preOutPath = new File(outName + ".pre") - val postOutPath = new File(outName + ".post") //todo -- process bam headers to compile bamLists of samples. + var sampleTable = scala.collection.mutable.Map.empty[String, List[File]] + for (bam <- scala.io.Source.fromFile(input).getLines) { + val bamFile = new File(bam) + val samReader = new SAMFileReader(bamFile) + val header = samReader.getFileHeader() + val readGroup = header.getReadGroups() + for (rg <- readGroup) { + val sample = rg.getSample() + if (!sampleTable.contains(sample)) + sampleTable(sample) = List(bamFile) + else if ( !sampleTable(sample).contains(bamFile)) + sampleTable(sample) :+= bamFile + } + } + val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] + for ((sample, flist) <- sampleTable) { + val sampleFileName = new File(outName + "." + sample + ".bam") + sampleBamFiles(sample) = sampleFileName + add(joinBams(flist, sampleFileName)) + } - add(joinBams(input, joinedBams), - target(joinedBams, targetIntervals), - clean(joinedBams, targetIntervals, cleanedBam), - dedup(cleanedBam, dedupedBam, metricsFile), - cov(dedupedBam, preRecalFile), - recal(dedupedBam, preRecalFile, recalBam), - cov(recalBam, postRecalFile), - analyzeCovariates(preRecalFile, preOutPath), - analyzeCovariates(postRecalFile, postOutPath)) + println("\nFound the following samples (files created as necessary): ") + for ((sample, file) <- sampleBamFiles) + println("\t" + sample + " -> " + file) + + val globalIntervals = new File(outName + ".intervals") + if (knownsOnly) + add(target(null, globalIntervals)) + + for ((sample, bam) <- sampleBamFiles) { + + // BAM files generated by the pipeline + val cleanedBam = swapExt(bam, ".bam", ".clean.bam") + val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") + val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") + + // Accessory files + val targetIntervals = if (knownsOnly) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")} + val metricsFile = swapExt(bam, ".bam", ".metrics") + val preRecalFile = swapExt(bam, ".bam", ".pre_recal.csv") + val postRecalFile = swapExt(bam, ".bam", ".post_recal.csv") + val preOutPath = swapExt(bam, ".bam", ".pre") + val postOutPath = swapExt(bam, ".bam", ".post") + + if (!knownsOnly) + add(target(bam, targetIntervals)) + + add(clean(bam, targetIntervals, cleanedBam), + dedup(cleanedBam, dedupedBam, metricsFile), + cov(dedupedBam, preRecalFile), + recal(dedupedBam, preRecalFile, recalBam), + cov(recalBam, postRecalFile), + analyzeCovariates(preRecalFile, preOutPath), + analyzeCovariates(postRecalFile, postOutPath)) + } } // General arguments to all programs @@ -100,11 +135,11 @@ class dataProcessingV2 extends QScript { this.isIntermediate = true } - case class joinBams (inBams: File, outBam: File) extends PicardBamJarFunction { + case class joinBams (inBams: List[File], outBam: File) extends PicardBamJarFunction { @Input(doc="input bam list") var join = inBams @Output(doc="joined bam") var joined = outBam @Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai") - override def inputBams = List(join) + override def inputBams = join override def outputBam = joined override def commandLine = super.commandLine + " CREATE_INDEX=true" this.memoryLimit = Some(6)