Now merging BAM files per sample and processing according to cleaning options.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5477 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-18 21:31:29 +00:00
parent 1c95208e26
commit c9442e4b21
1 changed file with 59 additions and 24 deletions

View File

@ -5,6 +5,9 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import net.sf.samtools.{SAMFileReader,SAMFileHeader,SAMReadGroupRecord}
import collection.JavaConversions._
class dataProcessingV2 extends QScript {
qscript =>
@ -62,34 +65,66 @@ class dataProcessingV2 extends QScript {
// Helpful variables
val outName: String = qscript.outputDir + qscript.projectName
// BAM files generated by the pipeline
val joinedBams = new File(outName + ".join.bam")
val cleanedBam = new File(outName + ".clean.bam")
val dedupedBam = new File(outName + ".clean.dedup.bam")
val recalBam = new File(outName + ".clean.dedup.recal.bam")
// Accessory files
val targetIntervals = new File(outName + ".intervals")
val metricsFile = new File(outName + ".metrics")
val preRecalFile = new File(outName + ".pre_recal.csv")
val postRecalFile = new File(outName + ".post_recal.csv")
val preOutPath = new File(outName + ".pre")
val postOutPath = new File(outName + ".post")
//todo -- process bam headers to compile bamLists of samples.
var sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
for (bam <- scala.io.Source.fromFile(input).getLines) {
val bamFile = new File(bam)
val samReader = new SAMFileReader(bamFile)
val header = samReader.getFileHeader()
val readGroup = header.getReadGroups()
for (rg <- readGroup) {
val sample = rg.getSample()
if (!sampleTable.contains(sample))
sampleTable(sample) = List(bamFile)
else if ( !sampleTable(sample).contains(bamFile))
sampleTable(sample) :+= bamFile
}
}
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
val sampleFileName = new File(outName + "." + sample + ".bam")
sampleBamFiles(sample) = sampleFileName
add(joinBams(flist, sampleFileName))
}
add(joinBams(input, joinedBams),
target(joinedBams, targetIntervals),
clean(joinedBams, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
cov(recalBam, postRecalFile),
analyzeCovariates(preRecalFile, preOutPath),
analyzeCovariates(postRecalFile, postOutPath))
println("\nFound the following samples (files created as necessary): ")
for ((sample, file) <- sampleBamFiles)
println("\t" + sample + " -> " + file)
val globalIntervals = new File(outName + ".intervals")
if (knownsOnly)
add(target(null, globalIntervals))
for ((sample, bam) <- sampleBamFiles) {
// BAM files generated by the pipeline
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
// Accessory files
val targetIntervals = if (knownsOnly) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
val metricsFile = swapExt(bam, ".bam", ".metrics")
val preRecalFile = swapExt(bam, ".bam", ".pre_recal.csv")
val postRecalFile = swapExt(bam, ".bam", ".post_recal.csv")
val preOutPath = swapExt(bam, ".bam", ".pre")
val postOutPath = swapExt(bam, ".bam", ".post")
if (!knownsOnly)
add(target(bam, targetIntervals))
add(clean(bam, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
cov(recalBam, postRecalFile),
analyzeCovariates(preRecalFile, preOutPath),
analyzeCovariates(postRecalFile, postOutPath))
}
}
// General arguments to all programs
@ -100,11 +135,11 @@ class dataProcessingV2 extends QScript {
this.isIntermediate = true
}
case class joinBams (inBams: File, outBam: File) extends PicardBamJarFunction {
case class joinBams (inBams: List[File], outBam: File) extends PicardBamJarFunction {
@Input(doc="input bam list") var join = inBams
@Output(doc="joined bam") var joined = outBam
@Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai")
override def inputBams = List(join)
override def inputBams = join
override def outputBam = joined
override def commandLine = super.commandLine + " CREATE_INDEX=true"
this.memoryLimit = Some(6)