now merging bam files per sample and processing according to cleaning options.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5477 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1c95208e26
commit
c9442e4b21
|
|
@ -5,6 +5,9 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
|||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
||||
|
||||
import net.sf.samtools.{SAMFileReader,SAMFileHeader,SAMReadGroupRecord}
|
||||
import collection.JavaConversions._
|
||||
|
||||
|
||||
class dataProcessingV2 extends QScript {
|
||||
qscript =>
|
||||
|
|
@ -62,34 +65,66 @@ class dataProcessingV2 extends QScript {
|
|||
// Helpful variables
|
||||
val outName: String = qscript.outputDir + qscript.projectName
|
||||
|
||||
// BAM files generated by the pipeline
|
||||
val joinedBams = new File(outName + ".join.bam")
|
||||
val cleanedBam = new File(outName + ".clean.bam")
|
||||
val dedupedBam = new File(outName + ".clean.dedup.bam")
|
||||
val recalBam = new File(outName + ".clean.dedup.recal.bam")
|
||||
|
||||
// Accessory files
|
||||
val targetIntervals = new File(outName + ".intervals")
|
||||
val metricsFile = new File(outName + ".metrics")
|
||||
val preRecalFile = new File(outName + ".pre_recal.csv")
|
||||
val postRecalFile = new File(outName + ".post_recal.csv")
|
||||
val preOutPath = new File(outName + ".pre")
|
||||
val postOutPath = new File(outName + ".post")
|
||||
|
||||
|
||||
//todo -- process bam headers to compile bamLists of samples.
|
||||
var sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
|
||||
for (bam <- scala.io.Source.fromFile(input).getLines) {
|
||||
val bamFile = new File(bam)
|
||||
val samReader = new SAMFileReader(bamFile)
|
||||
val header = samReader.getFileHeader()
|
||||
val readGroup = header.getReadGroups()
|
||||
for (rg <- readGroup) {
|
||||
val sample = rg.getSample()
|
||||
if (!sampleTable.contains(sample))
|
||||
sampleTable(sample) = List(bamFile)
|
||||
else if ( !sampleTable(sample).contains(bamFile))
|
||||
sampleTable(sample) :+= bamFile
|
||||
}
|
||||
}
|
||||
|
||||
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
|
||||
|
||||
for ((sample, flist) <- sampleTable) {
|
||||
val sampleFileName = new File(outName + "." + sample + ".bam")
|
||||
sampleBamFiles(sample) = sampleFileName
|
||||
add(joinBams(flist, sampleFileName))
|
||||
}
|
||||
|
||||
add(joinBams(input, joinedBams),
|
||||
target(joinedBams, targetIntervals),
|
||||
clean(joinedBams, targetIntervals, cleanedBam),
|
||||
dedup(cleanedBam, dedupedBam, metricsFile),
|
||||
cov(dedupedBam, preRecalFile),
|
||||
recal(dedupedBam, preRecalFile, recalBam),
|
||||
cov(recalBam, postRecalFile),
|
||||
analyzeCovariates(preRecalFile, preOutPath),
|
||||
analyzeCovariates(postRecalFile, postOutPath))
|
||||
println("\nFound the following samples (files created as necessary): ")
|
||||
for ((sample, file) <- sampleBamFiles)
|
||||
println("\t" + sample + " -> " + file)
|
||||
|
||||
val globalIntervals = new File(outName + ".intervals")
|
||||
if (knownsOnly)
|
||||
add(target(null, globalIntervals))
|
||||
|
||||
for ((sample, bam) <- sampleBamFiles) {
|
||||
|
||||
// BAM files generated by the pipeline
|
||||
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
|
||||
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
|
||||
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
|
||||
|
||||
// Accessory files
|
||||
val targetIntervals = if (knownsOnly) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
|
||||
val metricsFile = swapExt(bam, ".bam", ".metrics")
|
||||
val preRecalFile = swapExt(bam, ".bam", ".pre_recal.csv")
|
||||
val postRecalFile = swapExt(bam, ".bam", ".post_recal.csv")
|
||||
val preOutPath = swapExt(bam, ".bam", ".pre")
|
||||
val postOutPath = swapExt(bam, ".bam", ".post")
|
||||
|
||||
if (!knownsOnly)
|
||||
add(target(bam, targetIntervals))
|
||||
|
||||
add(clean(bam, targetIntervals, cleanedBam),
|
||||
dedup(cleanedBam, dedupedBam, metricsFile),
|
||||
cov(dedupedBam, preRecalFile),
|
||||
recal(dedupedBam, preRecalFile, recalBam),
|
||||
cov(recalBam, postRecalFile),
|
||||
analyzeCovariates(preRecalFile, preOutPath),
|
||||
analyzeCovariates(postRecalFile, postOutPath))
|
||||
}
|
||||
}
|
||||
|
||||
// General arguments to all programs
|
||||
|
|
@ -100,11 +135,11 @@ class dataProcessingV2 extends QScript {
|
|||
this.isIntermediate = true
|
||||
}
|
||||
|
||||
case class joinBams (inBams: File, outBam: File) extends PicardBamJarFunction {
|
||||
case class joinBams (inBams: List[File], outBam: File) extends PicardBamJarFunction {
|
||||
@Input(doc="input bam list") var join = inBams
|
||||
@Output(doc="joined bam") var joined = outBam
|
||||
@Output(doc="joined bam index") var joinedIndex = new File(outBam + "bai")
|
||||
override def inputBams = List(join)
|
||||
override def inputBams = join
|
||||
override def outputBam = joined
|
||||
override def commandLine = super.commandLine + " CREATE_INDEX=true"
|
||||
this.memoryLimit = Some(6)
|
||||
|
|
|
|||
Loading…
Reference in New Issue