Fixed list dependency

Instead of creating a BAM list file, I dynamically create a Scala list and pass it as a parameter. This way the intermediate BAM files don't get deleted before they should.
This commit is contained in:
Mauricio Carneiro 2011-08-24 11:10:40 -04:00
parent 219252a566
commit cd12f7f286
1 changed file with 36 additions and 40 deletions

View File

@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript {
// Because the realignment only happens after these scripts are executed, in case you are using
// bwa realignment, this function will operate over the original bam files and output over the
// (to be realigned) bam files.
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = {
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = {
// Creating a table with SAMPLE information from each input BAM file
val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript {
sampleTable(sample) :+= rBam
}
}
return sampleTable.toMap
println("\n\n*** INPUT FILES ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
println(sample + ":")
for (f <- flist)
println (f)
println()
val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
sampleBamFiles(sample) = sampleFileName
add(writeList(flist, sampleFileName))
}
println("*** INPUT FILES ***\n\n")
return sampleBamFiles.toMap
// println("\n\n*** INPUT FILES ***\n")
// // Creating one file for each sample in the dataset
// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
// for ((sample, flist) <- sampleTable) {
//
// println(sample + ":")
// for (f <- flist)
// println (f)
// println()
//
// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
// sampleBamFiles(sample) = sampleFileName
// //add(writeList(flist, sampleFileName))
// }
// println("*** INPUT FILES ***\n\n")
//
// return sampleBamFiles.toMap
}
// Rebuilds the Read Group string to give BWA
@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript {
def script = {
// final output list of processed bam files
var cohortList: List[File] = List()
// sets the model for the Indel Realigner
cleanModelEnum = getIndelCleaningModel()
// keep a record of the number of contigs in the first bam file in the list
@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript {
val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}
// Generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs)
// generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)
// Final output list of processed bam files
var cohortList: List[File] = List()
// Simple progress report
println("\nFound the following samples: ")
for ((sample, file) <- sampleBAMFiles)
println("\t" + sample + " -> " + file)
println("\n")
// If this is a 'knowns only' indel realignment run, do it only once for all samples.
// if this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")
if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(null, globalIntervals))
// Put each sample through the pipeline
for ((sample, sampleFile) <- sampleBAMFiles) {
val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile}
// put each sample through the pipeline
for ((sample, bamList) <- sampleBAMFiles) {
// BAM files generated by the pipeline
val bam = new File(qscript.projectName + "." + sample + ".bam")
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript {
// Validation is an optional step for the BAM file generated after
// alignment and the final bam file of the pipeline.
if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files
if (!noValidation) {
for (sampleFile <- bamList)
add(validate(sampleFile, preValidateLog),
validate(recalBam, postValidateLog))
}
if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(sampleFile, targetIntervals))
add(target(bamList, targetIntervals))
add(clean(sampleFile, targetIntervals, cleanedBam),
add(clean(bamList, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript {
this.maxRecordsInRam = 100000
}
case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
this.input_file :+= inBams
this.input_file = inBams
this.out = outIntervals
this.mismatchFraction = 0.0
this.known :+= qscript.dbSNP
@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outIntervals + ".target"
}
case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= inBams
case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file = inBams
this.targetIntervals = tIntervals
this.out = outBam
this.known :+= qscript.dbSNP