diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 724518142..2417e5620 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript { // Because the realignment only happens after these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = { + def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = { // Creating a table with SAMPLE information from each input BAM file val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] @@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript { sampleTable(sample) :+= rBam } } + return sampleTable.toMap - println("\n\n*** INPUT FILES ***\n") - // Creating one file for each sample in the dataset - val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] - for ((sample, flist) <- sampleTable) { - - println(sample + ":") - for (f <- flist) - println (f) - println() - - val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") - sampleBamFiles(sample) = sampleFileName - add(writeList(flist, sampleFileName)) - } - println("*** INPUT FILES ***\n\n") - - return sampleBamFiles.toMap +// println("\n\n*** INPUT FILES ***\n") +// // Creating one file for each sample in the dataset +// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] +// for ((sample, flist) <- sampleTable) { +// +// println(sample + ":") +// for (f <- flist) +// println (f) +// println() +// +// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") +// sampleBamFiles(sample) = sampleFileName +// //add(writeList(flist, sampleFileName)) +// } +// println("*** INPUT FILES ***\n\n") +// +// return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA @@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript { def script = { + // final output list of processed bam files + var cohortList: List[File] = List() + // sets the model for the Indel Realigner cleanModelEnum = getIndelCleaningModel() // keep a record of the number of contigs in the first bam file in the list @@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript { val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)} - // Generate a BAM file per sample joining all per lane files if necessary - val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs) + // generate a BAM file per sample joining all per lane files if necessary + val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs) - // Final output list of processed bam files - var cohortList: List[File] = List() - - // Simple progress report - println("\nFound the following samples: ") - for ((sample, file) <- sampleBAMFiles) - println("\t" + sample + " -> " + file) - println("\n") - - // If this is a 'knowns only' indel realignment run, do it only once for all samples. + // if this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) add(target(null, globalIntervals)) - // Put each sample through the pipeline - for ((sample, sampleFile) <- sampleBAMFiles) { - val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile} + // put each sample through the pipeline + for ((sample, bamList) <- sampleBAMFiles) { // BAM files generated by the pipeline + val bam = new File(qscript.projectName + "." + sample + ".bam") val cleanedBam = swapExt(bam, ".bam", ".clean.bam") val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") @@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript { // Validation is an optional step for the BAM file generated after // alignment and the final bam file of the pipeline. - if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files + if (!noValidation) { + for (sampleFile <- bamList) add(validate(sampleFile, preValidateLog), validate(recalBam, postValidateLog)) } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(sampleFile, targetIntervals)) + add(target(bamList, targetIntervals)) - add(clean(sampleFile, targetIntervals, cleanedBam), + add(clean(bamList, targetIntervals, cleanedBam), dedup(cleanedBam, dedupedBam, metricsFile), cov(dedupedBam, preRecalFile), recal(dedupedBam, preRecalFile, recalBam), @@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript { this.maxRecordsInRam = 100000 } - case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { + case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) - this.input_file :+= inBams + this.input_file = inBams this.out = outIntervals this.mismatchFraction = 0.0 this.known :+= qscript.dbSNP @@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outIntervals + ".target" } - case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - this.input_file :+= inBams + case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { + this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam this.known :+= qscript.dbSNP