From b4da843c49e4dfbfbabbf59257c3d0a968307eb6 Mon Sep 17 00:00:00 2001 From: carneiro Date: Thu, 17 Feb 2011 02:07:22 +0000 Subject: [PATCH] now processes either a single bam file or a list of bam files in parallel. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5256 348d0f76-0448-11de-a6fe-93d51630548a --- .../oneoffs/carneiro/dataProcessing.scala | 176 ++++++++++-------- 1 file changed, 96 insertions(+), 80 deletions(-) diff --git a/scala/qscript/oneoffs/carneiro/dataProcessing.scala b/scala/qscript/oneoffs/carneiro/dataProcessing.scala index 69ae3887c..b0c38f396 100755 --- a/scala/qscript/oneoffs/carneiro/dataProcessing.scala +++ b/scala/qscript/oneoffs/carneiro/dataProcessing.scala @@ -2,6 +2,7 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import scala.io.Source class dataProcessing extends QScript { @@ -14,7 +15,7 @@ class dataProcessing extends QScript { var dedupJar: File = _ @Input(doc="input BAM file", shortName="input", required=true) - var inputBam: File = _ + var inputBam: String = _ @Input(doc="output path", shortName="outputDir", required=false) var outputDir: String = "" @@ -31,91 +32,106 @@ class dataProcessing extends QScript { def script = { - // Files generated by the pipeline - val baseName: String = swapExt(qscript.inputBam, ".bam", "").toString() - val cleanedBam: String = baseName + ".cleaned.bam" - val dedupedBam: String = baseName + ".cleaned.dedup.bam" - val metricsFile: String = swapExt(qscript.inputBam, "bam", "metrics").toString() - val targetIntervals: String = baseName + ".indel.intervals" - val recalFile: String = baseName + ".recal.csv" - val recalBam: String = baseName + ".cleaned.dedup.recal.bam" - - // Reference sequence, dbsnps and RODs used by the pipeline - val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" - val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" - val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" - val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" - - // General arguments to all programs - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = qscript.gatkJar - this.reference_sequence = reference - this.memoryLimit = Some(4) - this.jobTempDir = qscript.outputTmpDir + // Populates the list of files to process either from a single bam or from a list of bams. + var bamList: List[String] = Nil + if (inputBam.endsWith("bam")) { + bamList :+= inputBam + } + else { // todo: maybe check if the filename ends with .list? + for (bam <- Source.fromFile(inputBam).getLines()) + bamList :+= bam } + bamList.foreach { bam => - val target = new RealignerTargetCreator with CommandLineGATKArgs - target.input_file :+= qscript.inputBam - target.out = new File(targetIntervals) - target.mismatchFraction = Some(0.0) - target.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - target.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls) - target.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) - target.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) - target.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) - target.jobName = baseName + ".target" - if (!qscript.intervalString.isEmpty()) target.intervalsString ++= List(qscript.intervalString) - if (qscript.intervals != Nil) target.intervals ++= List(qscript.intervals) + println("DEBUG: bam = " + new File(bam)) - // 2.) Clean without SW + // Files generated by the pipeline + val baseName: String = swapExt(new File(bam.substring(bam.lastIndexOf("/")+1)), ".bam", "").toString() + val cleanedBam: String = baseName + ".cleaned.bam" + val dedupedBam: String = baseName + ".cleaned.dedup.bam" + val metricsFile: String = baseName + ".metrics" + val targetIntervals: String = baseName + ".indel.intervals" + val recalFile: String = baseName + ".recal.csv" + val recalBam: String = baseName + ".cleaned.dedup.recal.bam" - val clean = new IndelRealigner with CommandLineGATKArgs - clean.input_file :+= qscript.inputBam - clean.targetIntervals = new File(targetIntervals) - clean.out = new File(cleanedBam) - clean.doNotUseSW = true - clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) - clean.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - clean.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls) - clean.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) - clean.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) - clean.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) - clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true - clean.jobName = baseName + ".clean" - if (!qscript.intervalString.isEmpty()) clean.intervalsString ++= List(qscript.intervalString) - if (qscript.intervals != Nil) clean.intervals ++= List(qscript.intervals) + // Reference sequence, dbsnps and RODs used by the pipeline + val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" + val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" + val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" + val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" - // 3.) Mark Duplicates - val dedup = new PicardBamJarFunction{ - @Input(doc="cleaned bam") var clean: File = new File(cleanedBam) - @Output(doc="deduped bam") var deduped: File = new File(dedupedBam) - override def inputBams = List(clean) - override def outputBam = deduped - override def commandLine = super.commandLine + " M=" + metricsFile - sortOrder = null + // General arguments to all programs + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = qscript.gatkJar + this.reference_sequence = reference + this.memoryLimit = Some(4) + this.jobTempDir = qscript.outputTmpDir + } + + + val target = new RealignerTargetCreator with CommandLineGATKArgs + target.input_file :+= new File(bam) + target.out = new File(targetIntervals) + target.mismatchFraction = Some(0.0) + target.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + target.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls) + target.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) + target.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) + target.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) + target.jobName = baseName + ".target" + if (!qscript.intervalString.isEmpty()) target.intervalsString ++= List(qscript.intervalString) + if (qscript.intervals != Nil) target.intervals ++= List(qscript.intervals) + + // 2.) Clean without SW + + val clean = new IndelRealigner with CommandLineGATKArgs + clean.input_file :+= new File(bam) + clean.targetIntervals = new File(targetIntervals) + clean.out = new File(cleanedBam) + clean.doNotUseSW = true + clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) + clean.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + clean.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls) + clean.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) + clean.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) + clean.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) + clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true + clean.jobName = baseName + ".clean" + if (!qscript.intervalString.isEmpty()) clean.intervalsString ++= List(qscript.intervalString) + if (qscript.intervals != Nil) clean.intervals ++= List(qscript.intervals) + + // 3.) Mark Duplicates + val dedup = new PicardBamJarFunction{ + @Input(doc="cleaned bam") var clean: File = new File(cleanedBam) + @Output(doc="deduped bam") var deduped: File = new File(dedupedBam) + override def inputBams = List(clean) + override def outputBam = deduped + override def commandLine = super.commandLine + " M=" + metricsFile + sortOrder = null + } + dedup.memoryLimit = Some(8) + dedup.jarFile = qscript.dedupJar + dedup.jobName = baseName + ".dedup" + + val index = new SamtoolsIndexFunction + index.bamFile = new File(dedupedBam) + index.analysisName = baseName + ".index" + + val cov = new CountCovariates with CommandLineGATKArgs + cov.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + cov.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + cov.input_file :+= new File(dedupedBam) + cov.recal_file = new File(recalFile) + + val recal = new TableRecalibration with CommandLineGATKArgs + recal.input_file :+= new File (dedupedBam) + recal.recal_file = new File(recalFile) + recal.out = new File(recalBam) + + add(target, clean, dedup, index, cov, recal) } - dedup.memoryLimit = Some(8) - dedup.jarFile = qscript.dedupJar - dedup.jobName = baseName + ".dedup" - - val index = new SamtoolsIndexFunction - index.bamFile = new File(dedupedBam) - index.analysisName = baseName + ".index" - - val cov = new CountCovariates with CommandLineGATKArgs - cov.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - cov.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") - cov.input_file :+= new File(dedupedBam) - cov.recal_file = new File(recalFile) - - val recal = new TableRecalibration with CommandLineGATKArgs - recal.input_file :+= new File (dedupedBam) - recal.recal_file = new File(recalFile) - recal.out = new File(recalBam) - - add(target, clean, dedup, index, cov, recal) } } \ No newline at end of file