From 22d25638234c77d33080517768a3066044143c81 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 8 Aug 2011 16:02:28 -0400 Subject: [PATCH] added BWA SW alignment The pipeline now accepts fasta/fastq files and aligns them using BWA SW, adds default basequalities, creates read groups and performs BQSR. --- .../qscripts/RecalibrateBaseQualities.scala | 76 ++++++++++++------- .../sting/queue/util/QScriptUtils.scala | 10 +-- 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 469325f6d..75e8c8325 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -5,6 +5,8 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.commandline.Hidden /** * Created by IntelliJ IDEA. @@ -16,7 +18,7 @@ import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceRe class RecalibrateBaseQualities extends QScript { - @Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true) + @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) var input: File = _ @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) @@ -28,51 +30,67 @@ class RecalibrateBaseQualities extends QScript { @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) var dbSNP: File = _ - @Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false) - var dbq: Int = -1 + @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + var threads: Int = 0 - @Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false) - var threads: Int = -1 + @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + var sample: String = "NA" - @Input(doc="Sample Name" , shortName = "sn", required=false) - var sample: String = "" - - @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) + @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ + @Hidden + @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + var dbq: Int = 20 + val queueLogDir: String = ".qlog/" - var nContigs: Int = 0 - var ADD_BASE_QUALITIES = false def script = { - if (dbq >= 0) - ADD_BASE_QUALITIES = true + val fileList: List[File] = QScriptUtils.createListFromFile(input) - val fileList = QScriptUtils.createListFromFile(input) - nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))} + for (file: File <- fileList) { - for (file <- fileList) { - val qualBam: File = swapExt(file, ".bam", ".quals.bam") - val rgBam: File = if (ADD_BASE_QUALITIES) {swapExt(file, ".bam", ".rg.bam")} else {file} - val recalFile1: File = swapExt(file, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(file, ".bam", ".recal2.csv") - val recalBam: File = swapExt(file, ".bam", ".recal.bam") + var USE_BWA: Boolean = false + + println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName) + + if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (bwaPath == null) { + throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); + } + USE_BWA = true + } + + // FASTA -> BAM steps + val alignedSam: File = file.getName + ".aligned.sam" + val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") + val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam") + val rgBam: File = swapExt(file, ".bam", ".rg.bam") + + val bamBase = if (USE_BWA) {rgBam} else {file} + + // BAM Steps + val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") val path1: String = recalBam + ".before" val path2: String = recalBam + ".after" - if (ADD_BASE_QUALITIES) { - add(addQuals(file, qualBam, dbq), + if (USE_BWA) { + add(align(file, alignedSam), + sortSam(alignedSam, sortedBam), + addQuals(sortedBam, qualBam, dbq), addReadGroup(qualBam, rgBam, sample)) } - add(cov(rgBam, recalFile1), - recal(rgBam, recalFile1, recalBam), + add(cov(bamBase, recalFile1), + recal(bamBase, recalFile1, recalBam), cov(recalBam, recalFile2), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) @@ -86,7 +104,7 @@ class RecalibrateBaseQualities extends QScript { this.isIntermediate = true } - trait CommandLineGATKArgs extends CommandLineGATK { + trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { this.reference_sequence = reference } @@ -112,7 +130,7 @@ class RecalibrateBaseQualities extends QScript { this.DBQ = qual } - case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups { + case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = List(inBam) this.output = outBam @@ -133,7 +151,7 @@ class RecalibrateBaseQualities extends QScript { this.recal_file = outRecalFile this.analysisName = queueLogDir + outRecalFile + ".covariates" this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = nContigs + this.scatterCount = threads } case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { @@ -143,7 +161,7 @@ class RecalibrateBaseQualities extends QScript { this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" - this.scatterCount = nContigs + this.scatterCount = threads } case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 99aaa9474..12bd880d8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -22,15 +22,15 @@ object QScriptUtils { * to have empty lines and comment lines (lines starting with #). */ def createListFromFile(in: File):List[File] = { - // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file. // and return a list with only this file. - if (in.toString.endsWith(".bam")) + if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) return List(in) var list: List[File] = List() - for (bam <- fromFile(in).getLines) - if (!bam.startsWith("#") && !bam.isEmpty ) - list :+= new File(bam.trim()) + for (file <- fromFile(in).getLines) + if (!file.startsWith("#") && !file.isEmpty ) + list :+= new File(file.trim()) list.sortWith(_.compareTo(_) < 0) }