added BWA SW alignment

The pipeline now accepts fasta/fastq files and aligns them using BWA SW, adds default basequalities, creates read groups and performs BQSR.
This commit is contained in:
Mauricio Carneiro 2011-08-08 16:02:28 -04:00
parent bd1cf4c7bc
commit 22d2563823
2 changed files with 52 additions and 34 deletions

View File

@ -5,6 +5,8 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.util.QScriptUtils
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.commandline.Hidden
/**
* Created by IntelliJ IDEA.
@ -16,7 +18,7 @@ import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceRe
class RecalibrateBaseQualities extends QScript {
@Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true)
@Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
@ -28,51 +30,67 @@ class RecalibrateBaseQualities extends QScript {
@Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
var dbSNP: File = _
@Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false)
var dbq: Int = -1
@Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
var threads: Int = 0
@Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false)
var threads: Int = -1
@Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
var sample: String = "NA"
@Input(doc="Sample Name" , shortName = "sn", required=false)
var sample: String = ""
@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Hidden
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
var dbq: Int = 20
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
var ADD_BASE_QUALITIES = false
def script = {
if (dbq >= 0)
ADD_BASE_QUALITIES = true
val fileList: List[File] = QScriptUtils.createListFromFile(input)
val fileList = QScriptUtils.createListFromFile(input)
nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))}
for (file: File <- fileList) {
for (file <- fileList) {
val qualBam: File = swapExt(file, ".bam", ".quals.bam")
val rgBam: File = if (ADD_BASE_QUALITIES) {swapExt(file, ".bam", ".rg.bam")} else {file}
val recalFile1: File = swapExt(file, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(file, ".bam", ".recal2.csv")
val recalBam: File = swapExt(file, ".bam", ".recal.bam")
var USE_BWA: Boolean = false
println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
if (file.endsWith(".fasta") || file.endsWith(".fq")) {
if (bwaPath == null) {
throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
}
USE_BWA = true
}
// FASTA -> BAM steps
val alignedSam: File = file.getName + ".aligned.sam"
val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
val rgBam: File = swapExt(file, ".bam", ".rg.bam")
val bamBase = if (USE_BWA) {rgBam} else {file}
// BAM Steps
val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")
val path1: String = recalBam + ".before"
val path2: String = recalBam + ".after"
if (ADD_BASE_QUALITIES) {
add(addQuals(file, qualBam, dbq),
if (USE_BWA) {
add(align(file, alignedSam),
sortSam(alignedSam, sortedBam),
addQuals(sortedBam, qualBam, dbq),
addReadGroup(qualBam, rgBam, sample))
}
add(cov(rgBam, recalFile1),
recal(rgBam, recalFile1, recalBam),
add(cov(bamBase, recalFile1),
recal(bamBase, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
@ -86,7 +104,7 @@ class RecalibrateBaseQualities extends QScript {
this.isIntermediate = true
}
trait CommandLineGATKArgs extends CommandLineGATK {
trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
this.reference_sequence = reference
}
@ -112,7 +130,7 @@ class RecalibrateBaseQualities extends QScript {
this.DBQ = qual
}
case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups {
case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
@Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
this.input = List(inBam)
this.output = outBam
@ -133,7 +151,7 @@ class RecalibrateBaseQualities extends QScript {
this.recal_file = outRecalFile
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
this.scatterCount = nContigs
this.scatterCount = threads
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
@ -143,7 +161,7 @@ class RecalibrateBaseQualities extends QScript {
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
this.scatterCount = nContigs
this.scatterCount = threads
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {

View File

@ -22,15 +22,15 @@ object QScriptUtils {
* to have empty lines and comment lines (lines starting with #).
*/
def createListFromFile(in: File):List[File] = {
// If the file provided ends with .bam, it is not a bam list, we treat it as a single file.
// If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file.
// and return a list with only this file.
if (in.toString.endsWith(".bam"))
if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq"))
return List(in)
var list: List[File] = List()
for (bam <- fromFile(in).getLines)
if (!bam.startsWith("#") && !bam.isEmpty )
list :+= new File(bam.trim())
for (file <- fromFile(in).getLines)
if (!file.startsWith("#") && !file.isEmpty )
list :+= new File(file.trim())
list.sortWith(_.compareTo(_) < 0)
}