Making the BQSR pipeline publicly available and supported.

this is for all the Pacbio validation that is going on right no in the cancer group. They are all using this script, and I'm happy to support it.
This commit is contained in:
Mauricio Carneiro 2011-06-29 16:05:32 -04:00
parent ec9d8313ee
commit 1085df8b7b
1 changed files with 91 additions and 0 deletions

View File

@ -0,0 +1,91 @@
package core
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 4/20/11
* Time: 16:29 PM
*/
class RecalibrateBaseQualities extends QScript {
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false)
var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R")
@Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
def script = {
nContigs = getNumberOfContigs(input)
val recalFile1: File = new File("recal1.csv")
val recalFile2: File = new File("recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
add(cov(input, recalFile1),
recal(input, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = GATKjar
this.reference_sequence = reference
this.memoryLimit = 4
this.isIntermediate = true
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= inBam
this.recal_file = outRecalFile
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
this.scatterCount = nContigs
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
this.scatterCount = nContigs
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
this.resources = R
this.recal_file = inRecalFile
this.output_dir = outPath.toString
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
}