From 1085df8b7b3f23e853444074bf9d202c9a54be1a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 29 Jun 2011 16:05:32 -0400 Subject: [PATCH] Making the BQSR pipeline publicly available and supported. this is for all the Pacbio validation that is going on right no in the cancer group. They are all using this script, and I'm happy to support it. --- .../core/RecalibrateBaseQualities.scala | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100755 public/scala/qscript/core/RecalibrateBaseQualities.scala diff --git a/public/scala/qscript/core/RecalibrateBaseQualities.scala b/public/scala/qscript/core/RecalibrateBaseQualities.scala new file mode 100755 index 000000000..ba13d267a --- /dev/null +++ b/public/scala/qscript/core/RecalibrateBaseQualities.scala @@ -0,0 +1,91 @@ +package core + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import net.sf.samtools.SAMFileReader + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 4/20/11 + * Time: 16:29 PM + */ + + +class RecalibrateBaseQualities extends QScript { + + @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) + var GATKjar: File = _ + + @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + var input: File = _ + + @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) + var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") + + @Input(doc="Reference fasta file", shortName="R", required=false) + var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + + @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) + var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + + val queueLogDir: String = ".qlog/" + var nContigs: Int = 0 + + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + return samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + def script = { + + nContigs = getNumberOfContigs(input) + + val recalFile1: File = new File("recal1.csv") + val recalFile2: File = new File("recal2.csv") + val recalBam: File = swapExt(input, ".bam", "recal.bam") + val path1: String = "before" + val path2: String = "after" + + add(cov(input, recalFile1), + recal(input, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } + + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = GATKjar + this.reference_sequence = reference + this.memoryLimit = 4 + this.isIntermediate = true + } + + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + this.input_file :+= inBam + this.recal_file = outRecalFile + this.analysisName = queueLogDir + outRecalFile + ".covariates" + this.jobName = queueLogDir + outRecalFile + ".covariates" + this.scatterCount = nContigs + } + + case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { + this.input_file :+= inBam + this.recal_file = inRecalFile + this.out = outBam + this.isIntermediate = false + this.analysisName = queueLogDir + outBam + ".recalibration" + this.jobName = queueLogDir + outBam + ".recalibration" + this.scatterCount = nContigs + } + + case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { + this.resources = R + this.recal_file = inRecalFile + this.output_dir = outPath.toString + this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" + this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" + } +} \ No newline at end of file