From bd1cf4c7bcde4f15f632b360a33ab9148739090b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 8 Aug 2011 15:09:12 -0400 Subject: [PATCH 1/7] Pacbio Pipeline Added the base quality "filling" step to allow the pipeline to handle raw pacbio BAM files. This is the first step towards a generic pacbio data processing pipeline. --- .../qscripts/RecalibrateBaseQualities.scala | 102 ++++++++++++++---- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index cbe53db8d..469325f6d 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -3,6 +3,8 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils +import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups} /** * Created by IntelliJ IDEA. @@ -14,52 +16,116 @@ import org.broadinstitute.sting.queue.util.QScriptUtils class RecalibrateBaseQualities extends QScript { - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + @Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true) var input: File = _ @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) var R: String = _ @Input(doc="Reference fasta file", shortName="R", required=true) - var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + var reference: File = _ + + @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) + var dbSNP: File = _ + + @Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false) + var dbq: Int = -1 + + @Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false) + var threads: Int = -1 + + @Input(doc="Sample Name" , shortName = "sn", required=false) + var sample: String = "" + + @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) + var bwaPath: File = _ + + + - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true) - var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") val queueLogDir: String = ".qlog/" var nContigs: Int = 0 + var ADD_BASE_QUALITIES = false def script = { - val bamList = QScriptUtils.createListFromFile(input) - nContigs = QScriptUtils.getNumberOfContigs(bamList(0)) + if (dbq >= 0) + ADD_BASE_QUALITIES = true - for (bam <- bamList) { + val fileList = QScriptUtils.createListFromFile(input) + nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))} - val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") - val recalBam: File = swapExt(bam, ".bam", ".recal.bam") + for (file <- fileList) { + val qualBam: File = swapExt(file, ".bam", ".quals.bam") + val rgBam: File = if (ADD_BASE_QUALITIES) {swapExt(file, ".bam", ".rg.bam")} else {file} + val recalFile1: File = swapExt(file, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(file, ".bam", ".recal2.csv") + val recalBam: File = swapExt(file, ".bam", ".recal.bam") val path1: String = recalBam + ".before" val path2: String = recalBam + ".after" - add(cov(bam, recalFile1), - recal(bam, recalFile1, recalBam), + + if (ADD_BASE_QUALITIES) { + add(addQuals(file, qualBam, dbq), + addReadGroup(qualBam, rgBam, sample)) + } + + add(cov(rgBam, recalFile1), + recal(rgBam, recalFile1, recalBam), cov(recalBam, recalFile2), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) } } - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = GATKjar - this.reference_sequence = reference + + // General arguments to non-GATK tools + trait ExternalCommonArgs extends CommandLineFunction { this.memoryLimit = 4 this.isIntermediate = true } + trait CommandLineGATKArgs extends CommandLineGATK { + this.reference_sequence = reference + } + + + case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs { + def commandLine = bwaPath + " bwasw " + reference + " " + inFastq + " > " + outSam + this.analysisName = queueLogDir + outSam + ".bwa_sam_se" + this.jobName = queueLogDir + outSam + ".bwa_sam_se" + } + + case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + this.input = List(inSam) + this.output = outBam + this.sortOrder = SortOrder.coordinate + this.analysisName = queueLogDir + outBam + ".sortSam" + this.jobName = queueLogDir + outBam + ".sortSam" + } + + case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs { + this.input_file :+= inBam + this.out = outBam + this.DBQ = qual + } + + case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + this.input = List(inBam) + this.output = outBam + this.RGID = "1" + this.RGCN = "BI" + this.RGPL = "PacBio_RS" + this.RGSM = sample + this.RGLB = "default_library" + this.RGPU = "default_pu" + this.analysisName = queueLogDir + outBam + ".rg" + this.jobName = queueLogDir + outBam + ".rg" + } + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") From 22d25638234c77d33080517768a3066044143c81 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 8 Aug 2011 16:02:28 -0400 Subject: [PATCH 2/7] added BWA SW alignment The pipeline now accepts fasta/fastq files and aligns them using BWA SW, adds default basequalities, creates read groups and performs BQSR. --- .../qscripts/RecalibrateBaseQualities.scala | 76 ++++++++++++------- .../sting/queue/util/QScriptUtils.scala | 10 +-- 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 469325f6d..75e8c8325 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -5,6 +5,8 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.commandline.Hidden /** * Created by IntelliJ IDEA. @@ -16,7 +18,7 @@ import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceRe class RecalibrateBaseQualities extends QScript { - @Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true) + @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) var input: File = _ @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) @@ -28,51 +30,67 @@ class RecalibrateBaseQualities extends QScript { @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) var dbSNP: File = _ - @Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false) - var dbq: Int = -1 + @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + var threads: Int = 0 - @Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false) - var threads: Int = -1 + @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + var sample: String = "NA" - @Input(doc="Sample Name" , shortName = "sn", required=false) - var sample: String = "" - - @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) + @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ + @Hidden + @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + var dbq: Int = 20 + val queueLogDir: String = ".qlog/" - var nContigs: Int = 0 - var ADD_BASE_QUALITIES = false def script = { - if (dbq >= 0) - ADD_BASE_QUALITIES = true + val fileList: List[File] = QScriptUtils.createListFromFile(input) - val fileList = QScriptUtils.createListFromFile(input) - nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))} + for (file: File <- fileList) { - for (file <- fileList) { - val qualBam: File = swapExt(file, ".bam", ".quals.bam") - val rgBam: File = if (ADD_BASE_QUALITIES) {swapExt(file, ".bam", ".rg.bam")} else {file} - val recalFile1: File = swapExt(file, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(file, ".bam", ".recal2.csv") - val recalBam: File = swapExt(file, ".bam", ".recal.bam") + var USE_BWA: Boolean = false + + println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName) + + if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (bwaPath == null) { + throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); + } + USE_BWA = true + } + + // FASTA -> BAM steps + val alignedSam: File = file.getName + ".aligned.sam" + val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") + val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam") + val rgBam: File = swapExt(file, ".bam", ".rg.bam") + + val bamBase = if (USE_BWA) {rgBam} else {file} + + // BAM Steps + val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") val path1: String = recalBam + ".before" val path2: String = recalBam + ".after" - if (ADD_BASE_QUALITIES) { - add(addQuals(file, qualBam, dbq), + if (USE_BWA) { + add(align(file, alignedSam), + sortSam(alignedSam, sortedBam), + addQuals(sortedBam, qualBam, dbq), addReadGroup(qualBam, rgBam, sample)) } - add(cov(rgBam, recalFile1), - recal(rgBam, recalFile1, recalBam), + add(cov(bamBase, recalFile1), + recal(bamBase, recalFile1, recalBam), cov(recalBam, recalFile2), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) @@ -86,7 +104,7 @@ class RecalibrateBaseQualities extends QScript { this.isIntermediate = true } - trait CommandLineGATKArgs extends CommandLineGATK { + trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { this.reference_sequence = reference } @@ -112,7 +130,7 @@ class RecalibrateBaseQualities extends QScript { this.DBQ = qual } - case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups { + case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = List(inBam) this.output = outBam @@ -133,7 +151,7 @@ class RecalibrateBaseQualities extends QScript { this.recal_file = outRecalFile this.analysisName = queueLogDir + outRecalFile + ".covariates" this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = nContigs + this.scatterCount = threads } case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { @@ -143,7 +161,7 @@ class RecalibrateBaseQualities extends QScript { this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" - this.scatterCount = nContigs + this.scatterCount = threads } case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 99aaa9474..12bd880d8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -22,15 +22,15 @@ object QScriptUtils { * to have empty lines and comment lines (lines starting with #). */ def createListFromFile(in: File):List[File] = { - // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file. // and return a list with only this file. - if (in.toString.endsWith(".bam")) + if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) return List(in) var list: List[File] = List() - for (bam <- fromFile(in).getLines) - if (!bam.startsWith("#") && !bam.isEmpty ) - list :+= new File(bam.trim()) + for (file <- fromFile(in).getLines) + if (!file.startsWith("#") && !file.isEmpty ) + list :+= new File(file.trim()) list.sortWith(_.compareTo(_) < 0) } From 481630da00d2060290defb6e7d0ca6fb9a93cb40 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 9 Aug 2011 11:10:29 -0400 Subject: [PATCH 3/7] BWA parameters added --- .../sting/queue/qscripts/RecalibrateBaseQualities.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 75e8c8325..9f3dd9a2c 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -110,7 +110,7 @@ class RecalibrateBaseQualities extends QScript { case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs { - def commandLine = bwaPath + " bwasw " + reference + " " + inFastq + " > " + outSam + def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam this.analysisName = queueLogDir + outSam + ".bwa_sam_se" this.jobName = queueLogDir + outSam + ".bwa_sam_se" } From 86afe878a7ccfff5878432e8795130ba40068f31 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 9 Aug 2011 20:55:15 -0400 Subject: [PATCH 7/7] ReducedRead optimization: single pass likelihood calculation -- Low level add() now takes a nObs argument and rather than += likelihood now does += nObs * likelihood --- .../DiploidSNPGenotypeLikelihoods.java | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 2014801e4..5f6865d04 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation byte qual = elt.getReducedQual(); - for ( int i = 0; i < elt.getReducedCount(); i++ ) { - add(obsBase, qual, (byte)0, (byte)0); - } - return elt.getQual(); + add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods + return elt.getReducedCount(); // we added nObs bases here } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0; + return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } } @@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @param qual1 * @param obsBase2 * @param qual2 can be 0, indicating no second base was observed for this fragment + * @param nObs The number of times this quad of values was seen. Generally 1, but reduced reads + * can have nObs > 1 for synthetic reads * @return */ - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. @@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { for ( DiploidGenotype g : DiploidGenotype.values() ) { double likelihood = likelihoods[g.ordinal()]; - - //if ( VERBOSE ) { - // System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", - // observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood); - //} - - log10Likelihoods[g.ordinal()] += likelihood; - log10Posteriors[g.ordinal()] += likelihood; + log10Likelihoods[g.ordinal()] += likelihood * nObs; + log10Posteriors[g.ordinal()] += likelihood * nObs; } return 1; } + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + return add(obsBase1, qual1, obsBase2, qual2, 1); + } + // ------------------------------------------------------------------------------------- // // Dealing with the cache routines