From bd1cf4c7bcde4f15f632b360a33ab9148739090b Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Mon, 8 Aug 2011 15:09:12 -0400
Subject: [PATCH 1/7] Pacbio Pipeline

Added the base quality "filling" step to allow the pipeline to handle raw pacbio BAM files. This is the first step towards a generic pacbio data processing pipeline.
---
 .../qscripts/RecalibrateBaseQualities.scala   | 102 ++++++++++++++----
 1 file changed, 84 insertions(+), 18 deletions(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
index cbe53db8d..469325f6d 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
@@ -3,6 +3,8 @@ package org.broadinstitute.sting.queue.qscripts
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.util.QScriptUtils
+import net.sf.samtools.SAMFileHeader.SortOrder
+import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
 
 /**
  * Created by IntelliJ IDEA.
@@ -14,52 +16,116 @@ import org.broadinstitute.sting.queue.util.QScriptUtils
 
 class RecalibrateBaseQualities extends QScript {
 
-  @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
-  var GATKjar: File = _
-
-  @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
+  @Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true)
   var input: File = _
 
   @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
   var R: String = _
 
   @Input(doc="Reference fasta file", shortName="R", required=true)
-  var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
+  var reference: File = _
+
+  @Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
+  var dbSNP: File = _
+
+  @Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false)
+  var dbq: Int = -1
+
+  @Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false)
+  var threads: Int = -1
+
+  @Input(doc="Sample Name" , shortName = "sn", required=false)
+  var sample: String = ""
+
+  @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
+  var bwaPath: File = _
+
+
+
 
-  @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true)
-  var dbSNP: File = _     // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
 
   val queueLogDir: String = ".qlog/"
   var nContigs: Int = 0
+  var ADD_BASE_QUALITIES = false
 
   def script = {
 
-    val bamList = QScriptUtils.createListFromFile(input)
-    nContigs = QScriptUtils.getNumberOfContigs(bamList(0))
+    if (dbq >= 0)
+      ADD_BASE_QUALITIES = true
 
-    for (bam <- bamList) {
+    val fileList = QScriptUtils.createListFromFile(input)
+    nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))}
 
-      val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv")
-      val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv")
-      val recalBam: File   = swapExt(bam, ".bam", ".recal.bam")
+    for (file <- fileList) {
+      val qualBam: File = swapExt(file, ".bam", ".quals.bam")
+      val rgBam: File = if (ADD_BASE_QUALITIES)  {swapExt(file, ".bam", ".rg.bam")} else {file}
+      val recalFile1: File = swapExt(file, ".bam", ".recal1.csv")
+      val recalFile2: File = swapExt(file, ".bam", ".recal2.csv")
+      val recalBam: File   = swapExt(file, ".bam", ".recal.bam")
       val path1: String    = recalBam + ".before"
       val path2: String    = recalBam + ".after"
 
-      add(cov(bam, recalFile1),
-          recal(bam, recalFile1, recalBam),
+
+      if (ADD_BASE_QUALITIES) {
+        add(addQuals(file, qualBam, dbq),
+            addReadGroup(qualBam, rgBam, sample))
+      }
+
+      add(cov(rgBam, recalFile1),
+          recal(rgBam, recalFile1, recalBam),
           cov(recalBam, recalFile2),
           analyzeCovariates(recalFile1, path1),
           analyzeCovariates(recalFile2, path2))
     }
   }
 
-  trait CommandLineGATKArgs extends CommandLineGATK {
-    this.jarFile = GATKjar
-    this.reference_sequence = reference
+
+  // General arguments to non-GATK tools
+  trait ExternalCommonArgs extends CommandLineFunction {
     this.memoryLimit = 4
     this.isIntermediate = true
   }
 
+  trait CommandLineGATKArgs extends CommandLineGATK {
+    this.reference_sequence = reference
+  }
+
+
+  case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
+    def commandLine = bwaPath + " bwasw " + reference + " " + inFastq + " > " + outSam
+    this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
+    this.jobName = queueLogDir + outSam + ".bwa_sam_se"
+  }
+
+  case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
+    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
+    this.input = List(inSam)
+    this.output = outBam
+    this.sortOrder = SortOrder.coordinate
+    this.analysisName = queueLogDir + outBam + ".sortSam"
+    this.jobName = queueLogDir + outBam + ".sortSam"
+  }
+
+  case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
+    this.input_file :+= inBam
+    this.out = outBam
+    this.DBQ = qual
+  }
+
+  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups {
+    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
+    this.input = List(inBam)
+    this.output = outBam
+    this.RGID = "1"
+    this.RGCN = "BI"
+    this.RGPL = "PacBio_RS"
+    this.RGSM = sample
+    this.RGLB = "default_library"
+    this.RGPU = "default_pu"
+    this.analysisName = queueLogDir + outBam + ".rg"
+    this.jobName = queueLogDir + outBam + ".rg"
+  }
+
   case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
     this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
     this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")

From 22d25638234c77d33080517768a3066044143c81 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Mon, 8 Aug 2011 16:02:28 -0400
Subject: [PATCH 2/7] added BWA SW alignment

The pipeline now accepts fasta/fastq files and aligns them using BWA SW, adds default basequalities, creates read groups and performs BQSR.
---
 .../qscripts/RecalibrateBaseQualities.scala   | 76 ++++++++++++-------
 .../sting/queue/util/QScriptUtils.scala       | 10 +--
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
index 469325f6d..75e8c8325 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
@@ -5,6 +5,8 @@ import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.util.QScriptUtils
 import net.sf.samtools.SAMFileHeader.SortOrder
 import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
+import org.broadinstitute.sting.utils.exceptions.UserException
+import org.broadinstitute.sting.commandline.Hidden
 
 /**
  * Created by IntelliJ IDEA.
@@ -16,7 +18,7 @@ import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceRe
 
 class RecalibrateBaseQualities extends QScript {
 
-  @Input(doc="input FASTA file, BAM file - or list of FASTA/BAM files. ", shortName="i", required=true)
+  @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
   var input: File = _
 
   @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
@@ -28,51 +30,67 @@ class RecalibrateBaseQualities extends QScript {
   @Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
   var dbSNP: File = _
 
-  @Input(doc="Default base qualities. Overrides the file's original base qualities with given value. Must be used if the file does not have base qualities." , shortName = "dbq", required=false)
-  var dbq: Int = -1
+  @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
+  var threads: Int = 0
 
-  @Input(doc="Number of jobs to scatter/gather. Default is the number of contigs in the dataset" , shortName = "sg", required=false)
-  var threads: Int = -1
+  @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
+  var sample: String = "NA"
 
-  @Input(doc="Sample Name" , shortName = "sn", required=false)
-  var sample: String = ""
-
-  @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
+  @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
   var bwaPath: File = _
 
+  @Hidden
+  @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
+  var dbq: Int = 20
+
 
 
 
 
   val queueLogDir: String = ".qlog/"
-  var nContigs: Int = 0
-  var ADD_BASE_QUALITIES = false
 
   def script = {
 
-    if (dbq >= 0)
-      ADD_BASE_QUALITIES = true
+    val fileList: List[File] = QScriptUtils.createListFromFile(input)
 
-    val fileList = QScriptUtils.createListFromFile(input)
-    nContigs = if (threads >= 0) {threads} else {QScriptUtils.getNumberOfContigs(fileList(0))}
+    for (file: File <- fileList) {
 
-    for (file <- fileList) {
-      val qualBam: File = swapExt(file, ".bam", ".quals.bam")
-      val rgBam: File = if (ADD_BASE_QUALITIES)  {swapExt(file, ".bam", ".rg.bam")} else {file}
-      val recalFile1: File = swapExt(file, ".bam", ".recal1.csv")
-      val recalFile2: File = swapExt(file, ".bam", ".recal2.csv")
-      val recalBam: File   = swapExt(file, ".bam", ".recal.bam")
+      var USE_BWA: Boolean = false
+
+      println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
+
+      if (file.endsWith(".fasta") || file.endsWith(".fq")) {
+        if (bwaPath == null) {
+          throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
+        }
+        USE_BWA = true
+      }
+
+      // FASTA -> BAM steps
+      val alignedSam: File = file.getName + ".aligned.sam"
+      val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
+      val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
+      val rgBam: File = swapExt(file, ".bam", ".rg.bam")
+
+      val bamBase = if (USE_BWA) {rgBam} else {file}
+
+      // BAM Steps
+      val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
+      val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
+      val recalBam: File   = swapExt(bamBase, ".bam", ".recal.bam")
       val path1: String    = recalBam + ".before"
       val path2: String    = recalBam + ".after"
 
 
-      if (ADD_BASE_QUALITIES) {
-        add(addQuals(file, qualBam, dbq),
+      if (USE_BWA) {
+        add(align(file, alignedSam),
+            sortSam(alignedSam, sortedBam),
+            addQuals(sortedBam, qualBam, dbq),
             addReadGroup(qualBam, rgBam, sample))
       }
 
-      add(cov(rgBam, recalFile1),
-          recal(rgBam, recalFile1, recalBam),
+      add(cov(bamBase, recalFile1),
+          recal(bamBase, recalFile1, recalBam),
           cov(recalBam, recalFile2),
           analyzeCovariates(recalFile1, path1),
           analyzeCovariates(recalFile2, path2))
@@ -86,7 +104,7 @@ class RecalibrateBaseQualities extends QScript {
     this.isIntermediate = true
   }
 
-  trait CommandLineGATKArgs extends CommandLineGATK {
+  trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
     this.reference_sequence = reference
   }
 
@@ -112,7 +130,7 @@ class RecalibrateBaseQualities extends QScript {
     this.DBQ = qual
   }
 
-  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups {
+  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
     @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
     this.input = List(inBam)
     this.output = outBam
@@ -133,7 +151,7 @@ class RecalibrateBaseQualities extends QScript {
     this.recal_file = outRecalFile
     this.analysisName = queueLogDir + outRecalFile + ".covariates"
     this.jobName = queueLogDir + outRecalFile + ".covariates"
-    this.scatterCount = nContigs
+    this.scatterCount = threads
   }
 
   case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
@@ -143,7 +161,7 @@ class RecalibrateBaseQualities extends QScript {
     this.isIntermediate = false
     this.analysisName = queueLogDir + outBam + ".recalibration"
     this.jobName = queueLogDir + outBam + ".recalibration"
-    this.scatterCount = nContigs
+    this.scatterCount = threads
   }
 
   case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
index 99aaa9474..12bd880d8 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
@@ -22,15 +22,15 @@ object QScriptUtils {
    * to have empty lines and comment lines (lines starting with #).
    */
   def createListFromFile(in: File):List[File] = {
-    // If the file provided ends with .bam, it is not a bam list, we treat it as a single file.
+    // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file.
     // and return a list with only this file.
-    if (in.toString.endsWith(".bam"))
+    if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq"))
       return List(in)
 
     var list: List[File] = List()
-    for (bam <- fromFile(in).getLines)
-      if (!bam.startsWith("#") && !bam.isEmpty )
-        list :+= new File(bam.trim())
+    for (file <- fromFile(in).getLines)
+      if (!file.startsWith("#") && !file.isEmpty )
+        list :+= new File(file.trim())
     list.sortWith(_.compareTo(_) < 0)
   }
 

From 481630da00d2060290defb6e7d0ca6fb9a93cb40 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Tue, 9 Aug 2011 11:10:29 -0400
Subject: [PATCH 3/7] BWA parameters added

---
 .../sting/queue/qscripts/RecalibrateBaseQualities.scala         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
index 75e8c8325..9f3dd9a2c 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
@@ -110,7 +110,7 @@ class RecalibrateBaseQualities extends QScript {
 
 
   case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
-    def commandLine = bwaPath + " bwasw " + reference + " " + inFastq + " > " + outSam
+    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
     this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
     this.jobName = queueLogDir + outSam + ".bwa_sam_se"
   }

From 86afe878a7ccfff5878432e8795130ba40068f31 Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Tue, 9 Aug 2011 20:55:15 -0400
Subject: [PATCH 7/7] ReducedRead optimization: single pass likelihood
 calculation

-- Low level add() now takes a nObs argument and rather than += likelihood now does += nObs * likelihood
---
 .../DiploidSNPGenotypeLikelihoods.java        | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
index 2014801e4..5f6865d04 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
         if ( elt.isReducedRead() ) {
             // reduced read representation
             byte qual = elt.getReducedQual();
-            for ( int i = 0; i < elt.getReducedCount(); i++ ) {
-                add(obsBase, qual, (byte)0, (byte)0);
-            }
-            return elt.getQual();
+            add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
+            return elt.getReducedCount(); // we added nObs bases here
         } else {
             byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
-            return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
+            return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
         }
     }
 
@@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
      * @param qual1
      * @param obsBase2
      * @param qual2 can be 0, indicating no second base was observed for this fragment
+     * @param nObs The number of times this quad of values was seen.  Generally 1, but reduced reads
+     *  can have nObs > 1 for synthetic reads
      * @return
      */
-    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
+    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
         // TODO-- Right now we assume that there are at most 2 reads per fragment.  This assumption is fine
         // TODO--   given the current state of next-gen sequencing, but may need to be fixed in the future.
         // TODO--   However, when that happens, we'll need to be a lot smarter about the caching we do here.
@@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
 
         for ( DiploidGenotype g : DiploidGenotype.values() ) {
             double likelihood = likelihoods[g.ordinal()];
-            
-            //if ( VERBOSE ) {
-            //    System.out.printf("  L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
-            //            observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood);
-            //}
-
-            log10Likelihoods[g.ordinal()] += likelihood;
-            log10Posteriors[g.ordinal()] += likelihood;
+            log10Likelihoods[g.ordinal()] += likelihood * nObs;
+            log10Posteriors[g.ordinal()] += likelihood * nObs;
         }
 
         return 1;
     }
 
+    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
+        return add(obsBase1, qual1, obsBase2, qual2, 1);
+    }
+
     // -------------------------------------------------------------------------------------
     //
     // Dealing with the cache routines