Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-08-09 23:19:51 -04:00 · 2011-08-09 23:19:51 -04:00 · b8f572b571
parent 08631546c8 86afe878a7
commit b8f572b571
3 changed files with 122 additions and 40 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        if ( elt.isReducedRead() ) {
            // reduced read representation
            byte qual = elt.getReducedQual();
-            for ( int i = 0; i < elt.getReducedCount(); i++ ) {
-                add(obsBase, qual, (byte)0, (byte)0);
-            }
-            return elt.getQual();
+            add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
+            return elt.getReducedCount(); // we added nObs bases here
        } else {
            byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
-            return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
+            return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
        }
    }

@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
     * @param qual1
     * @param obsBase2
     * @param qual2 can be 0, indicating no second base was observed for this fragment
+     * @param nObs The number of times this quad of values was seen.  Generally 1, but reduced reads
+     *  can have nObs > 1 for synthetic reads
     * @return
     */
-    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
+    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
        // TODO-- Right now we assume that there are at most 2 reads per fragment.  This assumption is fine
        // TODO--   given the current state of next-gen sequencing, but may need to be fixed in the future.
        // TODO--   However, when that happens, we'll need to be a lot smarter about the caching we do here.
@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {

        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            double likelihood = likelihoods[g.ordinal()];
-            
-            //if ( VERBOSE ) {
-            //    System.out.printf("  L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
-            //            observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood);
-            //}
-
-            log10Likelihoods[g.ordinal()] += likelihood;
-            log10Posteriors[g.ordinal()] += likelihood;
+            log10Likelihoods[g.ordinal()] += likelihood * nObs;
+            log10Posteriors[g.ordinal()] += likelihood * nObs;
        }

        return 1;
    }

+    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
+        return add(obsBase1, qual1, obsBase2, qual2, 1);
+    }
+
    // -------------------------------------------------------------------------------------
    //
    // Dealing with the cache routines
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala
@ -3,6 +3,10 @@ package org.broadinstitute.sting.queue.qscripts
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.util.QScriptUtils
+import net.sf.samtools.SAMFileHeader.SortOrder
+import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
+import org.broadinstitute.sting.utils.exceptions.UserException
+import org.broadinstitute.sting.commandline.Hidden

 /**
 * Created by IntelliJ IDEA.
@ -14,52 +18,132 @@ import org.broadinstitute.sting.queue.util.QScriptUtils

 class RecalibrateBaseQualities extends QScript {

-  @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
-  var GATKjar: File = _
-
-  @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
+  @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
  var input: File = _

  @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
  var R: String = _

  @Input(doc="Reference fasta file", shortName="R", required=true)
-  var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
+  var reference: File = _
+
+  @Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
+  var dbSNP: File = _
+
+  @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
+  var threads: Int = 0
+
+  @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
+  var sample: String = "NA"
+
+  @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
+  var bwaPath: File = _
+
+  @Hidden
+  @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
+  var dbq: Int = 20
+
+
+

-  @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true)
-  var dbSNP: File = _     // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")

  val queueLogDir: String = ".qlog/"
-  var nContigs: Int = 0

  def script = {

-    val bamList = QScriptUtils.createListFromFile(input)
-    nContigs = QScriptUtils.getNumberOfContigs(bamList(0))
+    val fileList: List[File] = QScriptUtils.createListFromFile(input)

-    for (bam <- bamList) {
+    for (file: File <- fileList) {

-      val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv")
-      val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv")
-      val recalBam: File   = swapExt(bam, ".bam", ".recal.bam")
+      var USE_BWA: Boolean = false
+
+      println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
+
+      if (file.endsWith(".fasta") || file.endsWith(".fq")) {
+        if (bwaPath == null) {
+          throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
+        }
+        USE_BWA = true
+      }
+
+      // FASTA -> BAM steps
+      val alignedSam: File = file.getName + ".aligned.sam"
+      val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
+      val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
+      val rgBam: File = swapExt(file, ".bam", ".rg.bam")
+
+      val bamBase = if (USE_BWA) {rgBam} else {file}
+
+      // BAM Steps
+      val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
+      val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
+      val recalBam: File   = swapExt(bamBase, ".bam", ".recal.bam")
      val path1: String    = recalBam + ".before"
      val path2: String    = recalBam + ".after"

-      add(cov(bam, recalFile1),
-          recal(bam, recalFile1, recalBam),
+
+      if (USE_BWA) {
+        add(align(file, alignedSam),
+            sortSam(alignedSam, sortedBam),
+            addQuals(sortedBam, qualBam, dbq),
+            addReadGroup(qualBam, rgBam, sample))
+      }
+
+      add(cov(bamBase, recalFile1),
+          recal(bamBase, recalFile1, recalBam),
          cov(recalBam, recalFile2),
          analyzeCovariates(recalFile1, path1),
          analyzeCovariates(recalFile2, path2))
    }
  }

-  trait CommandLineGATKArgs extends CommandLineGATK {
-    this.jarFile = GATKjar
-    this.reference_sequence = reference
+
+  // General arguments to non-GATK tools
+  trait ExternalCommonArgs extends CommandLineFunction {
    this.memoryLimit = 4
    this.isIntermediate = true
  }

+  trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
+    this.reference_sequence = reference
+  }
+
+
+  case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
+    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
+    this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
+    this.jobName = queueLogDir + outSam + ".bwa_sam_se"
+  }
+
+  case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
+    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
+    this.input = List(inSam)
+    this.output = outBam
+    this.sortOrder = SortOrder.coordinate
+    this.analysisName = queueLogDir + outBam + ".sortSam"
+    this.jobName = queueLogDir + outBam + ".sortSam"
+  }
+
+  case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
+    this.input_file :+= inBam
+    this.out = outBam
+    this.DBQ = qual
+  }
+
+  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
+    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
+    this.input = List(inBam)
+    this.output = outBam
+    this.RGID = "1"
+    this.RGCN = "BI"
+    this.RGPL = "PacBio_RS"
+    this.RGSM = sample
+    this.RGLB = "default_library"
+    this.RGPU = "default_pu"
+    this.analysisName = queueLogDir + outBam + ".rg"
+    this.jobName = queueLogDir + outBam + ".rg"
+  }
+
  case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
    this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
@ -67,7 +151,7 @@ class RecalibrateBaseQualities extends QScript {
    this.recal_file = outRecalFile
    this.analysisName = queueLogDir + outRecalFile + ".covariates"
    this.jobName = queueLogDir + outRecalFile + ".covariates"
-    this.scatterCount = nContigs
+    this.scatterCount = threads
  }

  case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
@ -77,7 +161,7 @@ class RecalibrateBaseQualities extends QScript {
    this.isIntermediate = false
    this.analysisName = queueLogDir + outBam + ".recalibration"
    this.jobName = queueLogDir + outBam + ".recalibration"
-    this.scatterCount = nContigs
+    this.scatterCount = threads
  }

  case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
--- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
@ -22,15 +22,15 @@ object QScriptUtils {
   * to have empty lines and comment lines (lines starting with #).
   */
  def createListFromFile(in: File):List[File] = {
-    // If the file provided ends with .bam, it is not a bam list, we treat it as a single file.
+    // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file.
    // and return a list with only this file.
-    if (in.toString.endsWith(".bam"))
+    if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq"))
      return List(in)

    var list: List[File] = List()
-    for (bam <- fromFile(in).getLines)
-      if (!bam.startsWith("#") && !bam.isEmpty )
-        list :+= new File(bam.trim())
+    for (file <- fromFile(in).getLines)
+      if (!file.startsWith("#") && !file.isEmpty )
+        list :+= new File(file.trim())
    list.sortWith(_.compareTo(_) < 0)
  }