Begin moving XHMM scripts to public
This commit is contained in:
parent
1ec137a40c
commit
cde4f037d3
|
|
@ -0,0 +1,499 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.CNV
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.util.VCF_BAM_utilities
|
||||
import org.broadinstitute.sting.queue.util.DoC._
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import java.io.{PrintStream, PrintWriter}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions._
|
||||
|
||||
/**
 * Queue pipeline that runs XHMM (exome hidden Markov model) CNV calling from a set of BAMs:
 * depth-of-coverage -> target/sample filtering and mean-centering -> PCA normalization ->
 * z-scoring -> CNV discovery -> CNV genotyping (optionally including sub-segments).
 */
class xhmmCNVpipeline extends QScript {
  qscript =>

  // ----- Required inputs and executables -----

  @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true)
  var bams: File = _

  @Argument(doc = "gatk jar file", shortName = "J", required = true)
  var gatkJarFile: File = _

  @Argument(doc = "xhmm executable file", shortName = "xhmmExec", required = true)
  var xhmmExec: File = _

  @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true)
  var pseqExec: File = _

  @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true)
  var pseqSeqDB: String = _

  @Argument(shortName = "R", doc = "ref", required = true)
  var referenceFile: File = _

  @Argument(shortName = "L", doc = "Intervals", required = false)
  var intervals: File = _

  // ----- Parallelism knobs for the depth-of-coverage jobs -----

  @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
  var scatterCountInput = 0

  @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
  var samplesPerJob = 1

  @Output(doc = "Base name for files to output", shortName = "o", required = true)
  var outputBase: File = _

  // ----- Depth-of-coverage parameters -----

  @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false)
  var MAX_DEPTH = 20000

  @Hidden
  @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false)
  var NUM_BINS = 200

  @Hidden
  @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false)
  var START_BIN = 1

  @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false)
  var minMappingQuality = 0

  // A negative value means "use the built-in default" (see trait WholeMatrixMemoryLimit).
  @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false)
  var wholeMatrixMemory = -1

  // ----- Target-exclusion thresholds (GC content, repeat-masked fraction) -----

  @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false)
  var minTargGC : Double = 0.1

  @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false)
  var maxTargGC : Double = 0.9

  @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false)
  var minTargRepeats : Double = 0.0

  @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false)
  var maxTargRepeats : Double = 0.1

  // ----- Optional BAM-sample-ID remapping for the merged depth matrix -----

  @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false)
  var sampleIDsMap: String = ""

  @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false)
  var sampleIDsMapFromColumn = 1

  @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false)
  var sampleIDsMapToColumn = 2

  // ----- Strings passed verbatim to the corresponding xhmm steps -----

  @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false)
  var targetSampleFiltersString: String = ""

  @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false)
  var PCAnormalizeMethodString: String = ""

  @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false)
  var targetSampleNormalizedFiltersString: String = ""

  @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true)
  var xhmmParamsArg: File = _

  @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false)
  var discoverCommandLineParams: String = ""

  @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false)
  var genotypeCommandLineParams: String = ""

  // ----- Optional sub-segment genotyping -----

  @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false)
  var genotypeSubsegments: Boolean = false

  @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false)
  var maxTargetsInSubsegment = 30

  @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false)
  var subsegmentGenotypeThreshold = 20.0

  @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
  var longJobQueue: String = ""

  // ----- Suffix conventions for intermediate and final output files -----

  val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list"

  val RD_OUTPUT_SUFFIX: String = ".RD.txt"

  val TARGS_GC_SUFFIX = ".locus_GC.txt"
  val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt"

  val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt"
  val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt"

  val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt"
  val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt"
||||
/**
 * Mixin for commands that load the entire read-depth matrix: raises the job's memory
 * limit to the user-specified value, or to 24 GB when none was given (negative sentinel).
 */
trait WholeMatrixMemoryLimit extends CommandLineFunction {
  // Since loading ALL of the data can take significant memory:
  this.memoryLimit = if (wholeMatrixMemory < 0) 24 else wholeMatrixMemory
}
|
||||
|
||||
/** Mixin that routes a command to the user-specified long-running job queue, when one was given. */
trait LongRunTime extends CommandLineFunction {
  if (longJobQueue != "")
    this.jobQueue = longJobQueue
}
|
||||
|
||||
/**
 * Wires the whole pipeline together: prepare targets, run DoC per sample group, merge depths,
 * exclude extreme-GC / high-repeat targets, filter+center, PCA-normalize, z-score,
 * then discover and genotype CNVs.
 */
def script = {
  // Sort/merge the targeted intervals once; every downstream GATK job runs over this list.
  val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile)
  add(prepTargets)

  // Shared GATK settings for every GATK walker launched by this script:
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.intervals :+= prepTargets.out
    this.jarFile = qscript.gatkJarFile
    this.reference_sequence = qscript.referenceFile
    this.logging_level = "INFO"
  }

  val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams))
  val samples: List[String] = sampleToBams.keys.toList
  Console.out.printf("Samples are %s%n", samples)

  // One DoC job per group of samplesPerJob samples.
  // NOTE: docs is built by prepending, so it ends up in reverse group order; this ordering
  // feeds straight into MergeGATKdepths and is deliberately preserved.
  val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase)
  var docs: List[DoC] = List[DoC]()
  for (group <- groups) {
    Console.out.printf("Group is %s%n", group)
    docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs
  }
  addAll(docs)

  val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit
  add(mergeDepths)

  // Collect per-criterion lists of targets to exclude from the read-depth matrix:
  var excludeTargets : List[File] = List[File]()

  if (minTargGC > 0 || maxTargGC < 1) {
    val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs
    calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX
    add(calcGCcontents)

    val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC)
    add(excludeTargetsBasedOnGC)
    excludeTargets ::= excludeTargetsBasedOnGC.out
  }

  /**
   * Shell pipeline (via Plink/Seq) that computes, per target, the fraction of
   * repeat-masked bases in the reference sequence.
   */
  class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction {
    @Input(doc="")
    var intervals: File = prepTargets.out

    @Output(doc="")
    var out : File = new File(outFile)

    val regFile : String = outputBase.getPath + ".targets.reg"
    val locDB : String = outputBase.getPath + ".targets.LOCDB"

    // Stage 1: start clean, 2: intervals -> .reg table, 3: load into LOCDB,
    // 4: per-locus repeat stats, 5: join stats back onto the interval list.
    val removeFiles = "rm -f " + regFile + " " + locDB
    val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile
    val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load"
    val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats"
    val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out

    var command: String =
      removeFiles +
      " && " + createRegFile +
      " && " + createLOCDB +
      " && " + calcRepeatMaskedPercent +
      " && " + extractRepeatMaskedPercent

    def commandLine = command

    override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command
  }

  if (minTargRepeats > 0 || maxTargRepeats < 1) {
    val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX)
    add(calcRepeatComplexity)

    val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats)
    add(excludeTargetsBasedOnRepeats)
    excludeTargets ::= excludeTargetsBasedOnRepeats.out
  }

  // Core XHMM matrix pipeline: filter+center -> PCA -> normalize -> filter+z-score.
  val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets)
  add(filterCenterDepths)

  val pca = new PCA(filterCenterDepths.filteredCentered)
  add(pca)

  val normalize = new Normalize(pca)
  add(normalize)

  val filterZscore = new FilterAndZscoreNormalized(normalize.normalized)
  add(filterZscore)

  // Re-filter the ORIGINAL depths so they line up with the normalized matrix:
  val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore)
  add(filterOriginal)

  val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered)
  add(discover)

  val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
  add(genotype)

  if (genotypeSubsegments) {
    val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
    add(genotypeSegs)
  }
}
|
||||
|
||||
/**
 * In-process step that reads a two-column (locus, value) table and writes out the loci whose
 * value lies strictly outside [minVal, maxVal] — i.e., the targets to EXCLUDE downstream.
 *
 * Fix over the original: the PrintWriter is now closed in a finally block, so the output file
 * handle is not leaked when an unexpected exception escapes the read loop. The redundant
 * `case e: Exception => throw e` was dropped (non-NumberFormatException errors still propagate).
 */
class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction {
  @Input(doc="")
  var locus_value : File = locus_valueIn

  @Output(doc="")
  var out : File = new File(outputBase.getPath + outSuffix)

  def run = {
    val outWriter = new PrintWriter(new PrintStream(out))
    try {
      val elems = asScalaIterator(new XReadLines(locus_value))

      while (elems.hasNext) {
        val line = elems.next
        val splitLine = line.split("\\s+")
        val locus = splitLine(0)
        val locValStr = splitLine(1)
        try {
          val locVal = locValStr.toDouble
          if (locVal < minVal || locVal > maxVal)
            outWriter.printf("%s%n", locus)
        }
        catch {
          // Skip (with a note) non-numeric entries rather than aborting the whole step:
          case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus)
        }
      }
    }
    finally {
      // Always release the file handle, even on failure (fixes a writer leak).
      outWriter.close()
    }
  }
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" to drop excluded targets/samples from the raw read-depth matrix and
 * mean-center the remaining targets; also records which targets/samples were filtered out.
 *
 * Fix over the original: the exclusion flags are built with mkString instead of reduceLeft —
 * reduceLeft throws UnsupportedOperationException on an empty list, which happened whenever
 * both the GC and repeat-content target filters were disabled (empty excludeTargets).
 */
class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val excludeTargets = excludeTargetsIn

  @Output
  val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX)
  @Output
  val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX)
  @Output
  val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    " --centerData --centerType target" +
    " -o " + filteredCentered +
    " --outputExcludedTargets " + filteredTargets +
    " --outputExcludedSamples " + filteredSamples
  // mkString (not reduceLeft) so an empty exclusion list contributes no flags instead of throwing:
  command += excludeTargets.map(u => " --excludeTargets " + u).mkString
  if (targetSampleFiltersString != "")
    command += " " + targetSampleFiltersString

  def commandLine = command

  override def description = "Filters samples and targets and then mean-centers the targets: " + command
}
|
||||
|
||||
/** Runs "xhmm --PCA" on the filtered, mean-centered matrix, producing PC / PC_SD / loadings files. */
class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  // Common prefix shared by all three PCA output files (and reused by Normalize):
  val PCAbase: String = outputBase.getPath + ".RD_PCA"

  @Output
  val outPC: File = new File(PCAbase + ".PC.txt")
  @Output
  val outPC_SD: File = new File(PCAbase + ".PC_SD.txt")
  @Output
  val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt")

  var command: String = xhmmExec + " --PCA" + " -r " + input + " --PCAfiles " + PCAbase

  def commandLine = command

  override def description = "Runs PCA on mean-centered data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --normalize": removes the top principal components from the PCA step's input
 * matrix. Declares the three PCA output files as @Input so Queue orders the jobs correctly.
 */
class Normalize(pca: PCA) extends CommandLineFunction {
  @Input(doc = "")
  val input = pca.input

  @Input(doc = "")
  val inPC = pca.outPC

  @Input(doc = "")
  val inPC_SD = pca.outPC_SD

  @Input(doc = "")
  val inPC_LOADINGS = pca.outPC_LOADINGS

  @Output
  val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt")

  var command: String =
    xhmmExec + " --normalize" +
    " -r " + input +
    " --PCAfiles " + pca.PCAbase +
    " --normalizeOutput " + normalized
  // Optional user-supplied normalization parameters are appended verbatim:
  if (PCAnormalizeMethodString != "")
    command += " " + PCAnormalizeMethodString

  def commandLine = command

  override def description = "Normalizes mean-centered data using PCA information: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" on the PCA-normalized matrix: filters targets/samples, then centers
 * and z-scores the data BY SAMPLE; records which targets/samples were dropped.
 */
class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  @Output
  val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX)
  @Output
  val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX)
  @Output
  val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    " --centerData --centerType sample --zScoreData" +
    " -o " + filteredZscored +
    " --outputExcludedTargets " + filteredTargets +
    " --outputExcludedSamples " + filteredSamples
  // Optional user-supplied filter parameters are appended verbatim:
  if (targetSampleNormalizedFiltersString != "")
    command += " " + targetSampleNormalizedFiltersString

  def commandLine = command

  override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" on the ORIGINAL read-depth matrix, applying the exclusion lists produced
 * by both filtering stages so the raw depths line up row/column-wise with the normalized data.
 */
class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  // Exclusion lists from both upstream filtering stages (always exactly two of each):
  @Input(doc = "")
  val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u))

  @Input(doc = "")
  val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u))

  @Output
  val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    targFilters.map(u => " --excludeTargets " + u).mkString +
    sampFilters.map(u => " --excludeSamples " + u).mkString +
    " -o " + sameFiltered

  def commandLine = command

  override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --discover" on the z-scored matrix (with matched original depths), producing the
 * .xcnv call table, an auxiliary table, and per-state (DIP/DEL/DUP) posterior matrices.
 */
class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val xhmmParams = xhmmParamsArg

  @Input(doc = "")
  val origRD = origRDParam

  @Output
  val xcnv: File = new File(outputBase.getPath + ".xcnv")

  @Output
  val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv")

  // Prefix for the three posterior-probability output files below:
  val posteriorsBase = outputBase.getPath

  @Output
  val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt")

  @Output
  val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt")

  @Output
  val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt")

  var command: String =
    xhmmExec + " --discover" +
    " -p " + xhmmParams +
    " -r " + input +
    " -R " + origRD +
    " -c " + xcnv +
    " -a " + aux_xcnv +
    " -s " + posteriorsBase +
    " " + discoverCommandLineParams

  def commandLine = command

  override def description = "Discovers CNVs in normalized data: " + command
}
|
||||
|
||||
/**
 * Common base for the genotyping steps: builds the shared "xhmm --genotype" command line;
 * concrete subclasses append their own output flags (and declare the @Output VCF).
 */
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val xhmmParams = xhmmParamsArg

  @Input(doc = "")
  val origRD = origRDParam

  @Input(doc = "")
  val inXcnv = xcnv

  // Subclasses extend this with their output-specific flags; hence a var.
  var command: String =
    xhmmExec + " --genotype" +
    " -p " + xhmmParams +
    " -r " + input +
    " -g " + inXcnv +
    " -F " + referenceFile +
    " -R " + origRD +
    " " + genotypeCommandLineParams
}
|
||||
|
||||
/** Genotypes the discovered CNV regions across all samples, writing a single VCF. */
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
  @Output
  val vcf: File = new File(outputBase.getPath + ".vcf")

  command += " -v " + vcf

  def commandLine = command

  override def description = "Genotypes discovered CNVs in all samples: " + command
}
|
||||
|
||||
/**
 * Genotypes the discovered CNVs AND all their sub-segments (up to maxTargetsInSubsegment
 * targets each), writing a separate ".subsegments.vcf".
 */
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
  @Output
  val vcf: File = new File(outputBase.getPath + ".subsegments.vcf")

  command +=
    " -v " + vcf +
    " --subsegments" +
    " --maxTargetsInSubsegment " + maxTargetsInSubsegment +
    " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold

  def commandLine = command

  override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command
}
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
package org.broadinstitute.sting.queue.util
|
||||
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK}
|
||||
import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType
|
||||
import org.broadinstitute.sting.commandline.{Input, Gather, Output}
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
|
||||
package object DoC {
|
||||
/**
 * GATK DepthOfCoverage job over a group of BAMs, scatter/gather-able over intervals.
 * Per-base output is omitted; the per-interval sample summary is the artifact consumed
 * downstream (declared as @Output so Queue gathers the scattered pieces).
 */
class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction {
  val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary"

  // So that the output files of this DoC run get deleted once they're used further downstream:
  this.isIntermediate = true

  this.analysis_type = "DepthOfCoverage"

  this.input_file = bams

  // Cap coverage via GATK's per-sample down-sampling:
  this.downsample_to_coverage = Some(MAX_DEPTH)
  this.downsampling_type = DownsampleType.BY_SAMPLE

  this.scatterCount = scatterCountInput
  this.scatterClass = classOf[IntervalScatterFunction]

  // HACK for DoC to work properly within Queue:
  @Output
  @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
  var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX)

  override def commandLine = super.commandLine +
    " --omitDepthOutputAtEachBase" +
    " --omitLocusTable" +
    " --minBaseQuality 0" +
    " --minMappingQuality " + minMappingQuality +
    " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS +
    minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).mkString +
    " --includeRefNSites" +
    " -o " + DoC_output

  override def shortDescription = "DoC: " + DoC_output
}
|
||||
|
||||
/**
 * DoC variant that KEEPS per-base depth output: it simply strips the
 * "--omitDepthOutputAtEachBase" flag from the parent's command line.
 */
class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) {
  // HACK for DoC to work properly within Queue:
  @Output
  @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
  var outPrefix = DoC_output

  override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "")
}
|
||||
|
||||
/**
 * Partitions the sample list into consecutive chunks of at most samplesPerJob samples,
 * wrapping each chunk (with its BAMs) in a Group named "group0", "group1", ...
 */
def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = {

  // Recursive helper: peel off one chunk per call, numbering groups from `count`.
  def buildDoCgroupsHelper(pending: List[String], count: Int): List[Group] =
    pending.splitAt(samplesPerJob) match {
      case (Nil, _) =>
        Nil
      case (subsamples, remaining) =>
        new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1)
    }

  buildDoCgroupsHelper(samples, 0)
}
|
||||
|
||||
// A group has a list of samples and bam files to use for DoC
class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) {
  // getName() just includes the file name WITHOUT the path:
  val groupOutputName = name + "." + outputBase.getName

  // Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! :
  //def DoC_output = new File(outputBase.getParentFile(), groupOutputName)

  // Output path is relative (no parent directory) — see the note above.
  def DoC_output = new File(groupOutputName)

  override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams)
}
|
||||
|
||||
/**
 * Runs "xhmm --mergeGATKdepths" to combine per-group DoC outputs (same loci) into one matrix,
 * optionally remapping sample IDs, setting read-depth precision, and transposing the output.
 *
 * Fix over the original: the per-input flags are built with mkString instead of reduceLeft,
 * which threw UnsupportedOperationException if DoCsToCombine was ever empty; the Option match
 * for rdPrecisionArg is replaced by the equivalent idiomatic foreach.
 */
class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction {
  @Input(doc = "")
  var inputDoCfiles: List[File] = DoCsToCombine

  @Output
  val mergedDoC: File = new File(outFile)

  var command: String =
    xhmmExec + " --mergeGATKdepths" +
    inputDoCfiles.map(input => " --GATKdepths " + input).mkString +
    " --columnSuffix " + columnSuffix +
    " -o " + mergedDoC
  // Optional sample-ID remapping (old-ID column -> new-ID column in the map file):
  if (sampleIDsMap != "")
    command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn
  // Optional number of decimal digits for output read depths:
  rdPrecisionArg.foreach(rdPrecision => command += " --rdPrecision " + rdPrecision)
  if (outputTargetsBySamples)
    command += " --outputTargetsBySamples"

  def commandLine = command

  override def description = "Combines DoC outputs for multiple samples (at same loci): " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --prepareTargets": sorts all target intervals against the reference,
 * merges overlapping ones, and writes the merged interval list.
 */
class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction {
  @Input(doc = "List of files containing targeted intervals to be prepared and merged")
  var inIntervals: List[File] = intervalsIn

  @Output(doc = "The merged intervals file to write to")
  var out: File = new File(outIntervals)

  var command: String =
    xhmmExec + " --prepareTargets" +
    " -F " + referenceFile +
    inIntervals.map(intervFile => " --targets " + intervFile).mkString +
    " --mergedTargets " + out

  def commandLine = command

  override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command
}
|
||||
}
|
||||
Loading…
Reference in New Issue