Add option to genotype additional user-defined interval lists

Add Qscript 'ONLY_GENOTYPE_xhmmCNVpipeline' to genotype additional user-defined interval lists

Add Qscript 'ONLY_GENOTYPE_xhmmCNVpipeline' to genotype additional user-defined interval lists (and similar option to Qscript 'xhmmCNVpipeline')
This commit is contained in:
Menachem Fromer 2014-12-21 01:19:03 -05:00
parent a4a9e73ec8
commit 11cd0080c3
4 changed files with 272 additions and 119 deletions

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.queue.qscripts.CNV
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.util.VCF_BAM_utilities
import org.broadinstitute.gatk.queue.extensions.gatk.DoC._
import org.broadinstitute.gatk.utils.commandline._
import java.io.{File, PrintStream, PrintWriter}
import org.broadinstitute.gatk.utils.text.XReadLines
import collection.JavaConversions._
import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils
import org.broadinstitute.gatk.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction, ScatterGatherableFunction}
import org.broadinstitute.gatk.queue.function.{CommandLineFunction, InProcessFunction}
import org.broadinstitute.gatk.utils.io.IOUtils
class ONLY_GENOTYPE_xhmmCNVpipeline extends QScript {
qscript =>
@Input(doc = "bam input, as as a list of .bam files, or a list of bam files with sample IDs to be used ( as specified at https://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_CommandLineGATK.html#--sample_rename_mapping_file )", shortName = "I", required = true)
var bams: File = _
@Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true)
var xhmmExec: File = _
@Input(shortName = "R", doc = "ref", required = true)
var referenceFile: File = _
@Argument(doc = "Samples to run together for DoC, CNV discovery, and CNV genotyping. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
var samplesPerJob = 1
@Output(doc = "Base name for files to output", shortName = "o", required = true)
var outputBase: File = _
@Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true)
var xhmmParamsArg: File = _
@Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false)
var genotypeCommandLineParams: String = ""
@Argument(shortName = "addGenotypeRegions", doc = "Additional interval list files to be genotyped", required = false)
var addGenotypeRegions: List[File] = List[File]()
@Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
var longJobQueue: String = ""
@Argument(shortName = "filteredZscored", doc = "File of PCA-normalized read depths, after filtering and Z-score calculation", required = true)
var filteredZscored: File = _
@Argument(shortName = "originalSameFiltered", doc = "File of original read depths, using same filters (samples and targets) as Z-score matrix [filteredZscored argument]", required = true)
var originalSameFiltered: File = _
trait LongRunTime extends CommandLineFunction {
if (longJobQueue != "")
this.jobQueue = longJobQueue
}
def script = {
val parseMixedInputBamList = parseBamListWithOptionalSampleMappings(bams)
val processMixedInputBamList = new ProcessBamListWithOptionalSampleMappings(parseMixedInputBamList, outputBase.getPath)
add(processMixedInputBamList)
val samples: List[String] = parseMixedInputBamList.sampleToBams.keys.toList
Console.out.printf("Samples are %s%n", samples)
val groups: List[Group] = buildDoCgroups(samples, parseMixedInputBamList.sampleToBams, samplesPerJob, outputBase)
for (group <- groups) {
Console.out.printf("Group is %s%n", group)
}
for (regionsFile <- addGenotypeRegions) {
val genotypeRegions = new GenotypeCNVs(filteredZscored, regionsFile, originalSameFiltered, new File(outputBase.getParent + "/" + regionsFile.getName.replace(".interval_list", "") + "." + outputBase.getName), xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime
add(genotypeRegions)
}
}
}

View File

@ -134,6 +134,9 @@ class xhmmCNVpipeline extends QScript {
@Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false)
var subsegmentGenotypeThreshold = 20.0
@Argument(shortName = "addGenotypeRegions", doc = "Additional interval list files to be genotyped", required = false)
var addGenotypeRegions: List[File] = List[File]()
@Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
var longJobQueue: String = ""
@ -314,41 +317,7 @@ class xhmmCNVpipeline extends QScript {
val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered)
add(discover)
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, outName: String) extends SamplesScatterable(xhmmExec, groups) with LongRunTime {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val xhmmParams = xhmmParamsArg
@Input(doc = "")
val origRD = origRDParam
@Input(doc = "")
val inXcnv = xcnv
@Output
@Gather(classOf[MergeVCFsGatherFunction])
val vcf: File = new File(outName)
override def commandLine =
xhmmExec + " --genotype" +
" -p " + xhmmParams +
" -r " + input +
" -g " + inXcnv +
" -F " + referenceFile +
" -R " + origRD +
" -v " + vcf +
" " + genotypeCommandLineParams +
" " + addCommand
}
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".vcf") {
override def description = "Genotypes discovered CNVs in all samples: " + commandLine
}
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".subsegments.vcf") {
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".subsegments.vcf", xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) {
override def commandLine =
super.commandLine +
" --subsegments" +
@ -358,13 +327,19 @@ class xhmmCNVpipeline extends QScript {
override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + commandLine
}
val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered, outputBase, xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime
add(genotype)
if (genotypeSubsegments) {
val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered, xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime
add(genotypeSegs)
}
addGenotypeRegions :+= prepTargets.out
for (regionsFile <- addGenotypeRegions) {
val genotypeRegions = new GenotypeCNVs(filterZscore.filteredZscored, regionsFile, filterOriginal.sameFiltered, new File(outputBase.getParent + "/" + regionsFile.getName.replace(".interval_list", "") + "." + outputBase.getName), xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime
add(genotypeRegions)
}
}
class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction {
@ -531,85 +506,3 @@ class xhmmCNVpipeline extends QScript {
override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command
}
}
abstract class SamplesScatterable(val xhmmExec: File, val groups: List[Group]) extends ScatterGatherableFunction with CommandLineFunction {
this.scatterCount = groups.size
this.scatterClass = classOf[SamplesScatterFunction]
@Input(doc = "", required=false)
var keepSampleIDs: Option[String] = None
def addCommand = if (keepSampleIDs.isDefined) ("--keepSampleIDs " + keepSampleIDs.get) else ""
}
class SamplesScatterFunction extends ScatterFunction with InProcessFunction {
protected var groups: List[Group] = _
override def scatterCount = groups.size
@Output(doc="Scatter function outputs")
var scatterSamples: Seq[File] = Nil
override def init() {
this.groups = this.originalFunction.asInstanceOf[SamplesScatterable].groups
}
override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) {
val scatterPart = IOUtils.absolute(cloneFunction.commandDirectory, "keepSampleIDs.txt")
cloneFunction.setFieldValue("keepSampleIDs", Some(scatterPart))
this.scatterSamples :+= scatterPart
}
override def run() {
if (groups.size != this.scatterSamples.size)
throw new Exception("Internal inconsistency error in scattering jobs")
(groups, this.scatterSamples).zipped foreach {
(group, sampsFile) => {
val sampsWriter = new PrintWriter(new PrintStream(sampsFile))
for (samp <- group.samples) {
try {
sampsWriter.printf("%s%n", samp)
}
catch {
case e: Exception => throw e
}
}
sampsWriter.close
}
}
}
}
trait MergeVCFs extends CommandLineFunction {
var xhmmExec: File = _
@Input(doc = "")
var inputVCFs: List[File] = Nil
@Output
var mergedVCF: File = null
override def commandLine =
xhmmExec + " --mergeVCFs" +
inputVCFs.map(input => " --mergeVCF " + input).reduceLeft(_ + "" + _) +
" -v " + mergedVCF
override def description = "Combines VCF outputs for multiple samples (at same loci): " + commandLine
}
class MergeVCFsGatherFunction extends MergeVCFs with GatherFunction {
override def freezeFieldValues() {
super.freezeFieldValues()
this.xhmmExec = originalFunction.asInstanceOf[SamplesScatterable].xhmmExec
this.inputVCFs = this.gatherParts.toList
this.mergedVCF = this.originalOutput
}
}
class DummyGatherFunction extends InProcessFunction with GatherFunction {
override def run() {}
}

View File

@ -40,6 +40,7 @@ import org.broadinstitute.gatk.utils.downsampling.DownsampleType
// due to ongoing bugs with inner classes/objects in package objects:
// https://issues.scala-lang.org/browse/SI-4344
// https://issues.scala-lang.org/browse/SI-5954
class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int], val sampleRenameMappingFile: Option[File] = None) extends CommandLineGATK with ScatterGatherableFunction {
val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary"

View File

@ -0,0 +1,156 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.queue.extensions.gatk
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk.DoC._
import org.broadinstitute.gatk.utils.commandline._
import java.io.{File, PrintStream, PrintWriter}
import collection.JavaConversions._
import org.broadinstitute.gatk.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction, ScatterGatherableFunction}
import org.broadinstitute.gatk.queue.function.{CommandLineFunction, InProcessFunction}
import org.broadinstitute.gatk.utils.io.IOUtils
// Minimal refactor from a package object to a file full of classes/objects
// due to ongoing bugs with inner classes/objects in package objects:
// https://issues.scala-lang.org/browse/SI-4344
// https://issues.scala-lang.org/browse/SI-5954
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, outName: String, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends SamplesScatterable(xhmmExec, groups) {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val xhmmParams = xhmmParamsArg
@Input(doc = "")
val origRD = origRDParam
@Input(doc = "")
val inXcnv = xcnv
@Output
@Gather(classOf[MergeVCFsGatherFunction])
val vcf: File = new File(outName)
override def commandLine =
xhmmExec + " --genotype" +
" -p " + xhmmParams +
" -r " + input +
" -g " + inXcnv +
" -F " + referenceFile +
" -R " + origRD +
" -v " + vcf +
" " + genotypeCommandLineParams +
" " + addCommand
}
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, genotypeOutputBase: File, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, genotypeOutputBase.getPath + ".vcf", xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) {
override def description = "Genotypes CNV regions in all samples: " + commandLine
}
abstract class SamplesScatterable(val xhmmExec: File, val groups: List[Group]) extends ScatterGatherableFunction with CommandLineFunction {
this.scatterCount = groups.size
this.scatterClass = classOf[SamplesScatterFunction]
@Input(doc = "", required=false)
var keepSampleIDs: Option[String] = None
def addCommand = if (keepSampleIDs.isDefined) ("--keepSampleIDs " + keepSampleIDs.get) else ""
}
class SamplesScatterFunction extends ScatterFunction with InProcessFunction {
protected var groups: List[Group] = _
override def scatterCount = groups.size
@Output(doc="Scatter function outputs")
var scatterSamples: Seq[File] = Nil
override def init() {
this.groups = this.originalFunction.asInstanceOf[SamplesScatterable].groups
}
override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) {
val scatterPart = IOUtils.absolute(cloneFunction.commandDirectory, "keepSampleIDs.txt")
cloneFunction.setFieldValue("keepSampleIDs", Some(scatterPart))
this.scatterSamples :+= scatterPart
}
override def run() {
if (groups.size != this.scatterSamples.size)
throw new Exception("Internal inconsistency error in scattering jobs")
(groups, this.scatterSamples).zipped foreach {
(group, sampsFile) => {
val sampsWriter = new PrintWriter(new PrintStream(sampsFile))
for (samp <- group.samples) {
try {
sampsWriter.printf("%s%n", samp)
}
catch {
case e: Exception => throw e
}
}
sampsWriter.close
}
}
}
}
trait MergeVCFs extends CommandLineFunction {
var xhmmExec: File = _
@Input(doc = "")
var inputVCFs: List[File] = Nil
@Output
var mergedVCF: File = null
override def commandLine =
xhmmExec + " --mergeVCFs" +
inputVCFs.map(input => " --mergeVCF " + input).reduceLeft(_ + "" + _) +
" -v " + mergedVCF
override def description = "Combines VCF outputs for multiple samples (at same loci): " + commandLine
}
class MergeVCFsGatherFunction extends MergeVCFs with GatherFunction {
override def freezeFieldValues() {
super.freezeFieldValues()
this.xhmmExec = originalFunction.asInstanceOf[SamplesScatterable].xhmmExec
this.inputVCFs = this.gatherParts.toList
this.mergedVCF = this.originalOutput
}
}
class DummyGatherFunction extends InProcessFunction with GatherFunction {
override def run() {}
}