Begin moving XHMM scripts to public
This commit is contained in:
parent
1ec137a40c
commit
cde4f037d3
|
|
@ -0,0 +1,499 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.CNV
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.util.VCF_BAM_utilities
|
||||
import org.broadinstitute.sting.queue.util.DoC._
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import java.io.{PrintStream, PrintWriter}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions._
|
||||
|
||||
/**
 * Queue pipeline that runs XHMM (exome hidden Markov model) CNV calling from a set of BAMs:
 * depth-of-coverage -> target/sample filtering and mean-centering -> PCA normalization ->
 * z-scoring -> CNV discovery -> CNV genotyping (optionally including sub-segments).
 */
class xhmmCNVpipeline extends QScript {
  qscript =>

  // ----- Required inputs and executables -----

  @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true)
  var bams: File = _

  @Argument(doc = "gatk jar file", shortName = "J", required = true)
  var gatkJarFile: File = _

  @Argument(doc = "xhmm executable file", shortName = "xhmmExec", required = true)
  var xhmmExec: File = _

  @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true)
  var pseqExec: File = _

  @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true)
  var pseqSeqDB: String = _

  @Argument(shortName = "R", doc = "ref", required = true)
  var referenceFile: File = _

  @Argument(shortName = "L", doc = "Intervals", required = false)
  var intervals: File = _

  // ----- Parallelism knobs for the depth-of-coverage jobs -----

  @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
  var scatterCountInput = 0

  @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
  var samplesPerJob = 1

  @Output(doc = "Base name for files to output", shortName = "o", required = true)
  var outputBase: File = _

  // ----- Depth-of-coverage parameters -----

  @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false)
  var MAX_DEPTH = 20000

  @Hidden
  @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false)
  var NUM_BINS = 200

  @Hidden
  @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false)
  var START_BIN = 1

  @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false)
  var minMappingQuality = 0

  // A negative value means "use the built-in default" (see trait WholeMatrixMemoryLimit).
  @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false)
  var wholeMatrixMemory = -1

  // ----- Target-exclusion thresholds (GC content, repeat-masked fraction) -----

  @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false)
  var minTargGC : Double = 0.1

  @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false)
  var maxTargGC : Double = 0.9

  @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false)
  var minTargRepeats : Double = 0.0

  @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false)
  var maxTargRepeats : Double = 0.1

  // ----- Optional BAM-sample-ID remapping for the merged depth matrix -----

  @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false)
  var sampleIDsMap: String = ""

  @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false)
  var sampleIDsMapFromColumn = 1

  @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false)
  var sampleIDsMapToColumn = 2

  // ----- Strings passed verbatim to the corresponding xhmm steps -----

  @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false)
  var targetSampleFiltersString: String = ""

  @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false)
  var PCAnormalizeMethodString: String = ""

  @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false)
  var targetSampleNormalizedFiltersString: String = ""

  @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true)
  var xhmmParamsArg: File = _

  @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false)
  var discoverCommandLineParams: String = ""

  @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false)
  var genotypeCommandLineParams: String = ""

  // ----- Optional sub-segment genotyping -----

  @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false)
  var genotypeSubsegments: Boolean = false

  @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false)
  var maxTargetsInSubsegment = 30

  @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false)
  var subsegmentGenotypeThreshold = 20.0

  @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
  var longJobQueue: String = ""

  // ----- Suffix conventions for intermediate and final output files -----

  val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list"

  val RD_OUTPUT_SUFFIX: String = ".RD.txt"

  val TARGS_GC_SUFFIX = ".locus_GC.txt"
  val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt"

  val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt"
  val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt"

  val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt"
  val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt"
||||
/**
 * Mixin for commands that load the entire read-depth matrix: raises the job's memory
 * limit to the user-specified value, or to 24 GB when none was given (negative sentinel).
 */
trait WholeMatrixMemoryLimit extends CommandLineFunction {
  // Since loading ALL of the data can take significant memory:
  this.memoryLimit = if (wholeMatrixMemory < 0) 24 else wholeMatrixMemory
}
|
||||
|
||||
/** Mixin that routes a command to the user-specified long-running job queue, when one was given. */
trait LongRunTime extends CommandLineFunction {
  if (longJobQueue != "")
    this.jobQueue = longJobQueue
}
|
||||
|
||||
/**
 * Wires the whole pipeline together: prepare targets, run DoC per sample group, merge depths,
 * exclude extreme-GC / high-repeat targets, filter+center, PCA-normalize, z-score,
 * then discover and genotype CNVs.
 */
def script = {
  // Sort/merge the targeted intervals once; every downstream GATK job runs over this list.
  val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile)
  add(prepTargets)

  // Shared GATK settings for every GATK walker launched by this script:
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.intervals :+= prepTargets.out
    this.jarFile = qscript.gatkJarFile
    this.reference_sequence = qscript.referenceFile
    this.logging_level = "INFO"
  }

  val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams))
  val samples: List[String] = sampleToBams.keys.toList
  Console.out.printf("Samples are %s%n", samples)

  // One DoC job per group of samplesPerJob samples.
  // NOTE: docs is built by prepending, so it ends up in reverse group order; this ordering
  // feeds straight into MergeGATKdepths and is deliberately preserved.
  val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase)
  var docs: List[DoC] = List[DoC]()
  for (group <- groups) {
    Console.out.printf("Group is %s%n", group)
    docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs
  }
  addAll(docs)

  val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit
  add(mergeDepths)

  // Collect per-criterion lists of targets to exclude from the read-depth matrix:
  var excludeTargets : List[File] = List[File]()

  if (minTargGC > 0 || maxTargGC < 1) {
    val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs
    calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX
    add(calcGCcontents)

    val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC)
    add(excludeTargetsBasedOnGC)
    excludeTargets ::= excludeTargetsBasedOnGC.out
  }

  /**
   * Shell pipeline (via Plink/Seq) that computes, per target, the fraction of
   * repeat-masked bases in the reference sequence.
   */
  class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction {
    @Input(doc="")
    var intervals: File = prepTargets.out

    @Output(doc="")
    var out : File = new File(outFile)

    val regFile : String = outputBase.getPath + ".targets.reg"
    val locDB : String = outputBase.getPath + ".targets.LOCDB"

    // Stage 1: start clean, 2: intervals -> .reg table, 3: load into LOCDB,
    // 4: per-locus repeat stats, 5: join stats back onto the interval list.
    val removeFiles = "rm -f " + regFile + " " + locDB
    val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile
    val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load"
    val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats"
    val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out

    var command: String =
      removeFiles +
      " && " + createRegFile +
      " && " + createLOCDB +
      " && " + calcRepeatMaskedPercent +
      " && " + extractRepeatMaskedPercent

    def commandLine = command

    override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command
  }

  if (minTargRepeats > 0 || maxTargRepeats < 1) {
    val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX)
    add(calcRepeatComplexity)

    val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats)
    add(excludeTargetsBasedOnRepeats)
    excludeTargets ::= excludeTargetsBasedOnRepeats.out
  }

  // Core XHMM matrix pipeline: filter+center -> PCA -> normalize -> filter+z-score.
  val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets)
  add(filterCenterDepths)

  val pca = new PCA(filterCenterDepths.filteredCentered)
  add(pca)

  val normalize = new Normalize(pca)
  add(normalize)

  val filterZscore = new FilterAndZscoreNormalized(normalize.normalized)
  add(filterZscore)

  // Re-filter the ORIGINAL depths so they line up with the normalized matrix:
  val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore)
  add(filterOriginal)

  val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered)
  add(discover)

  val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
  add(genotype)

  if (genotypeSubsegments) {
    val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
    add(genotypeSegs)
  }
}
|
||||
|
||||
/**
 * In-process step that reads a two-column (locus, value) table and writes out the loci whose
 * value lies strictly outside [minVal, maxVal] — i.e., the targets to EXCLUDE downstream.
 *
 * Fix over the original: the PrintWriter is now closed in a finally block, so the output file
 * handle is not leaked when an unexpected exception escapes the read loop. The redundant
 * `case e: Exception => throw e` was dropped (non-NumberFormatException errors still propagate).
 */
class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction {
  @Input(doc="")
  var locus_value : File = locus_valueIn

  @Output(doc="")
  var out : File = new File(outputBase.getPath + outSuffix)

  def run = {
    val outWriter = new PrintWriter(new PrintStream(out))
    try {
      val elems = asScalaIterator(new XReadLines(locus_value))

      while (elems.hasNext) {
        val line = elems.next
        val splitLine = line.split("\\s+")
        val locus = splitLine(0)
        val locValStr = splitLine(1)
        try {
          val locVal = locValStr.toDouble
          if (locVal < minVal || locVal > maxVal)
            outWriter.printf("%s%n", locus)
        }
        catch {
          // Skip (with a note) non-numeric entries rather than aborting the whole step:
          case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus)
        }
      }
    }
    finally {
      // Always release the file handle, even on failure (fixes a writer leak).
      outWriter.close()
    }
  }
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" to drop excluded targets/samples from the raw read-depth matrix and
 * mean-center the remaining targets; also records which targets/samples were filtered out.
 *
 * Fix over the original: the exclusion flags are built with mkString instead of reduceLeft —
 * reduceLeft throws UnsupportedOperationException on an empty list, which happened whenever
 * both the GC and repeat-content target filters were disabled (empty excludeTargets).
 */
class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val excludeTargets = excludeTargetsIn

  @Output
  val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX)
  @Output
  val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX)
  @Output
  val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    " --centerData --centerType target" +
    " -o " + filteredCentered +
    " --outputExcludedTargets " + filteredTargets +
    " --outputExcludedSamples " + filteredSamples
  // mkString (not reduceLeft) so an empty exclusion list contributes no flags instead of throwing:
  command += excludeTargets.map(u => " --excludeTargets " + u).mkString
  if (targetSampleFiltersString != "")
    command += " " + targetSampleFiltersString

  def commandLine = command

  override def description = "Filters samples and targets and then mean-centers the targets: " + command
}
|
||||
|
||||
/** Runs "xhmm --PCA" on the filtered, mean-centered matrix, producing PC / PC_SD / loadings files. */
class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  // Common prefix shared by all three PCA output files (and reused by Normalize):
  val PCAbase: String = outputBase.getPath + ".RD_PCA"

  @Output
  val outPC: File = new File(PCAbase + ".PC.txt")
  @Output
  val outPC_SD: File = new File(PCAbase + ".PC_SD.txt")
  @Output
  val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt")

  var command: String = xhmmExec + " --PCA" + " -r " + input + " --PCAfiles " + PCAbase

  def commandLine = command

  override def description = "Runs PCA on mean-centered data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --normalize": removes the top principal components from the PCA step's input
 * matrix. Declares the three PCA output files as @Input so Queue orders the jobs correctly.
 */
class Normalize(pca: PCA) extends CommandLineFunction {
  @Input(doc = "")
  val input = pca.input

  @Input(doc = "")
  val inPC = pca.outPC

  @Input(doc = "")
  val inPC_SD = pca.outPC_SD

  @Input(doc = "")
  val inPC_LOADINGS = pca.outPC_LOADINGS

  @Output
  val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt")

  var command: String =
    xhmmExec + " --normalize" +
    " -r " + input +
    " --PCAfiles " + pca.PCAbase +
    " --normalizeOutput " + normalized
  // Optional user-supplied normalization parameters are appended verbatim:
  if (PCAnormalizeMethodString != "")
    command += " " + PCAnormalizeMethodString

  def commandLine = command

  override def description = "Normalizes mean-centered data using PCA information: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" on the PCA-normalized matrix: filters targets/samples, then centers
 * and z-scores the data BY SAMPLE; records which targets/samples were dropped.
 */
class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  @Output
  val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX)
  @Output
  val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX)
  @Output
  val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    " --centerData --centerType sample --zScoreData" +
    " -o " + filteredZscored +
    " --outputExcludedTargets " + filteredTargets +
    " --outputExcludedSamples " + filteredSamples
  // Optional user-supplied filter parameters are appended verbatim:
  if (targetSampleNormalizedFiltersString != "")
    command += " " + targetSampleNormalizedFiltersString

  def commandLine = command

  override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --matrix" on the ORIGINAL read-depth matrix, applying the exclusion lists produced
 * by both filtering stages so the raw depths line up row/column-wise with the normalized data.
 */
class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit {
  @Input(doc = "")
  val input = inputParam

  // Exclusion lists from both upstream filtering stages (always exactly two of each):
  @Input(doc = "")
  val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u))

  @Input(doc = "")
  val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u))

  @Output
  val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX)

  var command: String =
    xhmmExec + " --matrix" +
    " -r " + input +
    targFilters.map(u => " --excludeTargets " + u).mkString +
    sampFilters.map(u => " --excludeSamples " + u).mkString +
    " -o " + sameFiltered

  def commandLine = command

  override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --discover" on the z-scored matrix (with matched original depths), producing the
 * .xcnv call table, an auxiliary table, and per-state (DIP/DEL/DUP) posterior matrices.
 */
class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val xhmmParams = xhmmParamsArg

  @Input(doc = "")
  val origRD = origRDParam

  @Output
  val xcnv: File = new File(outputBase.getPath + ".xcnv")

  @Output
  val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv")

  // Prefix for the three posterior-probability output files below:
  val posteriorsBase = outputBase.getPath

  @Output
  val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt")

  @Output
  val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt")

  @Output
  val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt")

  var command: String =
    xhmmExec + " --discover" +
    " -p " + xhmmParams +
    " -r " + input +
    " -R " + origRD +
    " -c " + xcnv +
    " -a " + aux_xcnv +
    " -s " + posteriorsBase +
    " " + discoverCommandLineParams

  def commandLine = command

  override def description = "Discovers CNVs in normalized data: " + command
}
|
||||
|
||||
/**
 * Common base for the genotyping steps: builds the shared "xhmm --genotype" command line;
 * concrete subclasses append their own output flags (and declare the @Output VCF).
 */
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
  @Input(doc = "")
  val input = inputParam

  @Input(doc = "")
  val xhmmParams = xhmmParamsArg

  @Input(doc = "")
  val origRD = origRDParam

  @Input(doc = "")
  val inXcnv = xcnv

  // Subclasses extend this with their output-specific flags; hence a var.
  var command: String =
    xhmmExec + " --genotype" +
    " -p " + xhmmParams +
    " -r " + input +
    " -g " + inXcnv +
    " -F " + referenceFile +
    " -R " + origRD +
    " " + genotypeCommandLineParams
}
|
||||
|
||||
/** Genotypes the discovered CNV regions across all samples, writing a single VCF. */
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
  @Output
  val vcf: File = new File(outputBase.getPath + ".vcf")

  command += " -v " + vcf

  def commandLine = command

  override def description = "Genotypes discovered CNVs in all samples: " + command
}
|
||||
|
||||
/**
 * Genotypes the discovered CNVs AND all their sub-segments (up to maxTargetsInSubsegment
 * targets each), writing a separate ".subsegments.vcf".
 */
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
  @Output
  val vcf: File = new File(outputBase.getPath + ".subsegments.vcf")

  command +=
    " -v " + vcf +
    " --subsegments" +
    " --maxTargetsInSubsegment " + maxTargetsInSubsegment +
    " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold

  def commandLine = command

  override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command
}
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
package org.broadinstitute.sting.queue.util
|
||||
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK}
|
||||
import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType
|
||||
import org.broadinstitute.sting.commandline.{Input, Gather, Output}
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
|
||||
package object DoC {
|
||||
/**
 * GATK DepthOfCoverage job over a group of BAMs, scatter/gather-able over intervals.
 * Per-base output is omitted; the per-interval sample summary is the artifact consumed
 * downstream (declared as @Output so Queue gathers the scattered pieces).
 */
class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction {
  val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary"

  // So that the output files of this DoC run get deleted once they're used further downstream:
  this.isIntermediate = true

  this.analysis_type = "DepthOfCoverage"

  this.input_file = bams

  // Cap coverage via GATK's per-sample down-sampling:
  this.downsample_to_coverage = Some(MAX_DEPTH)
  this.downsampling_type = DownsampleType.BY_SAMPLE

  this.scatterCount = scatterCountInput
  this.scatterClass = classOf[IntervalScatterFunction]

  // HACK for DoC to work properly within Queue:
  @Output
  @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
  var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX)

  override def commandLine = super.commandLine +
    " --omitDepthOutputAtEachBase" +
    " --omitLocusTable" +
    " --minBaseQuality 0" +
    " --minMappingQuality " + minMappingQuality +
    " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS +
    minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).mkString +
    " --includeRefNSites" +
    " -o " + DoC_output

  override def shortDescription = "DoC: " + DoC_output
}
|
||||
|
||||
/**
 * DoC variant that KEEPS per-base depth output: it simply strips the
 * "--omitDepthOutputAtEachBase" flag from the parent's command line.
 */
class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) {
  // HACK for DoC to work properly within Queue:
  @Output
  @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
  var outPrefix = DoC_output

  override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "")
}
|
||||
|
||||
/**
 * Partitions the sample list into consecutive chunks of at most samplesPerJob samples,
 * wrapping each chunk (with its BAMs) in a Group named "group0", "group1", ...
 */
def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = {

  // Recursive helper: peel off one chunk per call, numbering groups from `count`.
  def buildDoCgroupsHelper(pending: List[String], count: Int): List[Group] =
    pending.splitAt(samplesPerJob) match {
      case (Nil, _) =>
        Nil
      case (subsamples, remaining) =>
        new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1)
    }

  buildDoCgroupsHelper(samples, 0)
}
|
||||
|
||||
// A group has a list of samples and bam files to use for DoC
class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) {
  // getName() just includes the file name WITHOUT the path:
  val groupOutputName = name + "." + outputBase.getName

  // Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! :
  //def DoC_output = new File(outputBase.getParentFile(), groupOutputName)

  // Output path is relative (no parent directory) — see the note above.
  def DoC_output = new File(groupOutputName)

  override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams)
}
|
||||
|
||||
/**
 * Runs "xhmm --mergeGATKdepths" to combine per-group DoC outputs (same loci) into one matrix,
 * optionally remapping sample IDs, setting read-depth precision, and transposing the output.
 *
 * Fix over the original: the per-input flags are built with mkString instead of reduceLeft,
 * which threw UnsupportedOperationException if DoCsToCombine was ever empty; the Option match
 * for rdPrecisionArg is replaced by the equivalent idiomatic foreach.
 */
class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction {
  @Input(doc = "")
  var inputDoCfiles: List[File] = DoCsToCombine

  @Output
  val mergedDoC: File = new File(outFile)

  var command: String =
    xhmmExec + " --mergeGATKdepths" +
    inputDoCfiles.map(input => " --GATKdepths " + input).mkString +
    " --columnSuffix " + columnSuffix +
    " -o " + mergedDoC
  // Optional sample-ID remapping (old-ID column -> new-ID column in the map file):
  if (sampleIDsMap != "")
    command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn
  // Optional number of decimal digits for output read depths:
  rdPrecisionArg.foreach(rdPrecision => command += " --rdPrecision " + rdPrecision)
  if (outputTargetsBySamples)
    command += " --outputTargetsBySamples"

  def commandLine = command

  override def description = "Combines DoC outputs for multiple samples (at same loci): " + command
}
|
||||
|
||||
/**
 * Runs "xhmm --prepareTargets": sorts all target intervals against the reference,
 * merges overlapping ones, and writes the merged interval list.
 */
class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction {
  @Input(doc = "List of files containing targeted intervals to be prepared and merged")
  var inIntervals: List[File] = intervalsIn

  @Output(doc = "The merged intervals file to write to")
  var out: File = new File(outIntervals)

  var command: String =
    xhmmExec + " --prepareTargets" +
    " -F " + referenceFile +
    inIntervals.map(intervFile => " --targets " + intervFile).mkString +
    " --mergedTargets " + out

  def commandLine = command

  override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command
}
|
||||
}
|
||||
Loading…
Reference in New Issue