gatk-3.8/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala

142 lines
5.8 KiB
Scala
Raw Normal View History

package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import java.io.File
Walkers can now specify a class extending from Gatherer to merge custom output formats. Add @Gather(MyGatherer.class) to the walker @Output. JavaCommandLineFunctions can now specify the classpath+mainclass as an alternative to specifying a path to an executable jar. JCLF by default pass on the current classpath and only require the mainclass be specified by the developer extending the JCLF, relieving the QScript author from having to explicitly specify the jar. Like the Picard MergeSamFiles, GATK engine by default is now run from the current classpath. The GATK can still be overridden via .jarFile or .javaClasspath. Walkers from the GATK package are now also embedded into the Queue package. Updated AnalyzeCovariates to make it easier to guess the main class, AnalyzeCovariates instead of AnalyzeCovariatesCLP. Removed the GATK jar argument from the example QScripts. Removed one of the most FAQ when getting started with Scala/Queue, the use of Option[_] in QScripts: 1) Fixed mistaken assumption with java enums. In java enums can be null so they don't need nullable wrappers. 2) Added syntactic sugar for Nullable primitives to the QScript trait. Any variable defined as Option[Int] can just be assigned an Int value or None, ex: myFunc.memoryLimit = 3 Removed other unused code. Re-fixed dry run function ordering. Re-ordered the QCommandline companion object so that IntelliJ doesn't complain about missing main methods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5504 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-24 22:03:51 +08:00
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import net.sf.picard.reference.ReferenceSequenceFileFactory
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import org.broadinstitute.sting.utils.interval.IntervalUtils
import collection.mutable.{ListBuffer, HashMap}
import collection.JavaConversions
import java.util.Arrays
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
library =>
/** Pipeline description read by the job builders below; initialised from the primary constructor argument. */
var attributes : Pipeline = attribs
/**
 * Auxiliary constructor: obtains the Pipeline by passing the given YAML file to YamlUtils.load,
 * then delegates to the primary constructor with the same two jar files.
 */
def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar)
/**
 * Mix-in carrying the GATK settings shared by the jobs this library builds.
 *
 * On construction it sets, from the project section of the enclosing library's pipeline:
 * the reference sequence, a single-element interval list, and a ROD binding named "dbsnp"
 * built from the project's genotype dbsnp type and file. It also sets the memory limit to 2
 * (unit is not visible in this file -- presumably gigabytes, confirm against CommandLineGATK)
 * and the jar file to the gatkJar the enclosing BamProcessing was constructed with.
 */
trait StandardCommandLineGATK extends CommandLineGATK {
  reference_sequence = library.attributes.getProject.getReferenceFile
  intervals = List(library.attributes.getProject.getIntervalList)
  rodBind = rodBind :+ new RodBind(
    "dbsnp",
    library.attributes.getProject.getGenotypeDbsnpType,
    library.attributes.getProject.getGenotypeDbsnp
  )
  memoryLimit = Some(2)
  jarFile = library.gatkJar
}
/**
 * Creates a standard realigner target creator CLF for one bam.
 *
 * The job starts from the StandardCommandLineGATK settings, but the interval file list set by
 * that mix-in is cleared and the given contigs are used as interval strings instead.
 *
 * @param bam     the bam added to the job's input files
 * @param contigs the contigs over which to run
 * @param output  the file the job writes to
 * @return a CLF for the realigner target creator job
 */
def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
  val targetCreator = new RealignerTargetCreator with StandardCommandLineGATK
  // Drop the interval list inherited from the mix-in; the contigs take its place.
  targetCreator.intervals = Nil
  targetCreator.intervalsString = contigs
  targetCreator.input_file = targetCreator.input_file :+ bam
  targetCreator.out = output
  targetCreator.analysisName = "RealignerTargetCreator"
  targetCreator
}
/**
 * Creates a standard indel cleaner CLF for one bam.
 *
 * The job starts from the StandardCommandLineGATK settings, with the interval file list
 * cleared and the given contigs used as interval strings instead. bam_compression is set
 * to 0 (presumably so the output is written uncompressed -- confirm against IndelRealigner).
 *
 * @param bam     the bam added to the job's input files
 * @param contigs the contigs over which to run
 * @param targets the results of the realigner target creator
 * @param outBam  the output .bam file
 * @return a CLF for the indel cleaning job
 */
def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
  val cleaner = new IndelRealigner with StandardCommandLineGATK
  cleaner.intervalsString = contigs
  // Drop the interval list inherited from the mix-in; the contigs take its place.
  cleaner.intervals = Nil
  cleaner.input_file = cleaner.input_file :+ bam
  cleaner.out = outBam
  cleaner.targetIntervals = targets
  cleaner.analysisName = "IndelClean"
  cleaner.bam_compression = Some(0)
  cleaner
}
/**
 * Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and
 * bam to merge everything to.
 *
 * One indel cleaner job is created per entry of jobContigs. Each writes an intermediate bam
 * whose name is built by swapExt: the input bam's ".bam" suffix is replaced by
 * ".<contigs joined with "_">.bam". A final fix-mates job then takes all intermediate bams
 * as input and writes cleanedBam, using the library's fixMatesJar.
 *
 * @param bam        the bam to clean
 * @param jobContigs one list of contigs per cleaning job
 * @param targets    the realigner target creator output handed to every cleaning job
 * @param cleanedBam the bam the fix-mates job writes
 * @return the cleaning jobs in jobContigs order, followed by the fix-mates job
 *         (todo -- wrapped in a Pipeline)
 */
def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
  // Intermediate output for each contig group, in the same order as jobContigs.
  val intermediateBams: List[File] = jobContigs.map(
    ctigs => swapExt(bam, ".bam", ".%s.bam".format(ctigs.mkString("_")))
  )
  val cleaners: List[CommandLineFunction] = jobContigs.zip(intermediateBams).map {
    case (ctigs, intermediateBam) => StandardIndelCleaner(bam, ctigs, targets, intermediateBam)
  }
  // The fix-mates job goes last so the returned list keeps the original job order.
  cleaners :+ StandardPicardFixMates(intermediateBams, cleanedBam, library.fixMatesJar)
}
/**
 * Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates
 * the command line functions to do the target-creating, splitting, cleaning, and merging.
 *
 * The contig groups come from PipelineUtils.smartSplitContigs, called with the project's
 * reference file, its interval list and nJobs. For every pair the returned list contains,
 * in order: the realigner target creator job (run over all contigs, writing to the unclean
 * bam's name with ".bam" swapped for ".targets"), the jobs from StandardIndelCleanBam, and
 * a samtools index job on the cleaned bam.
 *
 * @param bamsUncleanCleanPairs (unclean bam, cleaned bam to write) pairs
 * @param nJobs                 number of jobs passed to the contig splitter; defaults to 1
 * @return the command line functions for the full indel realignment pipeline, from the
 *         collection of uncleaned bams to the collection of cleaned bams
 */
def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
  val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
  // Every target creator runs over the same full contig list, so build it once.
  val allContigs : List[String] = contigsForJobs.flatten
  bamsUncleanCleanPairs.flatMap { case (uncleanBam, cleanedBam) =>
    val rtc : RealignerTargetCreator =
      StandardRealignerTargetCreator(uncleanBam, allContigs, swapExt(uncleanBam, ".bam", ".targets"))
    val cleaningJobs : List[CommandLineFunction] =
      StandardIndelCleanBam(uncleanBam, contigsForJobs, rtc.out, cleanedBam)
    val indexJob : SamtoolsIndexFunction = new SamtoolsIndexFunction
    indexJob.bamFile = cleanedBam
    indexJob.analysisName = "SamtoolsIndex"
    val jobsForPair : List[CommandLineFunction] = (rtc :: cleaningJobs) :+ indexJob
    jobsForPair
  }
}
/**
 * Merges N bam files into one bam file, fixing mate pairs in the process; does not assume
 * the inputs are sorted.
 *
 * The job is configured with assumeSorted = false and a memory limit of 4 (unit is not
 * visible in this file -- presumably gigabytes, confirm against PicardBamFunction).
 *
 * @param inBams    the bams to combine
 * @param outBam    the bam to write
 * @param picardJar the jar the job is run from
 * @return command line function for the merge, fix-mate, and sort operation
 */
def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
  val fixMates = new PicardFixMates
  fixMates.bams = inBams
  fixMates.outBam = outBam
  fixMates.jarFile = picardJar
  fixMates.assumeSorted = Some(false)
  fixMates.memoryLimit = Some(4)
  fixMates.analysisName = "FixMates"
  fixMates
}
Walkers can now specify a class extending from Gatherer to merge custom output formats. Add @Gather(MyGatherer.class) to the walker @Output. JavaCommandLineFunctions can now specify the classpath+mainclass as an alternative to specifying a path to an executable jar. JCLF by default pass on the current classpath and only require the mainclass be specified by the developer extending the JCLF, relieving the QScript author from having to explicitly specify the jar. Like the Picard MergeSamFiles, GATK engine by default is now run from the current classpath. The GATK can still be overridden via .jarFile or .javaClasspath. Walkers from the GATK package are now also embedded into the Queue package. Updated AnalyzeCovariates to make it easier to guess the main class, AnalyzeCovariates instead of AnalyzeCovariatesCLP. Removed the GATK jar argument from the example QScripts. Removed one of the most FAQ when getting started with Scala/Queue, the use of Option[_] in QScripts: 1) Fixed mistaken assumption with java enums. In java enums can be null so they don't need nullable wrappers. 2) Added syntactic sugar for Nullable primitives to the QScript trait. Any variable defined as Option[Int] can just be assigned an Int value or None, ex: myFunc.memoryLimit = 3 Removed other unused code. Re-fixed dry run function ordering. Re-ordered the QCommandline companion object so that IntelliJ doesn't complain about missing main methods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5504 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-24 22:03:51 +08:00
/**
 * PicardBamFunction taking a list of input bams and a single output bam.
 * The jar it runs from is not fixed here; StandardPicardFixMates sets it from its picardJar argument.
 */
class PicardFixMates extends PicardBamFunction {
// Input bams; empty until assigned by the caller.
@Input(doc="input bam files") var bams: List[File] = Nil
// Output bam; null until assigned by the caller.
@Output(doc="output bam file") var outBam: File = null
// Accessors exposing the two fields above under the names inputBams / outputBam
// (presumably the members PicardBamFunction expects -- confirm against that trait).
def inputBams: List[File] = bams
def outputBam: File = outBam
}
/**
 * Builds a new file name from `file` by swapping its extension.
 *
 * Only the last path component of `file` is used, so the result carries no parent directory.
 * If the name does not end with `oldExtension`, nothing is stripped and `newExtension` is
 * simply appended.
 *
 * @param file         the file whose name is the starting point
 * @param oldExtension the suffix to remove from the name, if present
 * @param newExtension the suffix to append
 * @return a File for the resulting name
 */
def swapExt(file: File, oldExtension: String, newExtension: String) = {
  val baseName = file.getName.stripSuffix(oldExtension)
  new File(baseName + newExtension)
}
}