// gatk-3.8/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala
//
// 142 lines
// 5.8 KiB
// Scala
// Executable File

package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import java.io.File
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import net.sf.picard.reference.ReferenceSequenceFileFactory
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import org.broadinstitute.sting.utils.interval.IntervalUtils
import collection.mutable.{ListBuffer, HashMap}
import collection.JavaConversions
import java.util.Arrays
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
/**
 * Factory for the Queue command line functions used to realign indels in BAM files:
 * realigner target creation, per-contig-group indel cleaning, a Picard-based job that
 * combines the cleaned pieces, and a SamtoolsIndexFunction for the final BAM.
 *
 * @param attribs     pipeline description; its project supplies the reference file, the interval
 *                    list and the dbSNP binding used by every GATK function built here
 * @param gatkJar     jar assigned to every GATK function built here
 * @param fixMatesJar jar assigned to the fix-mates job created by [[StandardIndelCleanBam]]
 */
class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
  // Self alias: lets the nested trait and the methods below name this instance's members explicitly.
  library =>

  /** Pipeline description read by the functions built here; declared as a var, so it can be reassigned. */
  var attributes : Pipeline = attribs

  /**
   * Alternate constructor that loads the pipeline description from a YAML file.
   *
   * @param yaml        YAML file deserialized into a Pipeline via YamlUtils.load
   * @param gatkJar     jar assigned to every GATK function built here
   * @param fixMatesJar jar assigned to the fix-mates job
   */
  def this(yaml: File, gatkJar: File, fixMatesJar: File) =
    this(YamlUtils.load(classOf[Pipeline], yaml), gatkJar, fixMatesJar)

  /**
   * Mixin that applies the project-wide defaults to a GATK command line function: reference
   * sequence, interval list, a "dbsnp" rod binding, a memory limit and the GATK jar.
   */
  trait StandardCommandLineGATK extends CommandLineGATK {
    this.reference_sequence = library.attributes.getProject.getReferenceFile
    this.intervals = List(library.attributes.getProject.getIntervalList)
    this.rodBind :+= new RodBind("dbsnp", library.attributes.getProject.getGenotypeDbsnpType, library.attributes.getProject.getGenotypeDbsnp)
    // NOTE(review): the unit of memoryLimit is defined by the Queue framework (presumably GB) -- confirm.
    this.memoryLimit = Some(2)
    this.jarFile = library.gatkJar
  }

  /**
   * Creates a standard realigner target creator CLF given a bam, an output file, and the
   * contigs over which to run.
   *
   * @param bam     input bam file
   * @param contigs interval strings the job runs over; they replace the project interval list
   * @param output  file the job writes to
   * @return a CLF for the realigner target creator job
   */
  def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
    val rtc = new RealignerTargetCreator with StandardCommandLineGATK
    // Drop the interval list installed by StandardCommandLineGATK; the explicit contigs are used instead.
    rtc.intervals = Nil
    rtc.intervalsString = contigs
    rtc.input_file :+= bam
    rtc.out = output
    rtc.analysisName = "RealignerTargetCreator"
    rtc
  }

  /**
   * Creates a standard indel cleaner CLF given a bam, the results of the target creator, and
   * an output .bam file.
   *
   * @param bam     input bam file
   * @param contigs interval strings the job runs over; they replace the project interval list
   * @param targets output of the realigner target creator
   * @param outBam  bam file the job writes to
   * @return a CLF for the indel cleaning job
   */
  def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
    val realigner = new IndelRealigner with StandardCommandLineGATK
    realigner.intervalsString = contigs
    // Drop the interval list installed by StandardCommandLineGATK; the explicit contigs are used instead.
    realigner.intervals = Nil
    realigner.input_file :+= bam
    realigner.out = outBam
    realigner.targetIntervals = targets
    realigner.analysisName = "IndelClean"
    // NOTE(review): presumably 0 means "no compression" for these intermediate bams -- confirm.
    realigner.bam_compression = Some(0)
    realigner
  }

  /**
   * Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output,
   * and bam to merge everything to.
   *
   * Each contig group gets its own intermediate bam, named after the input bam with ".bam"
   * replaced by ".<contigs joined with '_'>.bam" (see [[swapExt]]: the name carries no
   * parent directory).
   *
   * @param bam        input bam file
   * @param jobContigs one list of interval strings per cleaning job
   * @param targets    output of the realigner target creator
   * @param cleanedBam bam file the per-job bams are combined into
   * @return one indel cleaner per contig group, in order, followed by the fix-mates job
   *         (todo -- wrapped in a Pipeline)
   */
  def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
    // Pair every contig group with the intermediate bam its cleaning job writes.
    val jobSpecs: List[(List[String], File)] = jobContigs.map { ctigs =>
      (ctigs, swapExt(bam, ".bam", ".%s.bam".format(ctigs.mkString("_"))))
    }
    val cleaners: List[CommandLineFunction] = jobSpecs.map {
      case (ctigs, partBam) => StandardIndelCleaner(bam, ctigs, targets, partBam)
    }
    val bamsToMerge: List[File] = jobSpecs.map(_._2)
    cleaners :+ StandardPicardFixMates(bamsToMerge, cleanedBam, library.fixMatesJar)
  }

  /**
   * Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs,
   * creates the command line functions that do the target-creating, splitting, cleaning,
   * and merging.
   *
   * @param bamsUncleanCleanPairs (uncleaned input bam, cleaned output bam) pairs
   * @param nJobs                 number of jobs passed to PipelineUtils.smartSplitContigs
   * @return for each pair, in order: the target creator, the functions returned by
   *         [[StandardIndelCleanBam]], and a SamtoolsIndexFunction for the cleaned bam
   */
  def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
    val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(
      library.attributes.getProject.getReferenceFile,
      library.attributes.getProject.getIntervalList,
      nJobs)
    // The target creator runs over every contig at once; this is the same for all bams, so build it once.
    val allContigs : List[String] = contigsForJobs.flatten

    bamsUncleanCleanPairs.flatMap { case (uncleanBam, cleanBam) =>
      val rtc : RealignerTargetCreator =
        StandardRealignerTargetCreator(uncleanBam, allContigs, swapExt(uncleanBam, ".bam", ".targets"))
      val icbs : List[CommandLineFunction] = StandardIndelCleanBam(uncleanBam, contigsForJobs, rtc.out, cleanBam)
      val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
      sam.bamFile = cleanBam
      sam.analysisName = "SamtoolsIndex"
      List[CommandLineFunction](rtc) ::: icbs ::: List[CommandLineFunction](sam)
    }
  }

  /**
   * Merges N bam files into one bam file, fixing mate pairs in the process; does not assume
   * they are sorted.
   *
   * @param inBams    bam files to merge
   * @param outBam    bam file to write
   * @param picardJar jar the job runs
   * @return command line function for the merge, fix-mate, and sort operation
   */
  def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
    val pfm : PicardFixMates = new PicardFixMates
    pfm.bams = inBams
    pfm.outBam = outBam
    pfm.jarFile = picardJar
    pfm.assumeSorted = Some(false)
    // NOTE(review): the unit of memoryLimit is defined by the Queue framework (presumably GB) -- confirm.
    pfm.memoryLimit = Some(4)
    pfm.analysisName = "FixMates"
    pfm
  }

  /**
   * Picard bam function with a list of input bams and a single output bam, exposed to
   * PicardBamFunction through inputBams / outputBam.
   */
  class PicardFixMates extends PicardBamFunction {
    @Input(doc="input bam files") var bams: List[File] = Nil
    // Starts as null and is assigned by the caller before use (see StandardPicardFixMates).
    @Output(doc="output bam file") var outBam: File = null
    def inputBams: List[File] = bams
    def outputBam: File = outBam
  }

  /**
   * Builds a file name from `file` with one extension swapped for another.
   *
   * Only the last component of `file`'s path is used (File.getName), so the result has no
   * parent directory. If the name does not end with `oldExtension`, `newExtension` is simply
   * appended.
   *
   * @param file         file whose name is the starting point
   * @param oldExtension suffix to strip from the name, if present
   * @param newExtension suffix to append
   * @return a File for the rewritten name
   */
  def swapExt(file: File, oldExtension: String, newExtension: String) : File =
    new File(file.getName.stripSuffix(oldExtension) + newExtension)
}