Removing QPipeline directory as there's no one to support it at the moment.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5757 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7ed8b4ddb0
commit
f7d9f0a1f3
|
|
@ -298,13 +298,6 @@
|
|||
<src path="${queue-extensions.source.dir}" />
|
||||
<include name="**/*.scala"/>
|
||||
<exclude name="**/gridengine/**" unless="include.gridengine" />
|
||||
<!--
|
||||
NOTE: Added include.queue.pipeline flag while VQSR v2 is being migrated
|
||||
and the pipeline package is still using the old version.
|
||||
At this moment enabling the flag turns on code that doesn't compile.
|
||||
ASAP determine if package should use v2 or be retired.
|
||||
-->
|
||||
<exclude name="**/queue/pipeline/**" unless="include.queue.pipeline" />
|
||||
</scalac>
|
||||
</target>
|
||||
|
||||
|
|
|
|||
|
|
@ -1,141 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
import collection.mutable.{ListBuffer, HashMap}
|
||||
import collection.JavaConversions
|
||||
import java.util.Arrays
|
||||
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
|
||||
import org.broadinstitute.sting.commandline.{Output, Input}
|
||||
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
|
||||
|
||||
class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
|
||||
library =>
|
||||
|
||||
var attributes : Pipeline = attribs
|
||||
|
||||
def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar)
|
||||
|
||||
|
||||
trait StandardCommandLineGATK extends CommandLineGATK {
|
||||
this.reference_sequence = library.attributes.getProject.getReferenceFile
|
||||
this.intervals = List(library.attributes.getProject.getIntervalList)
|
||||
this.rodBind :+= new RodBind("dbsnp", library.attributes.getProject.getGenotypeDbsnpType, library.attributes.getProject.getGenotypeDbsnp)
|
||||
this.memoryLimit = Some(2)
|
||||
this.jarFile = library.gatkJar
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
|
||||
* @Returns: A CLF for the realigner target creator job
|
||||
*/
|
||||
def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
|
||||
var rtc = new RealignerTargetCreator with StandardCommandLineGATK
|
||||
rtc.intervals = Nil
|
||||
rtc.intervalsString = contigs
|
||||
rtc.input_file :+= bam
|
||||
rtc.out = output
|
||||
rtc.analysisName = "RealignerTargetCreator"
|
||||
|
||||
return rtc
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file
|
||||
* @Returns: A CLF for the indel cleaning job
|
||||
*/
|
||||
def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
|
||||
var realigner = new IndelRealigner with StandardCommandLineGATK
|
||||
realigner.intervalsString = contigs
|
||||
realigner.intervals = Nil
|
||||
realigner.input_file :+= bam
|
||||
realigner.out = outBam
|
||||
realigner.targetIntervals = targets
|
||||
realigner.analysisName = "IndelClean"
|
||||
realigner.bam_compression = Some(0)
|
||||
|
||||
return realigner
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
|
||||
* @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
|
||||
*/
|
||||
def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
|
||||
var cmds : List[CommandLineFunction] = Nil
|
||||
var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
|
||||
ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
|
||||
)
|
||||
var bamsToMerge : List[File] = Nil
|
||||
for ( spec <- jobSpecs ) {
|
||||
cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
|
||||
bamsToMerge :+= spec._2
|
||||
}
|
||||
|
||||
cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
|
||||
|
||||
return cmds
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
|
||||
* command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
|
||||
* of command line functions
|
||||
* @Returns: A list of command line functions for the full indel realignment pipeline from the collection
|
||||
* of uncleaned bams to the collection of cleaned bams
|
||||
*/
|
||||
def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
|
||||
val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
|
||||
var commands : List[CommandLineFunction] = Nil
|
||||
for ( bamPair <- bamsUncleanCleanPairs ) {
|
||||
val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
|
||||
val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
|
||||
val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
|
||||
sam.bamFile = bamPair._2
|
||||
sam.analysisName = "SamtoolsIndex"
|
||||
commands :+= rtc
|
||||
commands ++= icbs
|
||||
commands :+= sam
|
||||
}
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
|
||||
* @Returns: Command line function for the merge, fix-mate, and sort operation
|
||||
*/
|
||||
def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
|
||||
var pfm : PicardFixMates = new PicardFixMates
|
||||
pfm.bams = inBams
|
||||
pfm.outBam = outBam
|
||||
pfm.jarFile = picardJar
|
||||
pfm.assumeSorted = Some(false)
|
||||
pfm.memoryLimit = Some(4)
|
||||
pfm.analysisName = "FixMates"
|
||||
|
||||
return pfm
|
||||
}
|
||||
|
||||
class PicardFixMates extends PicardBamFunction {
|
||||
@Input(doc="input bam files") var bams: List[File] = Nil
|
||||
@Output(doc="output bam file") var outBam: File = null
|
||||
|
||||
def inputBams: List[File] = bams
|
||||
def outputBam: File = outBam
|
||||
|
||||
}
|
||||
|
||||
|
||||
def swapExt(file: File, oldExtension: String, newExtension: String) =
|
||||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.commandline.{Input, Argument, Hidden}
|
||||
import java.io.File
|
||||
|
||||
|
||||
class PipelineArgumentCollection {
|
||||
|
||||
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false)
|
||||
var cleaningJobs: Int = 1
|
||||
|
||||
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y", required=false)
|
||||
var yamlFile: File = _
|
||||
|
||||
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
|
||||
var trigger: File = _
|
||||
|
||||
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false)
|
||||
var refseqTable: File = _
|
||||
|
||||
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
|
||||
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
|
||||
|
||||
@Input(doc="path to GATK jar", shortName="gatk")
|
||||
var gatkJar: File = _
|
||||
|
||||
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
|
||||
var target_titv: Float = _
|
||||
|
||||
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
|
||||
var downsampling_coverage = 300
|
||||
|
||||
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
|
||||
var num_snp_scatter_jobs = 20
|
||||
|
||||
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
|
||||
var num_indel_scatter_jobs = 5
|
||||
|
||||
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
|
||||
var skip_cleaning = false
|
||||
|
||||
@Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unreacalibrated:/path/to/unrecal.bam). Mutually exclusive with YAML",
|
||||
required=false, shortName="pBams")
|
||||
var projectBams: File = _
|
||||
|
||||
@Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName")
|
||||
var projectName: String = _
|
||||
|
||||
@Input(doc="The reference file. Mutually exclusive with YAML.", required=false, shortName="pRef")
|
||||
var projectRef: File = _
|
||||
|
||||
@Input(doc="The project interval list. Mutually exclusive with YAML.", required=false, shortName="pInt")
|
||||
var projectIntervals: File = _
|
||||
|
||||
@Input(doc="The project dbsnp. Mutually exclusive with YAML",required=false, shortName="pDB")
|
||||
var projectDBSNP: File = _
|
||||
|
||||
//@Input(doc="ADPR script")
|
||||
//var adprScript: File = _
|
||||
|
||||
//@Input(doc="Sequencing maching name (for use by adpr)")
|
||||
//var machine: String = _
|
||||
|
||||
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
|
||||
//var protocol: String = _
|
||||
|
||||
def verifyArguments = {
|
||||
// if no yaml file we require all the mutually exclusive arguments
|
||||
if ( yamlFile == null ) {
|
||||
if ( projectBams == null ) throw new IllegalArgumentException("No YAML file provided; and no project bam list. Pipeline requires either YAML file, or full project specs.")
|
||||
|
||||
if ( projectName == null ) throw new IllegalArgumentException("No YAML file provided; and no project name. Pipeline requires either YAML file, or full project specs.")
|
||||
|
||||
if ( projectRef == null) throw new IllegalArgumentException("No YAML file provided; and no project reference. Pipeline requires either YAML file, or full project specs.")
|
||||
|
||||
if ( projectIntervals == null ) throw new IllegalArgumentException("No YAML file provided; and no project intervals. Pipeline requires either YAML file, or full project specs.")
|
||||
|
||||
if ( projectDBSNP == null ) throw new IllegalArgumentException("No YAML file provided; and no project DBSNP. Pipeline requires either YAML file, or full project specs.")
|
||||
} else {
|
||||
if ( projectBams != null || projectName != null || projectRef != null || projectIntervals != null || projectDBSNP != null )
|
||||
throw new IllegalArgumentException("YAML file provided, along with other project spec arguments. YAML file is mutually exclusive with project specs.")
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import org.broadinstitute.sting.queue.util._
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
|
||||
class ProjectManagement(stingPath: String) {
|
||||
// TODO -- MAJOR scatter/gather numbers are hard-coded. Really need a function to set it to the right number. These are optimized to get everything on 'hour'
|
||||
|
||||
pm =>
|
||||
|
||||
var stingDirPath : String = stingPath
|
||||
|
||||
def this(f : File) = this(f.getAbsolutePath)
|
||||
|
||||
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
|
||||
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
|
||||
@Output(doc="The file to write the site list to") var out_vcf = out_list
|
||||
@Argument(doc="Path to the SortByRef script") var sortByRef: String = _
|
||||
@Argument(doc="Path to the reference file on disk") var ref: File = _
|
||||
|
||||
def commandLine = {
|
||||
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
|
||||
vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
class Vcf2Intervals(in_vcf: File, out_intervals: File) extends CommandLineFunction {
|
||||
@Input(doc="The VCF to convert") var vcf: File = in_vcf
|
||||
@Output(doc="The output bed file") var intervals: File = out_intervals
|
||||
|
||||
def commandLine = {
|
||||
"grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(vcf.getAbsolutePath,intervals.getAbsolutePath)
|
||||
}
|
||||
}
|
||||
|
||||
def MergeBatches( callVCFs: List[File], allBams: List[File], mergedVCF: File, ref: File, size: Int) : List[CommandLineFunction] = {
|
||||
var cmds : List[CommandLineFunction] = Nil
|
||||
var pfSites : PassFilterAlleles = new PassFilterAlleles(callVCFs,swapExt(mergedVCF,".vcf",".pf.alleles.vcf"))
|
||||
pfSites.sortByRef = pm.stingDirPath+"perl/sortByRef.pl"
|
||||
pfSites.ref = ref
|
||||
|
||||
cmds :+= pfSites
|
||||
|
||||
var ints : Vcf2Intervals = new Vcf2Intervals(pfSites.out_vcf,swapExt(pfSites.out_vcf,".vcf",".intervals.list"))
|
||||
|
||||
cmds :+= ints
|
||||
|
||||
var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2))))
|
||||
|
||||
cmds ++= calcs
|
||||
|
||||
cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, ints.intervals, mergedVCF)
|
||||
|
||||
return cmds
|
||||
|
||||
}
|
||||
|
||||
def LikelihoodCalc( bams: List[File], ref: File, intervals: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = {
|
||||
var calc = new UGCalcLikelihoods
|
||||
calc.input_file ++= bams
|
||||
calc.reference_sequence = ref
|
||||
calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
|
||||
calc.downsample_to_coverage = Some(300)
|
||||
calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6)
|
||||
calc.scatterCount = if (bams.size < 5 ) 1 else if (bams.size < 50) 60 else 120
|
||||
calc.min_base_quality_score = Some(22)
|
||||
calc.min_mapping_quality_score = Some(20)
|
||||
//calc.genotyping_mode = Option[GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES]
|
||||
calc.out = outVCF
|
||||
calc.rodBind :+= new RodBind("allele","VCF",alleleVCF)
|
||||
calc.intervals :+= intervals
|
||||
|
||||
return calc
|
||||
}
|
||||
|
||||
def VariantCallMerge( likelihoods: List[File], ref: File, intervals: File, output: File) : UGCallVariants = {
|
||||
var call = new UGCallVariants
|
||||
call.reference_sequence = ref
|
||||
call.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
|
||||
call.intervals :+= intervals
|
||||
call.memoryLimit = Some(8)
|
||||
call.out = output
|
||||
call.rodBind ++= likelihoods.map( a => new RodBind("variant"+a.getName.replace(".vcf",""),"vcf",a) )
|
||||
call.scatterCount = 30
|
||||
|
||||
return call
|
||||
}
|
||||
|
||||
def swapExt(file: File, oldExtension: String, newExtension: String) =
|
||||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
|
||||
trait QPipeline {
|
||||
var commands: List[CommandLineFunction] = Nil
|
||||
|
||||
def generateCommands
|
||||
|
||||
def track // todo -- implement me
|
||||
|
||||
def removeIntermediate // todo -- implement me
|
||||
}
|
||||
|
|
@ -1,420 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import org.broadinstitute.sting.queue.util._
|
||||
import java.io.File
|
||||
import scala.collection.JavaConversions._
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
class VariantCalling(attribs: Pipeline,gatkJar: File) {
|
||||
vc =>
|
||||
|
||||
// load attributes
|
||||
var attributes = attribs
|
||||
|
||||
def this(yaml: File, gatkJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar)
|
||||
|
||||
/**
|
||||
* Trait to propagate basic attributes throughout the library
|
||||
*/
|
||||
trait StandardCommandLineGATK extends CommandLineGATK {
|
||||
this.reference_sequence = vc.attributes.getProject.getReferenceFile
|
||||
this.intervals = List(vc.attributes.getProject.getIntervalList)
|
||||
this.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
|
||||
// set global memory limit on the low side. Additional input bams will affect it.
|
||||
this.memoryLimit = Some(2)
|
||||
this.jarFile = vc.gatkJar
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Adds the trait data to a command line gatk that is passed in
|
||||
* @Return: the input CLGATK with the SCLGATK data propagated into it
|
||||
* @TODO: This should be better written, it'd be nice just to call it with addTrait[T], and return a T with SCLGATK
|
||||
*/
|
||||
def addTrait[T <: CommandLineGATK](c : T) : T = {
|
||||
c.reference_sequence = vc.attributes.getProject.getReferenceFile
|
||||
c.intervals = List(vc.attributes.getProject.getIntervalList)
|
||||
c.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
|
||||
// set global memory limit on the low side. Additional input bams will affect it.
|
||||
c.memoryLimit = Some(2)
|
||||
c.jarFile = vc.gatkJar
|
||||
c
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
|
||||
* @Return: UnifiedGenotyper with the standard GSA arguments
|
||||
* @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
|
||||
*/
|
||||
def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
|
||||
var ug = new UnifiedGenotyper with StandardCommandLineGATK
|
||||
ug.analysisName = "UnifiedGenotyper"
|
||||
ug.input_file = bams
|
||||
ug.out = output
|
||||
ug.downsample_to_coverage = Some(300)
|
||||
ug.dt = DownsampleType.BY_SAMPLE
|
||||
ug.scatterCount = 50
|
||||
|
||||
if ( bams.size > 40 ) {
|
||||
ug.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
if ( bams.size > 90 ) {
|
||||
ug.memoryLimit = Some(6)
|
||||
}
|
||||
|
||||
if ( bams.size > 140 ) {
|
||||
ug.memoryLimit = Some(8)
|
||||
}
|
||||
|
||||
return ug
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
|
||||
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
|
||||
*/
|
||||
def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
|
||||
var ig = new IndelGenotyperV2 with StandardCommandLineGATK
|
||||
ig.analysisName = "IndelGenotyper"
|
||||
ig.input_file :+= bam
|
||||
ig.out = output
|
||||
ig.downsample_to_coverage = Some(300)
|
||||
return ig
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
|
||||
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
|
||||
*/
|
||||
private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
|
||||
var ig = StandardIndelGenotyper(bam,output)
|
||||
return ig
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Combines a list of indel VCFs to a single output file
|
||||
* @Returns: A CombineVariants CLF with standard GSA arguments
|
||||
*/
|
||||
def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
|
||||
var cv = new CombineVariants with StandardCommandLineGATK
|
||||
cv.out = output
|
||||
cv.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY
|
||||
cv.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
|
||||
cv.analysisName = "IndelGenotyper"
|
||||
cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
|
||||
//cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",")
|
||||
cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
|
||||
|
||||
if ( igList.size > 50 ) {
|
||||
cv.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
return cv
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
|
||||
* @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
|
||||
*/
|
||||
def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
|
||||
var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
|
||||
var combine = this.StandardIndelCombine( genotypers, output )
|
||||
var callFunctions: List[CommandLineGATK] = genotypers
|
||||
callFunctions = combine :: callFunctions
|
||||
|
||||
return callFunctions
|
||||
}
|
||||
|
||||
def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
|
||||
var iFil = new VariantFiltration with StandardCommandLineGATK
|
||||
iFil.analysisName = "FilterAtIndels"
|
||||
iFil.out = output
|
||||
iFil.mask = indels.getAbsolutePath
|
||||
iFil.maskName = "Indel_Mask"
|
||||
iFil.variantVCF = snps
|
||||
// todo -- cluster size varies with # bams
|
||||
iFil.clusterSize = Some(5)
|
||||
iFil.clusterWindowSize = Some(8)
|
||||
|
||||
return iFil
|
||||
}
|
||||
|
||||
def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
|
||||
var hFil = new VariantFiltration with StandardCommandLineGATK
|
||||
hFil.analysisName = "HandFilter"
|
||||
hFil.out = output
|
||||
hFil.variantVCF = snps
|
||||
hFil.filterExpression :+= "\"QD<5.0\""
|
||||
hFil.filterName :+= "LowQualByDepth"
|
||||
hFil.filterExpression :+= "\"SB>-0.10\""
|
||||
hFil.filterName :+= "HighStrandBias"
|
||||
|
||||
return hFil
|
||||
}
|
||||
|
||||
def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
|
||||
var genC = new GenerateVariantClusters with StandardCommandLineGATK
|
||||
genC.analysisName = "VariantQualityRecalibration"
|
||||
genC.rodBind :+= new RodBind("input","VCF",snps)
|
||||
genC.cluster_file = output
|
||||
genC.use_annotation :+= "QD"
|
||||
genC.use_annotation :+= "SB"
|
||||
genC.use_annotation :+= "HaplotypeScore"
|
||||
genC.use_annotation :+= "HRun"
|
||||
|
||||
return genC
|
||||
}
|
||||
|
||||
def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
|
||||
out_tranches: File) : VariantRecalibrator = {
|
||||
var vr = new VariantRecalibrator with StandardCommandLineGATK
|
||||
vr.analysisName = "VariantQualityRecalibration"
|
||||
vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
|
||||
vr.cluster_file = cluster
|
||||
vr.target_titv = Some(target_titv)
|
||||
vr.out = out_vcf
|
||||
vr.tranches_file = out_tranches
|
||||
vr.tranche :+= "0.1"
|
||||
vr.tranche :+= "1"
|
||||
vr.tranche :+= "5"
|
||||
vr.tranche :+= "10"
|
||||
|
||||
return vr
|
||||
|
||||
}
|
||||
|
||||
def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
|
||||
var avc = new ApplyVariantCuts with StandardCommandLineGATK
|
||||
avc.analysisName = "VariantQualityRecalibration"
|
||||
avc.rodBind :+= new RodBind("input","VCF",snpRecal)
|
||||
avc.out = output
|
||||
avc.tranches_file = tranches
|
||||
avc.fdr_filter_level = Some(5)
|
||||
|
||||
return avc
|
||||
}
|
||||
|
||||
def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
|
||||
var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
|
||||
var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
|
||||
swapExt(snps,".vcf",".recal.tranch"))
|
||||
var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,swapExt(recal.out,".vcf",".tranched.vcf"))
|
||||
|
||||
var cmds: List[CommandLineGATK] = Nil
|
||||
cmds :+= clust
|
||||
cmds :+= recal
|
||||
cmds :+= cut
|
||||
|
||||
return cmds
|
||||
}
|
||||
|
||||
def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
|
||||
var ga = new GenomicAnnotator with StandardCommandLineGATK
|
||||
ga.analysisName = "GenomicAnnotator"
|
||||
ga.variantVCF = snps
|
||||
ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
|
||||
ga.rodToIntervalTrackName = "variant"
|
||||
ga.out = outputVCF
|
||||
|
||||
return ga
|
||||
}
|
||||
|
||||
def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
|
||||
var commands : List[CommandLineGATK] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( output.getParent != null ) {
|
||||
dir = output.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
commands ++= ig
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
|
||||
var commands : List[CommandLineGATK] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( recalOut.getParent != null ) {
|
||||
dir = recalOut.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
commands ++= ig
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
|
||||
|
||||
commands :+= handFilt
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
class VCF2Mask extends CommandLineFunction {
|
||||
@Input(doc="the indel vcf") var indel_vcf : File = _
|
||||
@Argument(doc="the window size") var win_size : Int = 2
|
||||
@Output(doc="the mask bed") var out_mask : File = _
|
||||
|
||||
def commandLine = { "grep PASS %s | awk '{print $1,$2-%d,$2+%d}' > %s".format(indel_vcf.getAbsolutePath,win_size,win_size,out_mask.getAbsolutePath) }
|
||||
}
|
||||
|
||||
def IndelVCF2Mask(vcf: File, size: Int) : VCF2Mask = {
|
||||
var masker: VCF2Mask = new VCF2Mask()
|
||||
masker.indel_vcf = vcf
|
||||
masker.win_size = size
|
||||
masker.out_mask = swapExt(vcf,".vcf",".indel_mask.bed")
|
||||
|
||||
return masker
|
||||
}
|
||||
|
||||
def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineFunction] = {
|
||||
var commands : List[CommandLineFunction] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( recalOut.getParent != null ) {
|
||||
dir = recalOut.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject.getName+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = indelOut
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
var indel_mask : VCF2Mask = IndelVCF2Mask(indelOut,5)
|
||||
|
||||
commands ++= ig
|
||||
commands :+= indel_mask
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,indel_mask.out_mask,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
|
||||
|
||||
commands :+= handFilt
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
def swapExt(file: File, oldExtension: String, newExtension: String) =
|
||||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
||||
object VariantCalling {
|
||||
|
||||
/**
|
||||
* @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line
|
||||
* @Doc - Constructs a UG object based on the settings in the header of the input VCF
|
||||
* @Returns an instance of the UG object with all information propagated
|
||||
*/
|
||||
def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = {
|
||||
var myUG : UnifiedGenotyper = new UnifiedGenotyper
|
||||
var xrl : XReadLines = new XReadLines(vcf)
|
||||
val header_limit = 1000
|
||||
var line = ""
|
||||
var line_no = 1
|
||||
while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) {
|
||||
line = xrl.next
|
||||
line_no += 1
|
||||
}
|
||||
var ugLine : String = line
|
||||
if ( ! ugLine.startsWith("##UnifiedGenotyper") ) {
|
||||
return myUG // nothing to add
|
||||
}
|
||||
|
||||
var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ")
|
||||
var ugMap = new HashMap[String,String]
|
||||
def addEntry( s : String ) : Unit = {
|
||||
val sps = s.split("=")
|
||||
val k = sps(0)
|
||||
val v = sps(1)
|
||||
val kv = (k,v)
|
||||
ugMap += kv
|
||||
}
|
||||
ugTokens.foreach( addEntry(_) )
|
||||
|
||||
myUG.reference_sequence = new File(ugMap("reference_sequence"))
|
||||
myUG.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq"))
|
||||
myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble)
|
||||
myUG.DBSNP = new File(ugMap("DBSNP"))
|
||||
|
||||
// todo -- more args
|
||||
|
||||
return myUG
|
||||
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue