Removing QPipeline directory as there's no one to support it at the moment.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5757 348d0f76-0448-11de-a6fe-93d51630548a
kshakir 2011-05-04 18:36:02 +00:00
parent 7ed8b4ddb0
commit f7d9f0a1f3
6 changed files with 0 additions and 766 deletions

build.xml

@@ -298,13 +298,6 @@
<src path="${queue-extensions.source.dir}" />
<include name="**/*.scala"/>
<exclude name="**/gridengine/**" unless="include.gridengine" />
<!--
NOTE: Added the include.queue.pipeline flag while VQSR v2 is being migrated
and the pipeline package is still using the old version.
At the moment, enabling the flag turns on code that doesn't compile.
Determine ASAP whether the package should use v2 or be retired.
-->
<exclude name="**/queue/pipeline/**" unless="include.queue.pipeline" />
</scalac>
</target>

BamProcessing.scala

@@ -1,141 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import java.io.File
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import net.sf.picard.reference.ReferenceSequenceFileFactory
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import org.broadinstitute.sting.utils.interval.IntervalUtils
import collection.mutable.{ListBuffer, HashMap}
import collection.JavaConversions
import java.util.Arrays
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
library =>
var attributes : Pipeline = attribs
def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar)
trait StandardCommandLineGATK extends CommandLineGATK {
this.reference_sequence = library.attributes.getProject.getReferenceFile
this.intervals = List(library.attributes.getProject.getIntervalList)
this.rodBind :+= new RodBind("dbsnp", library.attributes.getProject.getGenotypeDbsnpType, library.attributes.getProject.getGenotypeDbsnp)
this.memoryLimit = Some(2)
this.jarFile = library.gatkJar
}
/**
* @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
* @Returns: A CLF for the realigner target creator job
*/
def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
var rtc = new RealignerTargetCreator with StandardCommandLineGATK
rtc.intervals = Nil
rtc.intervalsString = contigs
rtc.input_file :+= bam
rtc.out = output
rtc.analysisName = "RealignerTargetCreator"
return rtc
}
/**
* @Doc: Creates a standard indel cleaner CLF given a bam, the contigs to run over, the results of the target creator, and an output .bam file
* @Returns: A CLF for the indel cleaning job
*/
def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
var realigner = new IndelRealigner with StandardCommandLineGATK
realigner.intervalsString = contigs
realigner.intervals = Nil
realigner.input_file :+= bam
realigner.out = outBam
realigner.targetIntervals = targets
realigner.analysisName = "IndelClean"
realigner.bam_compression = Some(0)
return realigner
}
/**
* @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
* @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
*/
def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
var cmds : List[CommandLineFunction] = Nil
var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
)
var bamsToMerge : List[File] = Nil
for ( spec <- jobSpecs ) {
cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
bamsToMerge :+= spec._2
}
cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
return cmds
}
/**
* @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
* command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
* of command line functions
* @Returns: A list of command line functions for the full indel realignment pipeline from the collection
* of uncleaned bams to the collection of cleaned bams
*/
def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
var commands : List[CommandLineFunction] = Nil
for ( bamPair <- bamsUncleanCleanPairs ) {
val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
sam.bamFile = bamPair._2
sam.analysisName = "SamtoolsIndex"
commands :+= rtc
commands ++= icbs
commands :+= sam
}
return commands
}
/**
* @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
* @Returns: Command line function for the merge, fix-mate, and sort operation
*/
def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
var pfm : PicardFixMates = new PicardFixMates
pfm.bams = inBams
pfm.outBam = outBam
pfm.jarFile = picardJar
pfm.assumeSorted = Some(false)
pfm.memoryLimit = Some(4)
pfm.analysisName = "FixMates"
return pfm
}
class PicardFixMates extends PicardBamFunction {
@Input(doc="input bam files") var bams: List[File] = Nil
@Output(doc="output bam file") var outBam: File = null
def inputBams: List[File] = bams
def outputBam: File = outBam
}
def swapExt(file: File, oldExtension: String, newExtension: String) =
new File(file.getName.stripSuffix(oldExtension) + newExtension)
}
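
For reference, a minimal sketch of how a library like BamProcessing would be driven from a Queue script (not part of the removed sources; the QScript base class and its add() method are assumed from elsewhere in the codebase, and all file names are invented):

import java.io.File
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.pipeline.BamProcessing

class ExampleCleaningScript extends QScript {
  def script() {
    // Library wired up from a YAML project description, a GATK jar, and Picard's FixMateInformation.jar.
    val lib = new BamProcessing(new File("project.yaml"),
                                new File("dist/GenomeAnalysisTK.jar"),
                                new File("picard/FixMateInformation.jar"))
    // Pair each raw bam with the cleaned bam it should produce.
    val pairs = List((new File("sample1.bam"), new File("sample1.cleaned.bam")))
    // Target creation, per-contig cleaning, the fix-mates merge, and indexing,
    // split across 4 cleaning jobs; every resulting CLF is added to the job graph.
    lib.StandardIndelRealign(pairs, nJobs = 4).foreach(add(_))
  }
}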

PipelineArgumentCollection.scala

@@ -1,85 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.commandline.{Input, Argument, Hidden}
import java.io.File
class PipelineArgumentCollection {
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false)
var cleaningJobs: Int = 1
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y", required=false)
var yamlFile: File = _
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
var trigger: File = _
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false)
var refseqTable: File = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to GATK jar", shortName="gatk")
var gatkJar: File = _
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
var target_titv: Float = _
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
var downsampling_coverage = 300
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
var num_snp_scatter_jobs = 20
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
var num_indel_scatter_jobs = 5
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false
@Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unrecalibrated:/path/to/unrecal.bam). Mutually exclusive with YAML",
required=false, shortName="pBams")
var projectBams: File = _
@Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName")
var projectName: String = _
@Input(doc="The reference file. Mutually exclusive with YAML.", required=false, shortName="pRef")
var projectRef: File = _
@Input(doc="The project interval list. Mutually exclusive with YAML.", required=false, shortName="pInt")
var projectIntervals: File = _
@Input(doc="The project dbsnp. Mutually exclusive with YAML",required=false, shortName="pDB")
var projectDBSNP: File = _
//@Input(doc="ADPR script")
//var adprScript: File = _
//@Input(doc="Sequencing machine name (for use by adpr)")
//var machine: String = _
//@Input(doc="Sequencing experiment type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
//var protocol: String = _
def verifyArguments = {
// if no YAML file is given, we require all of the project-spec arguments (each of which is mutually exclusive with YAML)
if ( yamlFile == null ) {
if ( projectBams == null ) throw new IllegalArgumentException("No YAML file provided; and no project bam list. Pipeline requires either YAML file, or full project specs.")
if ( projectName == null ) throw new IllegalArgumentException("No YAML file provided; and no project name. Pipeline requires either YAML file, or full project specs.")
if ( projectRef == null) throw new IllegalArgumentException("No YAML file provided; and no project reference. Pipeline requires either YAML file, or full project specs.")
if ( projectIntervals == null ) throw new IllegalArgumentException("No YAML file provided; and no project intervals. Pipeline requires either YAML file, or full project specs.")
if ( projectDBSNP == null ) throw new IllegalArgumentException("No YAML file provided; and no project DBSNP. Pipeline requires either YAML file, or full project specs.")
} else {
if ( projectBams != null || projectName != null || projectRef != null || projectIntervals != null || projectDBSNP != null )
throw new IllegalArgumentException("YAML file provided, along with other project spec arguments. YAML file is mutually exclusive with project specs.")
}
}
}
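
Purely as an illustration of how this collection was meant to be used (the @ArgumentCollection annotation and QScript come from the surrounding codebase, not this diff, and the script class is invented): embed it in a script and check the YAML / project-spec exclusivity up front.

import org.broadinstitute.sting.commandline.ArgumentCollection
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection

class ExamplePipelineScript extends QScript {
  @ArgumentCollection var pipelineArgs = new PipelineArgumentCollection

  def script() {
    // Throws IllegalArgumentException unless either -Y was given or the full
    // set of -pBams/-pName/-pRef/-pInt/-pDB project specs was supplied.
    pipelineArgs.verifyArguments
    // ... build the command graph from pipelineArgs.gatkJar, pipelineArgs.yamlFile, etc. ...
  }
}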

ProjectManagement.scala

@@ -1,100 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.util._
import java.io.File
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.queue.function.CommandLineFunction
class ProjectManagement(stingPath: String) {
// TODO -- MAJOR: scatter/gather numbers are hard-coded. Really need a function to set them to the right values. These are tuned to get everything onto 'hour'.
pm =>
var stingDirPath : String = stingPath
def this(f : File) = this(f.getAbsolutePath)
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
@Output(doc="The file to write the site list to") var out_vcf = out_list
@Argument(doc="Path to the SortByRef script") var sortByRef: String = _
@Argument(doc="Path to the reference file on disk") var ref: File = _
def commandLine = {
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
)
}
}
class Vcf2Intervals(in_vcf: File, out_intervals: File) extends CommandLineFunction {
@Input(doc="The VCF to convert") var vcf: File = in_vcf
@Output(doc="The output bed file") var intervals: File = out_intervals
def commandLine = {
"grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(vcf.getAbsolutePath,intervals.getAbsolutePath)
}
}
def MergeBatches( callVCFs: List[File], allBams: List[File], mergedVCF: File, ref: File, size: Int) : List[CommandLineFunction] = {
var cmds : List[CommandLineFunction] = Nil
var pfSites : PassFilterAlleles = new PassFilterAlleles(callVCFs,swapExt(mergedVCF,".vcf",".pf.alleles.vcf"))
pfSites.sortByRef = pm.stingDirPath+"perl/sortByRef.pl"
pfSites.ref = ref
cmds :+= pfSites
var ints : Vcf2Intervals = new Vcf2Intervals(pfSites.out_vcf,swapExt(pfSites.out_vcf,".vcf",".intervals.list"))
cmds :+= ints
var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2))))
cmds ++= calcs
cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, ints.intervals, mergedVCF)
return cmds
}
def LikelihoodCalc( bams: List[File], ref: File, intervals: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = {
var calc = new UGCalcLikelihoods
calc.input_file ++= bams
calc.reference_sequence = ref
calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
calc.downsample_to_coverage = Some(300)
calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6)
calc.scatterCount = if (bams.size < 5 ) 1 else if (bams.size < 50) 60 else 120
calc.min_base_quality_score = Some(22)
calc.min_mapping_quality_score = Some(20)
//calc.genotyping_mode = Option[GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES]
calc.out = outVCF
calc.rodBind :+= new RodBind("allele","VCF",alleleVCF)
calc.intervals :+= intervals
return calc
}
def VariantCallMerge( likelihoods: List[File], ref: File, intervals: File, output: File) : UGCallVariants = {
var call = new UGCallVariants
call.reference_sequence = ref
call.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
call.intervals :+= intervals
call.memoryLimit = Some(8)
call.out = output
call.rodBind ++= likelihoods.map( a => new RodBind("variant"+a.getName.replace(".vcf",""),"vcf",a) )
call.scatterCount = 30
return call
}
def swapExt(file: File, oldExtension: String, newExtension: String) =
new File(file.getName.stripSuffix(oldExtension) + newExtension)
}
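
A sketch of how MergeBatches might have been driven (invented paths; QScript and add() assumed as before):

import java.io.File
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.pipeline.ProjectManagement

class ExampleMergeScript extends QScript {
  def script() {
    // The path must end in "/" because the class appends "perl/sortByRef.pl"
    // and "dist/GenomeAnalysisTK.jar" to it directly.
    val pm = new ProjectManagement("/path/to/Sting/")
    val batchVCFs = List(new File("batch1.vcf"), new File("batch2.vcf"))
    val allBams   = List(new File("batch1.bam"), new File("batch2.bam"))
    // Extract PASSing alleles across the batch VCFs, recompute likelihoods for
    // every bam in groups of 50, then merge the per-group VCFs into one call set.
    pm.MergeBatches(batchVCFs, allBams, new File("merged.vcf"),
                    new File("reference.fasta"), 50).foreach(add(_))
  }
}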

QPipeline.scala

@@ -1,13 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.function.CommandLineFunction
trait QPipeline {
var commands: List[CommandLineFunction] = Nil
def generateCommands
def track // todo -- implement me
def removeIntermediate // todo -- implement me
}
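
For context, a concrete implementation of the trait would have looked roughly like the sketch below (the class and its body are invented; track and removeIntermediate were left as todos in the original):

import org.broadinstitute.sting.queue.pipeline.QPipeline

class ExamplePipeline extends QPipeline {
  // Populate `commands` with the CommandLineFunctions this pipeline should run.
  def generateCommands {
    commands = Nil // e.g. commands :+= someRealignerTargetCreator
  }
  // Both were unimplemented todos in the trait; no-ops here.
  def track {}
  def removeIntermediate {}
}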

VariantCalling.scala

@@ -1,420 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.util._
import java.io.File
import scala.collection.JavaConversions._
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.text.XReadLines
import scala.collection.mutable.HashMap
class VariantCalling(attribs: Pipeline,gatkJar: File) {
vc =>
// load attributes
var attributes = attribs
def this(yaml: File, gatkJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar)
/**
* Trait to propagate basic attributes throughout the library
*/
trait StandardCommandLineGATK extends CommandLineGATK {
this.reference_sequence = vc.attributes.getProject.getReferenceFile
this.intervals = List(vc.attributes.getProject.getIntervalList)
this.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
// set global memory limit on the low side. Additional input bams will affect it.
this.memoryLimit = Some(2)
this.jarFile = vc.gatkJar
}
/**
* @Doc: Adds the trait data to a CommandLineGATK that is passed in
* @Returns: the input CommandLineGATK with the StandardCommandLineGATK data propagated into it
* @TODO: This should be better written; it'd be nice to just call addTrait[T] and get back a T with the StandardCommandLineGATK data
*/
def addTrait[T <: CommandLineGATK](c : T) : T = {
c.reference_sequence = vc.attributes.getProject.getReferenceFile
c.intervals = List(vc.attributes.getProject.getIntervalList)
c.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
// set global memory limit on the low side. Additional input bams will affect it.
c.memoryLimit = Some(2)
c.jarFile = vc.gatkJar
c
}
/**
* @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
* @Return: UnifiedGenotyper with the standard GSA arguments
* @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
*/
def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
var ug = new UnifiedGenotyper with StandardCommandLineGATK
ug.analysisName = "UnifiedGenotyper"
ug.input_file = bams
ug.out = output
ug.downsample_to_coverage = Some(300)
ug.dt = DownsampleType.BY_SAMPLE
ug.scatterCount = 50
if ( bams.size > 40 ) {
ug.memoryLimit = Some(4)
}
if ( bams.size > 90 ) {
ug.memoryLimit = Some(6)
}
if ( bams.size > 140 ) {
ug.memoryLimit = Some(8)
}
return ug
}
/**
* @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
*/
def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
var ig = new IndelGenotyperV2 with StandardCommandLineGATK
ig.analysisName = "IndelGenotyper"
ig.input_file :+= bam
ig.out = output
ig.downsample_to_coverage = Some(300)
return ig
}
/**
* @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
*/
private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
var ig = StandardIndelGenotyper(bam,output)
return ig
}
/**
* @Doc: Combines a list of indel VCFs to a single output file
* @Returns: A CombineVariants CLF with standard GSA arguments
*/
def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
var cv = new CombineVariants with StandardCommandLineGATK
cv.out = output
cv.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY
cv.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
cv.analysisName = "IndelGenotyper"
cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
//cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",")
cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
if ( igList.size > 50 ) {
cv.memoryLimit = Some(4)
}
return cv
}
/**
* @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
* @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
*/
def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
var combine = this.StandardIndelCombine( genotypers, output )
var callFunctions: List[CommandLineGATK] = genotypers
callFunctions = combine :: callFunctions
return callFunctions
}
def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
var iFil = new VariantFiltration with StandardCommandLineGATK
iFil.analysisName = "FilterAtIndels"
iFil.out = output
iFil.mask = indels.getAbsolutePath
iFil.maskName = "Indel_Mask"
iFil.variantVCF = snps
// todo -- cluster size varies with # bams
iFil.clusterSize = Some(5)
iFil.clusterWindowSize = Some(8)
return iFil
}
def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
var hFil = new VariantFiltration with StandardCommandLineGATK
hFil.analysisName = "HandFilter"
hFil.out = output
hFil.variantVCF = snps
hFil.filterExpression :+= "\"QD<5.0\""
hFil.filterName :+= "LowQualByDepth"
hFil.filterExpression :+= "\"SB>-0.10\""
hFil.filterName :+= "HighStrandBias"
return hFil
}
def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
var genC = new GenerateVariantClusters with StandardCommandLineGATK
genC.analysisName = "VariantQualityRecalibration"
genC.rodBind :+= new RodBind("input","VCF",snps)
genC.cluster_file = output
genC.use_annotation :+= "QD"
genC.use_annotation :+= "SB"
genC.use_annotation :+= "HaplotypeScore"
genC.use_annotation :+= "HRun"
return genC
}
def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
out_tranches: File) : VariantRecalibrator = {
var vr = new VariantRecalibrator with StandardCommandLineGATK
vr.analysisName = "VariantQualityRecalibration"
vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
vr.cluster_file = cluster
vr.target_titv = Some(target_titv)
vr.out = out_vcf
vr.tranches_file = out_tranches
vr.tranche :+= "0.1"
vr.tranche :+= "1"
vr.tranche :+= "5"
vr.tranche :+= "10"
return vr
}
def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
var avc = new ApplyVariantCuts with StandardCommandLineGATK
avc.analysisName = "VariantQualityRecalibration"
avc.rodBind :+= new RodBind("input","VCF",snpRecal)
avc.out = output
avc.tranches_file = tranches
avc.fdr_filter_level = Some(5)
return avc
}
def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
swapExt(snps,".vcf",".recal.tranch"))
var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,swapExt(recal.out,".vcf",".tranched.vcf"))
var cmds: List[CommandLineGATK] = Nil
cmds :+= clust
cmds :+= recal
cmds :+= cut
return cmds
}
def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
var ga = new GenomicAnnotator with StandardCommandLineGATK
ga.analysisName = "GenomicAnnotator"
ga.variantVCF = snps
ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
ga.rodToIntervalTrackName = "variant"
ga.out = outputVCF
return ga
}
def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
var commands : List[CommandLineGATK] = Nil
var dir = ""
if ( output.getParent != null ) {
dir = output.getParent+"/"
}
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
var ug = StandardUnifiedGenotyper(bams, raw_snp)
commands :+= ug
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
var ig = StandardIndelCalls(bams,raw_indel)
commands ++= ig
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
commands :+= iFilt
var annoSNP = prefilt_snp
if ( refGene != null ) {
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
commands :+= annotate
}
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
commands ++= recal
return commands
}
def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
var commands : List[CommandLineGATK] = Nil
var dir = ""
if ( recalOut.getParent != null ) {
dir = recalOut.getParent+"/"
}
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
var ug = StandardUnifiedGenotyper(bams, raw_snp)
commands :+= ug
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
var ig = StandardIndelCalls(bams,raw_indel)
commands ++= ig
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
commands :+= iFilt
var annoSNP = prefilt_snp
if ( refGene != null ) {
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
commands :+= annotate
}
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
commands ++= recal
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
commands :+= handFilt
return commands
}
class VCF2Mask extends CommandLineFunction {
@Input(doc="the indel vcf") var indel_vcf : File = _
@Argument(doc="the window size") var win_size : Int = 2
@Output(doc="the mask bed") var out_mask : File = _
def commandLine = { "grep PASS %s | awk '{print $1,$2-%d,$2+%d}' > %s".format(indel_vcf.getAbsolutePath,win_size,win_size,out_mask.getAbsolutePath) }
}
def IndelVCF2Mask(vcf: File, size: Int) : VCF2Mask = {
var masker: VCF2Mask = new VCF2Mask()
masker.indel_vcf = vcf
masker.win_size = size
masker.out_mask = swapExt(vcf,".vcf",".indel_mask.bed")
return masker
}
def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineFunction] = {
var commands : List[CommandLineFunction] = Nil
var dir = ""
if ( recalOut.getParent != null ) {
dir = recalOut.getParent+"/"
}
var raw_snp = new File(dir+vc.attributes.getProject.getName+".raw_snps.vcf")
var ug = StandardUnifiedGenotyper(bams, raw_snp)
commands :+= ug
var raw_indel = indelOut
var ig = StandardIndelCalls(bams,raw_indel)
var indel_mask : VCF2Mask = IndelVCF2Mask(indelOut,5)
commands ++= ig
commands :+= indel_mask
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
var iFilt = StandardFilterAtIndels(raw_snp,indel_mask.out_mask,prefilt_snp)
commands :+= iFilt
var annoSNP = prefilt_snp
if ( refGene != null ) {
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
commands :+= annotate
}
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
commands ++= recal
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
commands :+= handFilt
return commands
}
def swapExt(file: File, oldExtension: String, newExtension: String) =
new File(file.getName.stripSuffix(oldExtension) + newExtension)
}
object VariantCalling {
/**
* @Input - a VCF file (presumably containing a ##UnifiedGenotyper="..." header line)
* @Doc - Constructs a UG object based on the settings in the header of the input VCF
* @Returns an instance of the UG object with all information propagated
*/
def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = {
var myUG : UnifiedGenotyper = new UnifiedGenotyper
var xrl : XReadLines = new XReadLines(vcf)
val header_limit = 1000
var line = ""
var line_no = 1
while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) {
line = xrl.next
line_no += 1
}
var ugLine : String = line
if ( ! ugLine.startsWith("##UnifiedGenotyper") ) {
return myUG // nothing to add
}
var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ")
var ugMap = new HashMap[String,String]
def addEntry( s : String ) : Unit = {
val sps = s.split("=")
val k = sps(0)
val v = sps(1)
val kv = (k,v)
ugMap += kv
}
ugTokens.foreach( addEntry(_) )
myUG.reference_sequence = new File(ugMap("reference_sequence"))
myUG.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq"))
myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble)
myUG.DBSNP = new File(ugMap("DBSNP"))
// todo -- more args
return myUG
}
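
Finally, a sketch of how the calling library above might have been driven end to end (file names and the Ti/Tv target are illustrative; QScript and add() assumed as before):

import java.io.File
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.pipeline.VariantCalling

class ExampleCallingScript extends QScript {
  def script() {
    val vc = new VariantCalling(new File("project.yaml"),
                                new File("dist/GenomeAnalysisTK.jar"))
    val bams = List(new File("sample1.cleaned.bam"), new File("sample2.cleaned.bam"))
    // SNP calling, per-bam indel calling and merging, indel masking, optional
    // refGene annotation, VQSR recalibration, and hand filtering as one command list.
    val jobs = vc.StandardCallingPipeline(
      bams,
      new File("project.indels.vcf"),       // indelOut
      new File("project.recal.snps.vcf"),   // recalOut
      new File("project.handfiltered.vcf"), // handFilteredOut
      targetTiTv = 2.1,                     // roughly the expected whole-genome Ti/Tv
      refGene = new File("refGene.txt"))    // omit (null default) to skip annotation
    jobs.foreach(add(_))
  }
}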
}