diff --git a/build.xml b/build.xml
index e18dfa8de..4e7c642ba 100644
--- a/build.xml
+++ b/build.xml
@@ -298,13 +298,6 @@
-
-
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala
deleted file mode 100755
index 76e5a4fae..000000000
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-package org.broadinstitute.sting.queue.pipeline
-
-import org.broadinstitute.sting.queue.extensions.gatk._
-import java.io.File
-import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
-import org.broadinstitute.sting.queue.function.CommandLineFunction
-import org.broadinstitute.sting.utils.yaml.YamlUtils
-import org.broadinstitute.sting.datasources.pipeline.Pipeline
-import net.sf.picard.reference.ReferenceSequenceFileFactory
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
-import org.broadinstitute.sting.utils.interval.IntervalUtils
-import collection.mutable.{ListBuffer, HashMap}
-import collection.JavaConversions
-import java.util.Arrays
-import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
-import org.broadinstitute.sting.commandline.{Output, Input}
-import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
-
-class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) {
- library =>
-
- var attributes : Pipeline = attribs
-
- def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar)
-
-
- trait StandardCommandLineGATK extends CommandLineGATK {
- this.reference_sequence = library.attributes.getProject.getReferenceFile
- this.intervals = List(library.attributes.getProject.getIntervalList)
- this.rodBind :+= new RodBind("dbsnp", library.attributes.getProject.getGenotypeDbsnpType, library.attributes.getProject.getGenotypeDbsnp)
- this.memoryLimit = Some(2)
- this.jarFile = library.gatkJar
- }
-
- /**
- * @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
- * @Returns: A CLF for the realigner target creator job
- */
- def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
- var rtc = new RealignerTargetCreator with StandardCommandLineGATK
- rtc.intervals = Nil
- rtc.intervalsString = contigs
- rtc.input_file :+= bam
- rtc.out = output
- rtc.analysisName = "RealignerTargetCreator"
-
- return rtc
- }
-
- /**
- * @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file
- * @Returns: A CLF for the indel cleaning job
- */
- def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
- var realigner = new IndelRealigner with StandardCommandLineGATK
- realigner.intervalsString = contigs
- realigner.intervals = Nil
- realigner.input_file :+= bam
- realigner.out = outBam
- realigner.targetIntervals = targets
- realigner.analysisName = "IndelClean"
- realigner.bam_compression = Some(0)
-
- return realigner
- }
-
- /**
- * @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
- * @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
- */
- def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
- var cmds : List[CommandLineFunction] = Nil
- var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
- ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
- )
- var bamsToMerge : List[File] = Nil
- for ( spec <- jobSpecs ) {
- cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
- bamsToMerge :+= spec._2
- }
-
- cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
-
- return cmds
-
- }
-
- /**
- * @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
- * command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
- * of command line functions
- * @Returns: A list of command line functions for the full indel realignment pipeline from the collection
- * of uncleaned bams to the collection of cleaned bams
- */
- def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
- val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
- var commands : List[CommandLineFunction] = Nil
- for ( bamPair <- bamsUncleanCleanPairs ) {
- val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
- val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
- val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
- sam.bamFile = bamPair._2
- sam.analysisName = "SamtoolsIndex"
- commands :+= rtc
- commands ++= icbs
- commands :+= sam
- }
-
- return commands
- }
-
- /**
- * @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
- * @Returns: Command line function for the merge, fix-mate, and sort operation
- */
- def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
- var pfm : PicardFixMates = new PicardFixMates
- pfm.bams = inBams
- pfm.outBam = outBam
- pfm.jarFile = picardJar
- pfm.assumeSorted = Some(false)
- pfm.memoryLimit = Some(4)
- pfm.analysisName = "FixMates"
-
- return pfm
- }
-
- class PicardFixMates extends PicardBamFunction {
- @Input(doc="input bam files") var bams: List[File] = Nil
- @Output(doc="output bam file") var outBam: File = null
-
- def inputBams: List[File] = bams
- def outputBam: File = outBam
-
- }
-
-
- def swapExt(file: File, oldExtension: String, newExtension: String) =
- new File(file.getName.stripSuffix(oldExtension) + newExtension)
-
-}
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala
deleted file mode 100755
index 0a0136923..000000000
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala
+++ /dev/null
@@ -1,85 +0,0 @@
-package org.broadinstitute.sting.queue.pipeline
-
-import org.broadinstitute.sting.commandline.{Input, Argument, Hidden}
-import java.io.File
-
-
-class PipelineArgumentCollection {
-
- @Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false)
- var cleaningJobs: Int = 1
-
- @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y", required=false)
- var yamlFile: File = _
-
- @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
- var trigger: File = _
-
- @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false)
- var refseqTable: File = _
-
- @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
- var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
-
- @Input(doc="path to GATK jar", shortName="gatk")
- var gatkJar: File = _
-
- @Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
- var target_titv: Float = _
-
- @Input(doc="per-sample downsampling level",shortName="dcov",required=false)
- var downsampling_coverage = 300
-
- @Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
- var num_snp_scatter_jobs = 20
-
- @Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
- var num_indel_scatter_jobs = 5
-
- @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
- var skip_cleaning = false
-
- @Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unreacalibrated:/path/to/unrecal.bam). Mutually exclusive with YAML",
- required=false, shortName="pBams")
- var projectBams: File = _
-
- @Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName")
- var projectName: String = _
-
- @Input(doc="The reference file. Mutually exclusive with YAML.", required=false, shortName="pRef")
- var projectRef: File = _
-
- @Input(doc="The project interval list. Mutually exclusive with YAML.", required=false, shortName="pInt")
- var projectIntervals: File = _
-
- @Input(doc="The project dbsnp. Mutually exclusive with YAML",required=false, shortName="pDB")
- var projectDBSNP: File = _
-
- //@Input(doc="ADPR script")
- //var adprScript: File = _
-
- //@Input(doc="Sequencing maching name (for use by adpr)")
- //var machine: String = _
-
- //@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
- //var protocol: String = _
-
- def verifyArguments = {
- // if no yaml file we require all the mutually exclusive arguments
- if ( yamlFile == null ) {
- if ( projectBams == null ) throw new IllegalArgumentException("No YAML file provided; and no project bam list. Pipeline requires either YAML file, or full project specs.")
-
- if ( projectName == null ) throw new IllegalArgumentException("No YAML file provided; and no project name. Pipeline requires either YAML file, or full project specs.")
-
- if ( projectRef == null) throw new IllegalArgumentException("No YAML file provided; and no project reference. Pipeline requires either YAML file, or full project specs.")
-
- if ( projectIntervals == null ) throw new IllegalArgumentException("No YAML file provided; and no project intervals. Pipeline requires either YAML file, or full project specs.")
-
- if ( projectDBSNP == null ) throw new IllegalArgumentException("No YAML file provided; and no project DBSNP. Pipeline requires either YAML file, or full project specs.")
- } else {
- if ( projectBams != null || projectName != null || projectRef != null || projectIntervals != null || projectDBSNP != null )
- throw new IllegalArgumentException("YAML file provided, along with other project spec arguments. YAML file is mutually exclusive with project specs.")
- }
- }
-
-}
\ No newline at end of file
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala
deleted file mode 100755
index 1cb4b82af..000000000
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala
+++ /dev/null
@@ -1,100 +0,0 @@
-package org.broadinstitute.sting.queue.pipeline
-
-import org.broadinstitute.sting.commandline._
-import org.broadinstitute.sting.queue.util._
-import java.io.File
-import org.broadinstitute.sting.datasources.pipeline.Pipeline
-import org.broadinstitute.sting.gatk.DownsampleType
-import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
-import org.broadinstitute.sting.queue.extensions.gatk._
-import org.broadinstitute.sting.utils.yaml.YamlUtils
-import org.broadinstitute.sting.queue.function.CommandLineFunction
-
-class ProjectManagement(stingPath: String) {
- // TODO -- MAJOR scatter/gather numbers are hard-coded. Really need a function to set it to the right number. These are optimized to get everything on 'hour'
-
- pm =>
-
- var stingDirPath : String = stingPath
-
- def this(f : File) = this(f.getAbsolutePath)
-
- class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
- @Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
- @Output(doc="The file to write the site list to") var out_vcf = out_list
- @Argument(doc="Path to the SortByRef script") var sortByRef: String = _
- @Argument(doc="Path to the reference file on disk") var ref: File = _
-
- def commandLine = {
- "egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
- vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
- )
- }
- }
-
- class Vcf2Intervals(in_vcf: File, out_intervals: File) extends CommandLineFunction {
- @Input(doc="The VCF to convert") var vcf: File = in_vcf
- @Output(doc="The output bed file") var intervals: File = out_intervals
-
- def commandLine = {
- "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(vcf.getAbsolutePath,intervals.getAbsolutePath)
- }
- }
-
- def MergeBatches( callVCFs: List[File], allBams: List[File], mergedVCF: File, ref: File, size: Int) : List[CommandLineFunction] = {
- var cmds : List[CommandLineFunction] = Nil
- var pfSites : PassFilterAlleles = new PassFilterAlleles(callVCFs,swapExt(mergedVCF,".vcf",".pf.alleles.vcf"))
- pfSites.sortByRef = pm.stingDirPath+"perl/sortByRef.pl"
- pfSites.ref = ref
-
- cmds :+= pfSites
-
- var ints : Vcf2Intervals = new Vcf2Intervals(pfSites.out_vcf,swapExt(pfSites.out_vcf,".vcf",".intervals.list"))
-
- cmds :+= ints
-
- var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2))))
-
- cmds ++= calcs
-
- cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, ints.intervals, mergedVCF)
-
- return cmds
-
- }
-
- def LikelihoodCalc( bams: List[File], ref: File, intervals: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = {
- var calc = new UGCalcLikelihoods
- calc.input_file ++= bams
- calc.reference_sequence = ref
- calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
- calc.downsample_to_coverage = Some(300)
- calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6)
- calc.scatterCount = if (bams.size < 5 ) 1 else if (bams.size < 50) 60 else 120
- calc.min_base_quality_score = Some(22)
- calc.min_mapping_quality_score = Some(20)
- //calc.genotyping_mode = Option[GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES]
- calc.out = outVCF
- calc.rodBind :+= new RodBind("allele","VCF",alleleVCF)
- calc.intervals :+= intervals
-
- return calc
- }
-
- def VariantCallMerge( likelihoods: List[File], ref: File, intervals: File, output: File) : UGCallVariants = {
- var call = new UGCallVariants
- call.reference_sequence = ref
- call.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
- call.intervals :+= intervals
- call.memoryLimit = Some(8)
- call.out = output
- call.rodBind ++= likelihoods.map( a => new RodBind("variant"+a.getName.replace(".vcf",""),"vcf",a) )
- call.scatterCount = 30
-
- return call
- }
-
- def swapExt(file: File, oldExtension: String, newExtension: String) =
- new File(file.getName.stripSuffix(oldExtension) + newExtension)
-
-}
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala
deleted file mode 100755
index 8cbff5f94..000000000
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala
+++ /dev/null
@@ -1,13 +0,0 @@
-package org.broadinstitute.sting.queue.pipeline
-
-import org.broadinstitute.sting.queue.function.CommandLineFunction
-
-trait QPipeline {
- var commands: List[CommandLineFunction] = Nil
-
- def generateCommands
-
- def track // todo -- implement me
-
- def removeIntermediate // todo -- implement me
-}
\ No newline at end of file
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala
deleted file mode 100755
index 12afd50d5..000000000
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala
+++ /dev/null
@@ -1,420 +0,0 @@
-package org.broadinstitute.sting.queue.pipeline
-import org.broadinstitute.sting.commandline._
-import org.broadinstitute.sting.queue.util._
-import java.io.File
-import scala.collection.JavaConversions._
-import org.broadinstitute.sting.datasources.pipeline.Pipeline
-import org.broadinstitute.sting.gatk.DownsampleType
-import org.broadinstitute.sting.queue.extensions.gatk._
-import org.broadinstitute.sting.utils.yaml.YamlUtils
-import org.broadinstitute.sting.queue.function.CommandLineFunction
-import org.broadinstitute.sting.utils.text.XReadLines
-import scala.collection.mutable.HashMap
-
-class VariantCalling(attribs: Pipeline,gatkJar: File) {
- vc =>
-
- // load attributes
- var attributes = attribs
-
- def this(yaml: File, gatkJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar)
-
- /**
- * Trait to propagate basic attributes throughout the library
- */
- trait StandardCommandLineGATK extends CommandLineGATK {
- this.reference_sequence = vc.attributes.getProject.getReferenceFile
- this.intervals = List(vc.attributes.getProject.getIntervalList)
- this.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
- // set global memory limit on the low side. Additional input bams will affect it.
- this.memoryLimit = Some(2)
- this.jarFile = vc.gatkJar
- }
-
- /**
- * @Doc: Adds the trait data to a command line gatk that is passed in
- * @Return: the input CLGATK with the SCLGATK data propagated into it
- * @TODO: This should be better written, it'd be nice just to call it with addTrait[T], and return a T with SCLGATK
- */
- def addTrait[T <: CommandLineGATK](c : T) : T = {
- c.reference_sequence = vc.attributes.getProject.getReferenceFile
- c.intervals = List(vc.attributes.getProject.getIntervalList)
- c.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp)
- // set global memory limit on the low side. Additional input bams will affect it.
- c.memoryLimit = Some(2)
- c.jarFile = vc.gatkJar
- c
- }
-
- /**
- * @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
- * @Return: UnifiedGenotyper with the standard GSA arguments
- * @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
- */
- def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
- var ug = new UnifiedGenotyper with StandardCommandLineGATK
- ug.analysisName = "UnifiedGenotyper"
- ug.input_file = bams
- ug.out = output
- ug.downsample_to_coverage = Some(300)
- ug.dt = DownsampleType.BY_SAMPLE
- ug.scatterCount = 50
-
- if ( bams.size > 40 ) {
- ug.memoryLimit = Some(4)
- }
-
- if ( bams.size > 90 ) {
- ug.memoryLimit = Some(6)
- }
-
- if ( bams.size > 140 ) {
- ug.memoryLimit = Some(8)
- }
-
- return ug
-
- }
-
- /**
- * @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
- * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
- */
- def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
- var ig = new IndelGenotyperV2 with StandardCommandLineGATK
- ig.analysisName = "IndelGenotyper"
- ig.input_file :+= bam
- ig.out = output
- ig.downsample_to_coverage = Some(300)
- return ig
- }
-
- /**
- * @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
- * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
- */
- private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
- var ig = StandardIndelGenotyper(bam,output)
- return ig
- }
-
- /**
- * @Doc: Combines a list of indel VCFs to a single output file
- * @Returns: A CombineVariants CLF with standard GSA arguments
- */
- def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
- var cv = new CombineVariants with StandardCommandLineGATK
- cv.out = output
- cv.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY
- cv.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
- cv.analysisName = "IndelGenotyper"
- cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
- //cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",")
- cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
-
- if ( igList.size > 50 ) {
- cv.memoryLimit = Some(4)
- }
-
- return cv
-
- }
-
- /**
- * @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
- * @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
- */
- def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
- var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
- var combine = this.StandardIndelCombine( genotypers, output )
- var callFunctions: List[CommandLineGATK] = genotypers
- callFunctions = combine :: callFunctions
-
- return callFunctions
- }
-
- def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
- var iFil = new VariantFiltration with StandardCommandLineGATK
- iFil.analysisName = "FilterAtIndels"
- iFil.out = output
- iFil.mask = indels.getAbsolutePath
- iFil.maskName = "Indel_Mask"
- iFil.variantVCF = snps
- // todo -- cluster size varies with # bams
- iFil.clusterSize = Some(5)
- iFil.clusterWindowSize = Some(8)
-
- return iFil
- }
-
- def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
- var hFil = new VariantFiltration with StandardCommandLineGATK
- hFil.analysisName = "HandFilter"
- hFil.out = output
- hFil.variantVCF = snps
- hFil.filterExpression :+= "\"QD<5.0\""
- hFil.filterName :+= "LowQualByDepth"
- hFil.filterExpression :+= "\"SB>-0.10\""
- hFil.filterName :+= "HighStrandBias"
-
- return hFil
- }
-
- def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
- var genC = new GenerateVariantClusters with StandardCommandLineGATK
- genC.analysisName = "VariantQualityRecalibration"
- genC.rodBind :+= new RodBind("input","VCF",snps)
- genC.cluster_file = output
- genC.use_annotation :+= "QD"
- genC.use_annotation :+= "SB"
- genC.use_annotation :+= "HaplotypeScore"
- genC.use_annotation :+= "HRun"
-
- return genC
- }
-
- def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
- out_tranches: File) : VariantRecalibrator = {
- var vr = new VariantRecalibrator with StandardCommandLineGATK
- vr.analysisName = "VariantQualityRecalibration"
- vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
- vr.cluster_file = cluster
- vr.target_titv = Some(target_titv)
- vr.out = out_vcf
- vr.tranches_file = out_tranches
- vr.tranche :+= "0.1"
- vr.tranche :+= "1"
- vr.tranche :+= "5"
- vr.tranche :+= "10"
-
- return vr
-
- }
-
- def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
- var avc = new ApplyVariantCuts with StandardCommandLineGATK
- avc.analysisName = "VariantQualityRecalibration"
- avc.rodBind :+= new RodBind("input","VCF",snpRecal)
- avc.out = output
- avc.tranches_file = tranches
- avc.fdr_filter_level = Some(5)
-
- return avc
- }
-
- def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
- var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
- var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
- swapExt(snps,".vcf",".recal.tranch"))
- var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,swapExt(recal.out,".vcf",".tranched.vcf"))
-
- var cmds: List[CommandLineGATK] = Nil
- cmds :+= clust
- cmds :+= recal
- cmds :+= cut
-
- return cmds
- }
-
- def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
- var ga = new GenomicAnnotator with StandardCommandLineGATK
- ga.analysisName = "GenomicAnnotator"
- ga.variantVCF = snps
- ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
- ga.rodToIntervalTrackName = "variant"
- ga.out = outputVCF
-
- return ga
- }
-
- def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
- var commands : List[CommandLineGATK] = Nil
- var dir = ""
-
- if ( output.getParent != null ) {
- dir = output.getParent+"/"
- }
-
- var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
- var ug = StandardUnifiedGenotyper(bams, raw_snp)
-
- commands :+= ug
-
- var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
- var ig = StandardIndelCalls(bams,raw_indel)
-
- commands ++= ig
-
- var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
- var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
-
- commands :+= iFilt
-
- var annoSNP = prefilt_snp
- if ( refGene != null ) {
- annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
- var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
-
- commands :+= annotate
- }
-
- var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
-
- commands ++= recal
-
- return commands
- }
-
- def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
- var commands : List[CommandLineGATK] = Nil
- var dir = ""
-
- if ( recalOut.getParent != null ) {
- dir = recalOut.getParent+"/"
- }
-
- var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
- var ug = StandardUnifiedGenotyper(bams, raw_snp)
-
- commands :+= ug
-
- var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
- var ig = StandardIndelCalls(bams,raw_indel)
-
- commands ++= ig
-
- var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
- var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
-
- commands :+= iFilt
-
- var annoSNP = prefilt_snp
- if ( refGene != null ) {
- annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
- var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
-
- commands :+= annotate
- }
-
- var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
-
- commands ++= recal
-
- var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
-
- commands :+= handFilt
-
- return commands
- }
-
- class VCF2Mask extends CommandLineFunction {
- @Input(doc="the indel vcf") var indel_vcf : File = _
- @Argument(doc="the window size") var win_size : Int = 2
- @Output(doc="the mask bed") var out_mask : File = _
-
- def commandLine = { "grep PASS %s | awk '{print $1,$2-%d,$2+%d}' > %s".format(indel_vcf.getAbsolutePath,win_size,win_size,out_mask.getAbsolutePath) }
- }
-
- def IndelVCF2Mask(vcf: File, size: Int) : VCF2Mask = {
- var masker: VCF2Mask = new VCF2Mask()
- masker.indel_vcf = vcf
- masker.win_size = size
- masker.out_mask = swapExt(vcf,".vcf",".indel_mask.bed")
-
- return masker
- }
-
- def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineFunction] = {
- var commands : List[CommandLineFunction] = Nil
- var dir = ""
-
- if ( recalOut.getParent != null ) {
- dir = recalOut.getParent+"/"
- }
-
- var raw_snp = new File(dir+vc.attributes.getProject.getName+".raw_snps.vcf")
- var ug = StandardUnifiedGenotyper(bams, raw_snp)
-
- commands :+= ug
-
- var raw_indel = indelOut
- var ig = StandardIndelCalls(bams,raw_indel)
-
- var indel_mask : VCF2Mask = IndelVCF2Mask(indelOut,5)
-
- commands ++= ig
- commands :+= indel_mask
-
- var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
- var iFilt = StandardFilterAtIndels(raw_snp,indel_mask.out_mask,prefilt_snp)
-
- commands :+= iFilt
-
- var annoSNP = prefilt_snp
- if ( refGene != null ) {
- annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
- var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
-
- commands :+= annotate
- }
-
- var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
-
- commands ++= recal
-
- var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
-
- commands :+= handFilt
-
- return commands
- }
-
- def swapExt(file: File, oldExtension: String, newExtension: String) =
- new File(file.getName.stripSuffix(oldExtension) + newExtension)
-
-}
-
-object VariantCalling {
-
- /**
- * @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line
- * @Doc - Constructs a UG object based on the settings in the header of the input VCF
- * @Returns an instance of the UG object with all information propagated
- */
- def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = {
- var myUG : UnifiedGenotyper = new UnifiedGenotyper
- var xrl : XReadLines = new XReadLines(vcf)
- val header_limit = 1000
- var line = ""
- var line_no = 1
- while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) {
- line = xrl.next
- line_no += 1
- }
- var ugLine : String = line
- if ( ! ugLine.startsWith("##UnifiedGenotyper") ) {
- return myUG // nothing to add
- }
-
- var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ")
- var ugMap = new HashMap[String,String]
- def addEntry( s : String ) : Unit = {
- val sps = s.split("=")
- val k = sps(0)
- val v = sps(1)
- val kv = (k,v)
- ugMap += kv
- }
- ugTokens.foreach( addEntry(_) )
-
- myUG.reference_sequence = new File(ugMap("reference_sequence"))
- myUG.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq"))
- myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble)
- myUG.DBSNP = new File(ugMap("DBSNP"))
-
- // todo -- more args
-
- return myUG
-
- }
-}