diff --git a/build.xml b/build.xml index e18dfa8de..4e7c642ba 100644 --- a/build.xml +++ b/build.xml @@ -298,13 +298,6 @@ - - diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala deleted file mode 100755 index 76e5a4fae..000000000 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala +++ /dev/null @@ -1,141 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline - -import org.broadinstitute.sting.queue.extensions.gatk._ -import java.io.File -import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.CommandLineFunction -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import net.sf.picard.reference.ReferenceSequenceFileFactory -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import org.broadinstitute.sting.utils.interval.IntervalUtils -import collection.mutable.{ListBuffer, HashMap} -import collection.JavaConversions -import java.util.Arrays -import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils} -import org.broadinstitute.sting.commandline.{Output, Input} -import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction - -class BamProcessing(attribs: Pipeline, gatkJar: File, fixMatesJar: File) { - library => - - var attributes : Pipeline = attribs - - def this(yaml: File, gatkJar: File, fixMatesJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar,fixMatesJar) - - - trait StandardCommandLineGATK extends CommandLineGATK { - this.reference_sequence = library.attributes.getProject.getReferenceFile - this.intervals = List(library.attributes.getProject.getIntervalList) - this.rodBind :+= new RodBind("dbsnp", library.attributes.getProject.getGenotypeDbsnpType, library.attributes.getProject.getGenotypeDbsnp) - this.memoryLimit = Some(2) - this.jarFile = library.gatkJar - } - - /** - * @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run - * @Returns: A CLF for the realigner target creator job - */ - def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = { - var rtc = new RealignerTargetCreator with StandardCommandLineGATK - rtc.intervals = Nil - rtc.intervalsString = contigs - rtc.input_file :+= bam - rtc.out = output - rtc.analysisName = "RealignerTargetCreator" - - return rtc - } - - /** - * @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file - * @Returns: A CLF for the indel cleaning job - */ - def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = { - var realigner = new IndelRealigner with StandardCommandLineGATK - realigner.intervalsString = contigs - realigner.intervals = Nil - realigner.input_file :+= bam - realigner.out = outBam - realigner.targetIntervals = targets - realigner.analysisName = "IndelClean" - realigner.bam_compression = Some(0) - - return realigner - } - - /** - * @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to - * @Returns: A list of CLFs (todo -- wrapped in a Pipeline) - */ - def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = { - var cmds : List[CommandLineFunction] = Nil - var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]]( - ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) } - ) - var bamsToMerge : List[File] = Nil - for ( spec <- jobSpecs ) { - cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2) - bamsToMerge :+= spec._2 - } - - cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar) - - return cmds - - } - - /** - * @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of - * command line functions to do the target-creating, splitting, cleaning, and merging, returning that list - * of command line functions - * @Returns: A list of command line functions for the full indel realignment pipeline from the collection - * of uncleaned bams to the collection of cleaned bams - */ - def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = { - val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs) - var commands : List[CommandLineFunction] = Nil - for ( bamPair <- bamsUncleanCleanPairs ) { - val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") ) - val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2) - val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction - sam.bamFile = bamPair._2 - sam.analysisName = "SamtoolsIndex" - commands :+= rtc - commands ++= icbs - commands :+= sam - } - - return commands - } - - /** - * @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted - * @Returns: Command line function for the merge, fix-mate, and sort operation - */ - def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = { - var pfm : PicardFixMates = new PicardFixMates - pfm.bams = inBams - pfm.outBam = outBam - pfm.jarFile = picardJar - pfm.assumeSorted = Some(false) - pfm.memoryLimit = Some(4) - pfm.analysisName = "FixMates" - - return pfm - } - - class PicardFixMates extends PicardBamFunction { - @Input(doc="input bam files") var bams: List[File] = Nil - @Output(doc="output bam file") var outBam: File = null - - def inputBams: List[File] = bams - def outputBam: File = outBam - - } - - - def swapExt(file: File, oldExtension: String, newExtension: String) = - new File(file.getName.stripSuffix(oldExtension) + newExtension) - -} diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala deleted file mode 100755 index 0a0136923..000000000 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline - -import org.broadinstitute.sting.commandline.{Input, Argument, Hidden} -import java.io.File - - -class PipelineArgumentCollection { - - @Argument(doc="Number of cleaning jobs", shortName="cleaningJobs", required=false) - var cleaningJobs: Int = 1 - - @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y", required=false) - var yamlFile: File = _ - - @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false) - var trigger: File = _ - - @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false) - var refseqTable: File = _ - - @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false) - var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar") - - @Input(doc="path to GATK jar", shortName="gatk") - var gatkJar: File = _ - - @Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true) - var target_titv: Float = _ - - @Input(doc="per-sample downsampling level",shortName="dcov",required=false) - var downsampling_coverage = 300 - - @Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false) - var num_snp_scatter_jobs = 20 - - @Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false) - var num_indel_scatter_jobs = 5 - - @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false) - var skip_cleaning = false - - @Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unreacalibrated:/path/to/unrecal.bam). Mutually exclusive with YAML", - required=false, shortName="pBams") - var projectBams: File = _ - - @Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName") - var projectName: String = _ - - @Input(doc="The reference file. Mutually exclusive with YAML.", required=false, shortName="pRef") - var projectRef: File = _ - - @Input(doc="The project interval list. Mutually exclusive with YAML.", required=false, shortName="pInt") - var projectIntervals: File = _ - - @Input(doc="The project dbsnp. Mutually exclusive with YAML",required=false, shortName="pDB") - var projectDBSNP: File = _ - - //@Input(doc="ADPR script") - //var adprScript: File = _ - - //@Input(doc="Sequencing maching name (for use by adpr)") - //var machine: String = _ - - //@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection") - //var protocol: String = _ - - def verifyArguments = { - // if no yaml file we require all the mutually exclusive arguments - if ( yamlFile == null ) { - if ( projectBams == null ) throw new IllegalArgumentException("No YAML file provided; and no project bam list. Pipeline requires either YAML file, or full project specs.") - - if ( projectName == null ) throw new IllegalArgumentException("No YAML file provided; and no project name. Pipeline requires either YAML file, or full project specs.") - - if ( projectRef == null) throw new IllegalArgumentException("No YAML file provided; and no project reference. Pipeline requires either YAML file, or full project specs.") - - if ( projectIntervals == null ) throw new IllegalArgumentException("No YAML file provided; and no project intervals. Pipeline requires either YAML file, or full project specs.") - - if ( projectDBSNP == null ) throw new IllegalArgumentException("No YAML file provided; and no project DBSNP. Pipeline requires either YAML file, or full project specs.") - } else { - if ( projectBams != null || projectName != null || projectRef != null || projectIntervals != null || projectDBSNP != null ) - throw new IllegalArgumentException("YAML file provided, along with other project spec arguments. YAML file is mutually exclusive with project specs.") - } - } - -} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala deleted file mode 100755 index 1cb4b82af..000000000 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala +++ /dev/null @@ -1,100 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline - -import org.broadinstitute.sting.commandline._ -import org.broadinstitute.sting.queue.util._ -import java.io.File -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.queue.function.CommandLineFunction - -class ProjectManagement(stingPath: String) { - // TODO -- MAJOR scatter/gather numbers are hard-coded. Really need a function to set it to the right number. These are optimized to get everything on 'hour' - - pm => - - var stingDirPath : String = stingPath - - def this(f : File) = this(f.getAbsolutePath) - - class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction { - @Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files - @Output(doc="The file to write the site list to") var out_vcf = out_list - @Argument(doc="Path to the SortByRef script") var sortByRef: String = _ - @Argument(doc="Path to the reference file on disk") var ref: File = _ - - def commandLine = { - "egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format( - vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath - ) - } - } - - class Vcf2Intervals(in_vcf: File, out_intervals: File) extends CommandLineFunction { - @Input(doc="The VCF to convert") var vcf: File = in_vcf - @Output(doc="The output bed file") var intervals: File = out_intervals - - def commandLine = { - "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(vcf.getAbsolutePath,intervals.getAbsolutePath) - } - } - - def MergeBatches( callVCFs: List[File], allBams: List[File], mergedVCF: File, ref: File, size: Int) : List[CommandLineFunction] = { - var cmds : List[CommandLineFunction] = Nil - var pfSites : PassFilterAlleles = new PassFilterAlleles(callVCFs,swapExt(mergedVCF,".vcf",".pf.alleles.vcf")) - pfSites.sortByRef = pm.stingDirPath+"perl/sortByRef.pl" - pfSites.ref = ref - - cmds :+= pfSites - - var ints : Vcf2Intervals = new Vcf2Intervals(pfSites.out_vcf,swapExt(pfSites.out_vcf,".vcf",".intervals.list")) - - cmds :+= ints - - var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2)))) - - cmds ++= calcs - - cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, ints.intervals, mergedVCF) - - return cmds - - } - - def LikelihoodCalc( bams: List[File], ref: File, intervals: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = { - var calc = new UGCalcLikelihoods - calc.input_file ++= bams - calc.reference_sequence = ref - calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar") - calc.downsample_to_coverage = Some(300) - calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6) - calc.scatterCount = if (bams.size < 5 ) 1 else if (bams.size < 50) 60 else 120 - calc.min_base_quality_score = Some(22) - calc.min_mapping_quality_score = Some(20) - //calc.genotyping_mode = Option[GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES] - calc.out = outVCF - calc.rodBind :+= new RodBind("allele","VCF",alleleVCF) - calc.intervals :+= intervals - - return calc - } - - def VariantCallMerge( likelihoods: List[File], ref: File, intervals: File, output: File) : UGCallVariants = { - var call = new UGCallVariants - call.reference_sequence = ref - call.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar") - call.intervals :+= intervals - call.memoryLimit = Some(8) - call.out = output - call.rodBind ++= likelihoods.map( a => new RodBind("variant"+a.getName.replace(".vcf",""),"vcf",a) ) - call.scatterCount = 30 - - return call - } - - def swapExt(file: File, oldExtension: String, newExtension: String) = - new File(file.getName.stripSuffix(oldExtension) + newExtension) - -} diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala deleted file mode 100755 index 8cbff5f94..000000000 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala +++ /dev/null @@ -1,13 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline - -import org.broadinstitute.sting.queue.function.CommandLineFunction - -trait QPipeline { - var commands: List[CommandLineFunction] = Nil - - def generateCommands - - def track // todo -- implement me - - def removeIntermediate // todo -- implement me -} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala deleted file mode 100755 index 12afd50d5..000000000 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala +++ /dev/null @@ -1,420 +0,0 @@ -package org.broadinstitute.sting.queue.pipeline -import org.broadinstitute.sting.commandline._ -import org.broadinstitute.sting.queue.util._ -import java.io.File -import scala.collection.JavaConversions._ -import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.utils.yaml.YamlUtils -import org.broadinstitute.sting.queue.function.CommandLineFunction -import org.broadinstitute.sting.utils.text.XReadLines -import scala.collection.mutable.HashMap - -class VariantCalling(attribs: Pipeline,gatkJar: File) { - vc => - - // load attributes - var attributes = attribs - - def this(yaml: File, gatkJar: File) = this(YamlUtils.load(classOf[Pipeline],yaml),gatkJar) - - /** - * Trait to propagate basic attributes throughout the library - */ - trait StandardCommandLineGATK extends CommandLineGATK { - this.reference_sequence = vc.attributes.getProject.getReferenceFile - this.intervals = List(vc.attributes.getProject.getIntervalList) - this.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp) - // set global memory limit on the low side. Additional input bams will affect it. - this.memoryLimit = Some(2) - this.jarFile = vc.gatkJar - } - - /** - * @Doc: Adds the trait data to a command line gatk that is passed in - * @Return: the input CLGATK with the SCLGATK data propagated into it - * @TODO: This should be better written, it'd be nice just to call it with addTrait[T], and return a T with SCLGATK - */ - def addTrait[T <: CommandLineGATK](c : T) : T = { - c.reference_sequence = vc.attributes.getProject.getReferenceFile - c.intervals = List(vc.attributes.getProject.getIntervalList) - c.rodBind :+= new RodBind("dbsnp", vc.attributes.getProject.getGenotypeDbsnpType, vc.attributes.getProject.getGenotypeDbsnp) - // set global memory limit on the low side. Additional input bams will affect it. - c.memoryLimit = Some(2) - c.jarFile = vc.gatkJar - c - } - - /** - * @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file - * @Return: UnifiedGenotyper with the standard GSA arguments - * @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information - */ - def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = { - var ug = new UnifiedGenotyper with StandardCommandLineGATK - ug.analysisName = "UnifiedGenotyper" - ug.input_file = bams - ug.out = output - ug.downsample_to_coverage = Some(300) - ug.dt = DownsampleType.BY_SAMPLE - ug.scatterCount = 50 - - if ( bams.size > 40 ) { - ug.memoryLimit = Some(4) - } - - if ( bams.size > 90 ) { - ug.memoryLimit = Some(6) - } - - if ( bams.size > 140 ) { - ug.memoryLimit = Some(8) - } - - return ug - - } - - /** - * @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file - * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments - */ - def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = { - var ig = new IndelGenotyperV2 with StandardCommandLineGATK - ig.analysisName = "IndelGenotyper" - ig.input_file :+= bam - ig.out = output - ig.downsample_to_coverage = Some(300) - return ig - } - - /** - * @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline - * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job - */ - private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = { - var ig = StandardIndelGenotyper(bam,output) - return ig - } - - /** - * @Doc: Combines a list of indel VCFs to a single output file - * @Returns: A CombineVariants CLF with standard GSA arguments - */ - def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = { - var cv = new CombineVariants with StandardCommandLineGATK - cv.out = output - cv.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY - cv.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION - cv.analysisName = "IndelGenotyper" - cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",") - //cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",") - cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out)) - - if ( igList.size > 50 ) { - cv.memoryLimit = Some(4) - } - - return cv - - } - - /** - * @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline. - * @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names. - */ - def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = { - var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false))) - var combine = this.StandardIndelCombine( genotypers, output ) - var callFunctions: List[CommandLineGATK] = genotypers - callFunctions = combine :: callFunctions - - return callFunctions - } - - def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = { - var iFil = new VariantFiltration with StandardCommandLineGATK - iFil.analysisName = "FilterAtIndels" - iFil.out = output - iFil.mask = indels.getAbsolutePath - iFil.maskName = "Indel_Mask" - iFil.variantVCF = snps - // todo -- cluster size varies with # bams - iFil.clusterSize = Some(5) - iFil.clusterWindowSize = Some(8) - - return iFil - } - - def StandardHandfilter( snps: File, output: File ) : VariantFiltration = { - var hFil = new VariantFiltration with StandardCommandLineGATK - hFil.analysisName = "HandFilter" - hFil.out = output - hFil.variantVCF = snps - hFil.filterExpression :+= "\"QD<5.0\"" - hFil.filterName :+= "LowQualByDepth" - hFil.filterExpression :+= "\"SB>-0.10\"" - hFil.filterName :+= "HighStrandBias" - - return hFil - } - - def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = { - var genC = new GenerateVariantClusters with StandardCommandLineGATK - genC.analysisName = "VariantQualityRecalibration" - genC.rodBind :+= new RodBind("input","VCF",snps) - genC.cluster_file = output - genC.use_annotation :+= "QD" - genC.use_annotation :+= "SB" - genC.use_annotation :+= "HaplotypeScore" - genC.use_annotation :+= "HRun" - - return genC - } - - def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File, - out_tranches: File) : VariantRecalibrator = { - var vr = new VariantRecalibrator with StandardCommandLineGATK - vr.analysisName = "VariantQualityRecalibration" - vr.rodBind :+= new RodBind("input","VCF",raw_vcf) - vr.cluster_file = cluster - vr.target_titv = Some(target_titv) - vr.out = out_vcf - vr.tranches_file = out_tranches - vr.tranche :+= "0.1" - vr.tranche :+= "1" - vr.tranche :+= "5" - vr.tranche :+= "10" - - return vr - - } - - def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = { - var avc = new ApplyVariantCuts with StandardCommandLineGATK - avc.analysisName = "VariantQualityRecalibration" - avc.rodBind :+= new RodBind("input","VCF",snpRecal) - avc.out = output - avc.tranches_file = tranches - avc.fdr_filter_level = Some(5) - - return avc - } - - def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = { - var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster")) - var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"), - swapExt(snps,".vcf",".recal.tranch")) - var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,swapExt(recal.out,".vcf",".tranched.vcf")) - - var cmds: List[CommandLineGATK] = Nil - cmds :+= clust - cmds :+= recal - cmds :+= cut - - return cmds - } - - def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = { - var ga = new GenomicAnnotator with StandardCommandLineGATK - ga.analysisName = "GenomicAnnotator" - ga.variantVCF = snps - ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile) - ga.rodToIntervalTrackName = "variant" - ga.out = outputVCF - - return ga - } - - def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = { - var commands : List[CommandLineGATK] = Nil - var dir = "" - - if ( output.getParent != null ) { - dir = output.getParent+"/" - } - - var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf") - var ug = StandardUnifiedGenotyper(bams, raw_snp) - - commands :+= ug - - var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf") - var ig = StandardIndelCalls(bams,raw_indel) - - commands ++= ig - - var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") - var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp) - - commands :+= iFilt - - var annoSNP = prefilt_snp - if ( refGene != null ) { - annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") - var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) - - commands :+= annotate - } - - var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output) - - commands ++= recal - - return commands - } - - def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = { - var commands : List[CommandLineGATK] = Nil - var dir = "" - - if ( recalOut.getParent != null ) { - dir = recalOut.getParent+"/" - } - - var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf") - var ug = StandardUnifiedGenotyper(bams, raw_snp) - - commands :+= ug - - var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf") - var ig = StandardIndelCalls(bams,raw_indel) - - commands ++= ig - - var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") - var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp) - - commands :+= iFilt - - var annoSNP = prefilt_snp - if ( refGene != null ) { - annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") - var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) - - commands :+= annotate - } - - var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut) - - commands ++= recal - - var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut) - - commands :+= handFilt - - return commands - } - - class VCF2Mask extends CommandLineFunction { - @Input(doc="the indel vcf") var indel_vcf : File = _ - @Argument(doc="the window size") var win_size : Int = 2 - @Output(doc="the mask bed") var out_mask : File = _ - - def commandLine = { "grep PASS %s | awk '{print $1,$2-%d,$2+%d}' > %s".format(indel_vcf.getAbsolutePath,win_size,win_size,out_mask.getAbsolutePath) } - } - - def IndelVCF2Mask(vcf: File, size: Int) : VCF2Mask = { - var masker: VCF2Mask = new VCF2Mask() - masker.indel_vcf = vcf - masker.win_size = size - masker.out_mask = swapExt(vcf,".vcf",".indel_mask.bed") - - return masker - } - - def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineFunction] = { - var commands : List[CommandLineFunction] = Nil - var dir = "" - - if ( recalOut.getParent != null ) { - dir = recalOut.getParent+"/" - } - - var raw_snp = new File(dir+vc.attributes.getProject.getName+".raw_snps.vcf") - var ug = StandardUnifiedGenotyper(bams, raw_snp) - - commands :+= ug - - var raw_indel = indelOut - var ig = StandardIndelCalls(bams,raw_indel) - - var indel_mask : VCF2Mask = IndelVCF2Mask(indelOut,5) - - commands ++= ig - commands :+= indel_mask - - var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") - var iFilt = StandardFilterAtIndels(raw_snp,indel_mask.out_mask,prefilt_snp) - - commands :+= iFilt - - var annoSNP = prefilt_snp - if ( refGene != null ) { - annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") - var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) - - commands :+= annotate - } - - var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut) - - commands ++= recal - - var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut) - - commands :+= handFilt - - return commands - } - - def swapExt(file: File, oldExtension: String, newExtension: String) = - new File(file.getName.stripSuffix(oldExtension) + newExtension) - -} - -object VariantCalling { - - /** - * @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line - * @Doc - Constructs a UG object based on the settings in the header of the input VCF - * @Returns an instance of the UG object with all information propagated - */ - def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = { - var myUG : UnifiedGenotyper = new UnifiedGenotyper - var xrl : XReadLines = new XReadLines(vcf) - val header_limit = 1000 - var line = "" - var line_no = 1 - while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) { - line = xrl.next - line_no += 1 - } - var ugLine : String = line - if ( ! ugLine.startsWith("##UnifiedGenotyper") ) { - return myUG // nothing to add - } - - var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ") - var ugMap = new HashMap[String,String] - def addEntry( s : String ) : Unit = { - val sps = s.split("=") - val k = sps(0) - val v = sps(1) - val kv = (k,v) - ugMap += kv - } - ugTokens.foreach( addEntry(_) ) - - myUG.reference_sequence = new File(ugMap("reference_sequence")) - myUG.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq")) - myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble) - myUG.DBSNP = new File(ugMap("DBSNP")) - - // todo -- more args - - return myUG - - } -}