Expands targets by 50-bp on both sides when the expandIntervals argument is greater than 0.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5251 348d0f76-0448-11de-a6fe-93d51630548a
2011-02-16 14:47:52 +00:00 · 2011-02-16 14:47:52 +00:00 · c0a4af3809
parent 6d3b878dde
commit c0a4af3809
1 changed files with 124 additions and 139 deletions
--- a/scala/qscript/playground/FullCallingPipeline.q
+++ b/scala/qscript/playground/FullCallingPipeline.q
@ -1,4 +1,3 @@
-
 import org.broadinstitute.sting.commandline.ArgumentSource
 import org.broadinstitute.sting.datasources.pipeline.Pipeline
 import org.broadinstitute.sting.queue.extensions.gatk._
@ -6,6 +5,7 @@ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
 import org.broadinstitute.sting.queue.extensions.samtools._
 import org.broadinstitute.sting.queue.function.ListWriterFunction
 import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
+import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
 import org.broadinstitute.sting.queue.QScript
 import collection.JavaConversions._
 import org.broadinstitute.sting.utils.yaml.YamlUtils
@ -16,42 +16,27 @@ class FullCallingPipeline extends QScript {
  @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
  var yamlFile: File = _

-  @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
-  var trigger: File = _
-
-  @Input(doc="path to refseqTable (for GenomicAnnotator) if not present in the YAML", shortName="refseqTable", required=false)
-  var refseqTable: File = _
-
-  @Input(doc="path to Picard FixMateInformation.jar.  See http://picard.sourceforge.net/ .", required=false)
+  @Input(doc="path to Picard FixMateInformation.jar.  See http://picard.sourceforge.net/ .", shortName="P", required=false)
  var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")

-  @Input(doc="path to GATK jar")
+  @Input(doc="path to GATK jar", shortName="G")
  var gatkJar: File = _

-  @Input(doc="per-sample downsampling level",shortName="dcov",required=false)
-  var downsampling_coverage = 600
-
  @Input(doc="level of parallelism for IndelRealigner.  By default is set to 1.", shortName="cleanerScatter", required=false)
  var num_cleaner_scatter_jobs = 1

-  @Input(doc="level of parallelism for UnifiedGenotyper (both for Snps and Indels).   By default is set to 20.", shortName="varScatter", required=false)
+  @Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels).  By default is set to 20.", shortName="varScatter", required=false)
  var num_var_scatter_jobs = 20

+  @Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false)
+  var expandIntervals = 50
+
  @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
  var skip_cleaning = false

  @Input(doc="ADPR script", shortName ="tearScript", required=false)
  var tearScript: File = _

-  // TODO: Fix command lines that pass -bigMemQueue
-  @Argument(doc="Unused", shortName="bigMemQueue", required=false)
-  var big_mem_queue: String = _
-
-  @Argument(doc="Job queue for short run jobs (<1hr)", shortName="shortJobQueue", required=false)
-  var short_job_queue: String = _
-
-
-
  private var pipeline: Pipeline = _

  trait CommandLineGATKArgs extends CommandLineGATK {
@ -70,16 +55,10 @@ class FullCallingPipeline extends QScript {

    val projectBase: String = qscript.pipeline.getProject.getName

-    if (qscript.refseqTable != null)
-      qscript.pipeline.getProject.setRefseqTable(qscript.refseqTable)
    if (qscript.skip_cleaning) {
-
-
      endToEnd(projectBase + ".uncleaned", "recalibrated")
    } else {
-
      val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
-      /

      for ( sample <- recalibratedSamples ) {
        val sampleId = sample.getId
@ -166,140 +145,151 @@ class FullCallingPipeline extends QScript {
        samtoolsindex.jobOutputFile = new File(".queue/logs/Cleaning/%s/SamtoolsIndex.out".format(sampleId))
        samtoolsindex.bamFile = cleaned_bam
        samtoolsindex.analysisName = "index_cleaned_"+sampleId
-        samtoolsindex.jobQueue = qscript.short_job_queue
+        //samtoolsindex.jobQueue = qscript.short_job_queue
        add(samtoolsindex)
      }

-      //endToEnd(projectBase + ".cleaned", "cleaned", adprRscript, seq, expKind)
      endToEnd(projectBase + ".cleaned", "cleaned")
    }
  }

  def endToEnd(base: String, bamType: String) = {
-
    val samples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains(bamType)).toList
    val bamFiles = samples.map(_.getBamFiles.get(bamType))

-    // step through the un-indel-cleaned graph:
-    // 1a. call snps and indels
-    val snps = new UnifiedGenotyper with CommandLineGATKArgs
-    snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.out")
-    snps.analysisName = base+"_SNP_calls"
-    snps.input_file = bamFiles
-    snps.input_file = bamFiles
-    snps.group :+= "Standard"
-    snps.out = new File("SnpCalls", base+".vcf")
-    snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
-    snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
-    snps.memoryLimit = Some(6)
+    val ei : ExpandIntervals = new ExpandIntervals(qscript.pipeline.getProject.getIntervalList, 1, qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), qscript.pipeline.getProject.getReferenceFile, "INTERVALS", "INTERVALS")
+    ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")

-    snps.scatterCount = qscript.num_var_scatter_jobs
-        snps.setupScatterFunction = {
-          case scatter: ScatterFunction =>
-            scatter.commandDirectory = new File("SnpCalls/ScatterGather")
-            scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
-        }
-        snps.setupCloneFunction = {
-          case (clone: CloneFunction, index: Int) =>
-            clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
-            clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
-        }
-        snps.setupGatherFunction = {
-          case (gather: GatherFunction, source: ArgumentSource) =>
-            gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
-            gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
-        }
+    trait ExpandedIntervals extends CommandLineGATK {
+      if (qscript.expandIntervals > 0) {
+        this.intervals :+= ei.outList

-    val indels = new UnifiedGenotyper with CommandLineGATKArgs
-    indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.out")
-    indels.analysisName = base+"_Indel_calls"
-    indels.input_file = bamFiles
-    indels.input_file = bamFiles
-    indels.group :+= "Standard"
-    indels.out = new File("IndelCalls", base+".vcf")
-    indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
-    indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
+        add(ei)
+      }
+    }
+
+    // Call indels
+    val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
+    indels.analysisName = base + "_indels"
+    indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out")
    indels.memoryLimit = Some(6)
+    indels.downsample_to_coverage = Some(600)
    indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
-
-
+    indels.input_file = bamFiles
+    indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
+    indels.out = new File("IndelCalls", base+".indels.vcf")

    indels.scatterCount = qscript.num_var_scatter_jobs
-        indels.setupScatterFunction = {
-          case scatter: ScatterFunction =>
-            scatter.commandDirectory = new File("IndelCalls/ScatterGather")
-            scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
-        }
-        indels.setupCloneFunction = {
-          case (clone: CloneFunction, index: Int) =>
-            clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
-            clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
-        }
-        indels.setupGatherFunction = {
-          case (gather: GatherFunction, source: ArgumentSource) =>
-            gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
-            gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
-        }
+    indels.setupScatterFunction = {
+      case scatter: ScatterFunction =>
+        scatter.commandDirectory = new File("IndelCalls/ScatterGather")
+        scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
+    }
+    indels.setupCloneFunction = {
+      case (clone: CloneFunction, index: Int) =>
+        clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
+        clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
+    }
+    indels.setupGatherFunction = {
+      case (gather: GatherFunction, source: ArgumentSource) =>
+        gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
+        gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
+    }

+    // Filter indels
+    val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
+    filteredIndels.analysisName = base + "_filteredIndels"
+    filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out")
+    filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter")
+    filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"")
+    filteredIndels.variantVCF = indels.out
+    filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf")

-    // 1b. genomically annotate SNPs -- no longer slow
-    val annotated = new GenomicAnnotator with CommandLineGATKArgs
-    annotated.jobOutputFile = new File(".queue/logs/SNPCalling/GenomicAnnotator.out")
-    annotated.rodBind :+= RodBind("variant", "VCF", snps.out)
-    annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable)
-    annotated.out = swapExt("SnpCalls",snps.out,".vcf",".annotated.vcf")
+    // Call snps
+    val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
+    snps.analysisName = base+"_snps"
+    snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
+    snps.memoryLimit = Some(6)
+    snps.downsample_to_coverage = Some(600)
+    snps.input_file = bamFiles
+    snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
+    snps.out = new File("SnpCalls", base+".snps.vcf")
+
+    snps.scatterCount = qscript.num_var_scatter_jobs
+    snps.setupScatterFunction = {
+      case scatter: ScatterFunction =>
+        scatter.commandDirectory = new File("SnpCalls/ScatterGather")
+        scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
+    }
+    snps.setupCloneFunction = {
+      case (clone: CloneFunction, index: Int) =>
+        clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
+        clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
+    }
+    snps.setupGatherFunction = {
+      case (gather: GatherFunction, source: ArgumentSource) =>
+        gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
+        gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
+    }
+
+    // Filter snps
+    val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
+    filteredSNPs.analysisName = base+"_filteredSNPs"
+    filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out")
+    filteredSNPs.filterName ++= List("SNPSBFilter","SNPQDFilter","SNPHRunFilter")
+    filteredSNPs.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"")
+    filteredSNPs.clusterWindowSize = Some(10)
+    filteredSNPs.clusterSize = Some(3)
+    filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out)
+    filteredSNPs.variantVCF = snps.out
+    filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf")
+
+    // Combine indels and snps into one VCF
+    val combineAll = new CombineVariants with CommandLineGATKArgs with ExpandedIntervals
+    combineAll.analysisName = base + "_combineAll"
+    combineAll.jobOutputFile = new File(".queue/logs/Combined/CombineVariants.out")
+    combineAll.variantMergeOptions = Option(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
+    combineAll.rod_priority_list = "Indels,SNPs"
+    combineAll.rodBind :+= RodBind("Indels", "VCF", filteredIndels.out)
+    combineAll.rodBind :+= RodBind("SNPs", "VCF", filteredSNPs.out)
+    combineAll.out = new File("CombinedCalls", base + ".allVariants.filtered.vcf")
+
+    // Annotate variants
+    val annotated = new GenomicAnnotator with CommandLineGATKArgs with ExpandedIntervals
+    annotated.analysisName = base+"_annotated"
+    annotated.jobOutputFile = new File(".queue/logs/Combined/GenomicAnnotator.out")
    annotated.rodToIntervalTrackName = "variant"
-    annotated.analysisName = base+"_GenomicAnnotator"
+    annotated.rodBind :+= RodBind("variant", "VCF", combineAll.out)
+    annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.pipeline.getProject.getRefseqTable)
+    annotated.out = new File(base + ".snps_and_indels.filtered.annotated.vcf")
+    //annotated.out = swapExt("VariantCalls", combineAll.out, ".vcf", ".annotated.vcf")

-    // 2. hand filter with standard filter;  mask indels; filter snp clusters
-    val handFilter = new VariantFiltration with CommandLineGATKArgs
-    handFilter.jobOutputFile = new File(".queue/logs/SNPCalling/HandFilter.out")
-    handFilter.variantVCF = annotated.out
-    handFilter.rodBind :+= RodBind("mask", "VCF", indels.out)
-    handFilter.maskname = "IndelSite"
-    handFilter.clusterWindowSize = Some(10)
-    handfilter.clusterSize = Some(3)
-    handFilter.filterName ++= List("StrandBias","QualByDepth","HomopolymerRun")
-    handFilter.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"")
-    handFilter.out = swapExt("SnpCalls",annotated.out,".vcf",".handfiltered.vcf")
-    handFilter.analysisName = base+"_HandFilter"
-
-
-    // 3. Variant eval the cut and the hand-filtered vcf files
-    val smallEval = new VariantEval with CommandLineGATKArgs
-    smallEval.jobOutputFile = new File(".queue/logs/SNPCalling/VariantEval.out")
-    smallEval.rodBind :+= RodBind("eval", "VCF", handFilter.out)
-    smallEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
-    smallEval.stratificationModule ++= List("EvalRod", "Novelty")
-    smallEval.out = new File("SnpCalls", base+".gatkreport")
+    // Variant eval the cut and the hand-filtered vcf files
+    val smallEval = new VariantEval with CommandLineGATKArgs with ExpandedIntervals
    smallEval.analysisName = base+"_VariantEval"
+    smallEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.out")
    smallEval.noST = true
    smallEval.noEV = true
+    smallEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
+    smallEval.stratificationModule ++= List("EvalRod", "Novelty")
    smallEval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
+    smallEval.rodBind :+= RodBind("eval", "VCF", annotated.out)
+    smallEval.out = swapExt(annotated.out, ".vcf", ".eval")
+    //smallEval.out = new File("VariantCalls", base+".gatkreport")

-    // 4. Combine indels and Snps into one VCF
-    val combineAll = new CombineVariants with CommandLineGATKArgs
-    combineAll.jobOutputFile = new File(".queue/logs/Overall/CombineVariants.out")
-    combineAll.rod_priority_list = List("Indels", "SNPs")
-    combineAll.rodBind :+= RodBind("SNPs", "VCF", handFilter.out)
-    combineAll.rodBind :+= RodBind("Indels", "VCF", indels.out)
-    combineAll.variantMergeOptions = Option(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
-    combineAll.analysisName = base + "_CombineCallsets"
-    combineAll.out = new File(base + "allVariants.vcf")
-
-    // 5. Make the bam list
-    val listOfBams =  new File(base +".BamFiles.list")
+    // Make the bam list
+    val listOfBams =  new File("Resources", base +".BamFiles.list")

    val writeBamList = new ListWriterFunction
+    writeBamList.analysisName = base + "_BamList"
+    writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out")
    writeBamList.inputFiles = bamFiles
    writeBamList.listFile = listOfBams
-    writeBamList.analysisName = base + "_BamList"
-    writeBamList.jobOutputFile = new File(".queue/logs/SNPCalling/bamlist.out")

-    // 6. Run the ADPR and make pretty stuff
+    add(indels, filteredIndels, snps, filteredSNPs, combineAll, annotated, smallEval, writeBamList)

-    add(snps, indels, annotated,masker,handFilter,eval,writeBamList)
-    if (qscript.tearScript != null){
+    // Run the ADPR and make pretty stuff
+    if (qscript.tearScript != null) {
      class rCommand extends CommandLineFunction{
        @Input(doc="R script")
        var script: File = _
@ -315,20 +305,15 @@ class FullCallingPipeline extends QScript {
      }

     val adpr = new rCommand
+     adpr.analysisName = base + "_ADPR"
     adpr.bamlist = listOfBams
     adpr.yaml = qscript.yamlFile.getAbsoluteFile
     adpr.script = qscript.tearScript
-     adpr.evalroot = eval.output
-     adpr.jobOutputFile = new File(".queue/logs/SNPCalling/adpr.out")
-     adpr.tearsheet = new File("SnpCalls", base + ".tearsheet.pdf")
-     adpr.analysisName = base + "_ADPR"
-
+     adpr.evalroot = smallEval.out
+     adpr.jobOutputFile = new File(".queue/logs/Overall/ADPR.out")
+     adpr.tearsheet = new File("VariantCalls", base + ".tearsheet.pdf")

     add(adpr)
    }
-
-
-
-
  }
 }