Graph structure must be known at compile time. Removing GroupIntervals until a future point where in-process-functions can predict their output based on inputs [though this is probably forever: the inputs may not exist at compile time!]

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4886 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-12-20 21:22:58 +00:00
parent 61d5daa65c
commit fc33901810
2 changed files with 74 additions and 82 deletions

View File

@@ -1,6 +1,7 @@
import org.broadinstitute.sting.commandline.ArgumentCollection
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals
import org.broadinstitute.sting.queue.library.ipf.intervals.GroupIntervals
import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.utils.text.XReadLines
@@ -14,52 +15,91 @@ class expanded_targets extends QScript {
def script = {
val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => {
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"INTERVALS")
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"TSV","INTERVALS")
})
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.jarFile = args.gatkJar
}
val userDir = System.getProperty("user.dir")
addAll(intervalExpands)
val callFiles: List[File] = intervalExpands.map(_.outList).map(makeCalls(_,20))
val cleanIntervals : ExpandIntervals = new ExpandIntervals(args.projectIntervals,1,210,new File(userDir+"/"+args.projectName+"_expanded_full.interval_list"),args.projectRef,"TSV","INTERVALS")
add(cleanIntervals)
val uncleanBams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u))
val realign : List[RealignerTargetCreator] = uncleanBams.map(u => {
var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs
rtc.out = swapExt(userDir,u,".bam",".expanded.targets.txt")
rtc.input_file :+= u.getAbsoluteFile
rtc.intervals :+= cleanIntervals.outList
rtc
})
val clean : List[IndelRealigner] = realign.map( u => {
var clean : IndelRealigner = new IndelRealigner with GATKArgs
clean.targetIntervals = u.out
clean.input_file = u.input_file
clean.memoryLimit = Some(4)
clean.out = new File(userDir+"/"+swapExt(u.out,".bam",".expanded.targets.bam").getName)
clean
})
addAll(realign)
addAll(clean)
val callFiles: List[File] = intervalExpands.map(u => makeCalls(u.outList,20,clean.map(h => h.out)))
}
def makeCalls(iList: File, scatterNo: Int): List[File] = {
var scatters : List[(File,File)] = _
def makeCalls(iList: File, scatterNo: Int, bams: List[File]): File = {
var scatters : GroupIntervals = new GroupIntervals(iList,20,true,Some(System.getProperty("user.dir")))
var filtered : List[(File,File)] = scatters.outputList.zipWithIndex.map(v => reallyMakeCalls(v._1,bams,v._2))
var gatherStandard : VcfGatherFunction = new VcfGatherFunction
gatherStandard.gatherParts = filtered.map(u => u._1)
gatherStandard.originalOutput = swapExt(iList,".interval_list",".filtered.calls.vcf")
var gatherHiseq : VcfGatherFunction = new VcfGatherFunction
gatherHiseq.gatherParts = filtered.map(u => u._2)
gatherHiseq.originalOutput = swapExt(iList,".interval_list",".filtered.hiseq.vcf")
var eval : VariantEval = new VariantEval
eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out)
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf"))
eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out)
eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf"))
eval.out = swapExt(iList,".interval_list",".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV)
}
def makeCalls(iList: File) : File = {
var bams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u))
add(scatters)
add(gatherStandard)
add(gatherHiseq)
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.intervals = List(iList)
this.jarFile = args.gatkJar
this.memoryLimit = Some(6)
}
var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs
rtc.out = swapExt(iList,".interval_list",".targets.txt")
rtc.input_file = bams
var clean : IndelRealigner = new IndelRealigner with GATKArgs
clean.targetIntervals = rtc.out
clean.out = swapExt(iList,".interval_list",".cleaned.bam")
clean.input_file = bams
clean.maxReads = Some(100000)
var eval : VariantEval = new VariantEval with GATKArgs
eval.rodBind :+= new RodBind("evalInterval","vcf",gatherStandard.originalOutput)
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf"))
eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",gatherHiseq.originalOutput)
eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf"))
eval.out = swapExt(iList,".interval_list",".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV)
eval.out
}
def reallyMakeCalls(iList: File, bams : List[File], scatterNo: Int) : (File,File) = {
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.jarFile = args.gatkJar
this.intervals :+= iList
}
var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
call.scatterCount = 4
call.input_file = List(clean.out)
call.out = swapExt(iList,".interval_list",".raw.vcf")
call.input_file = bams
call.out = swapExt(iList,".interval_list",".scatter%d.raw.vcf".format(scatterNo))
call.trig_emit_conf = Some(0.0)
call.rodBind :+= new RodBind("trigger","vcf",thisTrigger)
var filter : VariantFiltration = new VariantFiltration with GATKArgs
@@ -68,16 +108,16 @@ class expanded_targets extends QScript {
filter.filterName :+= "LowQualByDepth"
filter.filterExpression :+= "\"SB>-0.10\""
filter.filterName :+= "HighStrandBias"
filter.out = swapExt(iList,".interval_list",".filtered.vcf")
filter.out = swapExt(iList,".interval_list",".scatter%d.filtered.vcf".format(scatterNo))
var callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
callHiseq.input_file = List(new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"))
callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out)
callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf")
callHiseq.out = swapExt(iList,".interval_list",".scatter%d.hiSeq.genotypes.vcf".format(scatterNo))
callHiseq.trig_emit_conf = Some(0.0)
add(rtc,clean,call,filter,callHiseq)
add(call,filter,callHiseq)
return (filter.out,callHiSeq.out)
return (filter.out,callHiseq.out)
}
}

View File

@@ -1,48 +0,0 @@
package org.broadinstitute.sting.queue.library.ipf.intervals
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
import java.io.{PrintStream, File}
/**
 * Utility for explicitly grouping the intervals of an interval list into
 * files of (at most) `num` intervals each, replaying any header (@) lines
 * into every group file.
 *
 * NOTE(review): outputList is only populated when run() executes, so the
 * grouped files are not known at Queue graph-construction time — confirm
 * this is acceptable for callers that read outputList eagerly.
 * @author chartl
 */
class GroupIntervals(iList: File, num: Int, head: Boolean, dir: Option[String]) extends InProcessFunction {
  @Input(doc="Interval list to split into groups") var intervalList : File = iList
  @Output(doc="The output interval lists created") var outputList : List[File] = Nil
  // Fixed doc text: grouped(num) produces groups of SIZE num, not num groups.
  @Argument(doc="The number of intervals per output group") var nGroups : Int = num
  @Argument(doc="The base location you want the files") var directory : Option[String] = dir
  @Argument(doc="Has header (@ lines)") var hasHeader : Boolean = head
  var written : Int = 0           // count of group files written so far; used to name outputs
  var header : List[String] = Nil // header (@) lines captured from the input, echoed into each group

  def run = {
    written = 0
    header = Nil
    // filter is lazy, so all header lines at the top of the file pass through
    // isHeader (and are captured) before the first group is materialized
    asScalaIterator(new XReadLines(intervalList)).filter(u => ! isHeader(u)).grouped(num).foreach(n => write(n))
  }

  /** Writes one group of interval lines (preceded by any captured header) to its own file. */
  def write(lines : Seq[String]) = {
    // Bug fix: when a target directory is given, join it with getName rather than
    // getAbsolutePath — concatenating two absolute paths produced a bogus location.
    val oFile : File =
      if ( dir.isEmpty ) new File(intervalList.getAbsolutePath+".group%d".format(written))
      else new File(dir.get+"/"+intervalList.getName+".group%d".format(written))
    outputList :+= oFile
    val output : PrintStream = new PrintStream(oFile)
    header.foreach( u => output.print("%s%n".format(u)))
    lines.foreach(u => output.print("%s%n".format(u)))
    output.close()
    // Bug fix: advance the counter so each group gets a distinct file name;
    // previously every group overwrote ".group0".
    written += 1
  }

  /** Returns true for header (@) lines; side effect: accumulates them for write() to replay. */
  def isHeader(s : String) : Boolean = {
    if ( hasHeader && s.startsWith("@") ) {
      header :+= s
      return true
    } else {
      return false
    }
  }
}