From 6db86ae0c6c2bda0f34988fbfd0bc3e460d68452 Mon Sep 17 00:00:00 2001
From: chartl <chartl@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sat, 18 Dec 2010 19:04:52 +0000
Subject: [PATCH] PAC sets the refseq file to optional.

Additional ipf for expanding interval lists. Either there's heavy latency on the toro -- or it's not working properly yet (e.g. the system print in the same scope as the file print outputs a line, but no file shows up on the system)


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4874 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../ipf/intervals/ExpandIntervals.scala       | 83 +++++++++++++++++++
 .../ipf/intervals/IntersectIntervals.scala    | 54 ++++++++++++
 .../pipeline/PipelineArgumentCollection.scala |  2 +-
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100755 scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
 create mode 100755 scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala

diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
new file mode 100755
index 000000000..ddc85eb59
--- /dev/null
+++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
@@ -0,0 +1,83 @@
+package org.broadinstitute.sting.queue.library.ipf
+
+import org.broadinstitute.sting.queue.function.InProcessFunction
+import org.broadinstitute.sting.commandline._
+import java.io.{PrintStream, File}
+import collection.JavaConversions._
+import org.broadinstitute.sting.utils.text.XReadLines
+import net.sf.picard.reference.FastaSequenceFile
+import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
+
+// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
+class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, opType: String) extends InProcessFunction {
+  @Input(doc="The interval list to expand") val inList : File = in
+  @Input(doc="The reference sequence") val refDict : File = ref
+  @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
+  @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
+  @Output(doc="The output intervals file to write to") val outList : File = out
+  @Argument(doc="The output format for the intervals") val outTypeStr = opType
+
+  val output : PrintStream = new PrintStream(outList)
+  val parser : GenomeLocParser = new GenomeLocParser(new FastaSequenceFile(ref,true))
+  val xrl : XReadLines = new XReadLines(inList)
+  val outType = if ( outTypeStr.equals("INTERVALS") ) IntervalOutputType.INTERVALS else IntervalOutputType.BED
+
+  var previous : GenomeLoc = _
+  var current : GenomeLoc = _
+  var next : GenomeLoc = _
+
+  def run = {
+    // todo -- there's got to be a special view that gives prevous, current, next
+    asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).foreach(expand(_))
+  }
+
+  def expand(loc : GenomeLoc) : Unit = {
+    if ( start > 0 ) {
+      previous = current
+      current = next
+      next = loc
+      if ( current == null ) {
+        return
+      }
+      val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt)
+      val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt)
+      if ( ok(new1) ) {
+        System.out.println("Printing! %s".format(repr(new1)))
+        output.print("%s%n".format(repr(new1)))
+      }
+      if ( ok(new2) ) {
+        System.out.println("Printing! %s".format(repr(new2)))
+        output.print("%s%n".format(repr(new2)))
+      }
+      previous = current
+    } else {
+      output.print("%s%n".format(repr(parser.createGenomeLoc(loc.getContig,loc.getStart-sizeInt,loc.getStop+sizeInt))))
+    }
+  }
+
+  def ok( loc : GenomeLoc ) : Boolean = {
+    //System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
+    ( next == null || loc.distance(next) >= start) && (previous == null || loc.distance(previous) >= start)
+  }
+
+  def repr(loc : GenomeLoc) : String = {
+    if ( loc == null ) return "null"
+    if ( outType == IntervalOutputType.INTERVALS ) {
+      return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
+    } else if ( outType == IntervalOutputType.BED ) {
+      return "%s\t%d\t%d".format(loc.getContig,loc.getStart-1,loc.getStop)
+    } else {
+      return "FORMAT?"
+    }
+  }
+
+  def parseGenomeInterval( s : String ) : GenomeLoc = {
+    val sp = s.split("\\s+")
+    if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt,sp(2).toInt)
+  }
+
+  object IntervalOutputType extends Enumeration("INTERVALS","BED") {
+    type IntervalOutputType = Value
+    val INTERVALS,BED = Value
+  }
+}
\ No newline at end of file
diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
new file mode 100755
index 000000000..2b3c81b29
--- /dev/null
+++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
@@ -0,0 +1,54 @@
+package org.broadinstitute.sting.queue.library.ipf.intervals
+
+import org.broadinstitute.sting.queue.function.InProcessFunction
+import org.broadinstitute.sting.commandline._
+import java.io.{PrintStream, File}
+import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
+import org.broadinstitute.sting.utils.text.XReadLines
+import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
+
+abstract class IntersectIntervals(iVals: List[File], outFile: File) extends InProcessFunction {
+ /* @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
+  @Output(doc="Output interval file to which to write") val output : File = outFile
+  @Argument(doc="List of contigs for the interval list (defaults to human b37 autosome); IN ORDER")
+  var contigs : List[String] = (new Range(1,22)).map(u => "%d".format(u)) ::: List("X","Y","MT")
+  @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
+
+
+  val out : PrintStream = new PrintStream(output)
+
+  def run = {
+    var dict : SAMSequenceDictionary = new SAMSequenceDictionary
+    contigs.map( u => new SAMSequenceRecord(u, Integer.MAX_VALUE ) ).foreach(u => dict.addSequence(u))
+    val parser : GenomeLocParser = new GenomeLocParser(dict)
+    val locList : IntervalIterator = new IntervalIterator(intervals,parser,assumeSorted)
+    
+  }
+
+  class IntervalIterator(ivals : List[File], parser: GenomeLocParser, sorted: Boolean) extends Iterable[GenomeLoc] {
+    val readers : List[XReadLinesBuffer] = ivals.map(f => new XReadLinesBuffer(new XReadLines(f), parser))
+    val filesAreSorted : Boolean = sorted
+
+    def hasNext = ! readers.filter( _.hasNext ).isEmpty
+    def next = if ( assumeSorted ) nextElement else nextSortedElement
+    def nextElement =
+
+
+  }
+
+  class XReadLinesBuffer( x: XReadLines, p: GenomeLocParser) extends Iterable[GenomeLoc] {
+    val xrl: XReadLines = x
+    val parser : GenomeLocParser = p
+    var current : GenomeLoc = _
+
+    def hasNext : Boolean = curLine != null || x.hasNext
+    def next : String = {
+      if ( curLine == null && x.hasNext ) curLine = x.next
+      val retLine = curLine
+      if ( x.hasNext ) curLine = x.next else curLine = null
+      retLine
+    }
+
+  } */
+
+}
\ No newline at end of file
diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala
index e7c741cef..8d9fc68e4 100755
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala
+++ b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala
@@ -15,7 +15,7 @@ class PipelineArgumentCollection {
   @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
   var trigger: File = _
 
-  @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
+  @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false)
   var refseqTable: File = _
 
   @Input(doc="path to Picard FixMateInformation.jar.  See http://picard.sourceforge.net/ .", required=false)