From 6db86ae0c6c2bda0f34988fbfd0bc3e460d68452 Mon Sep 17 00:00:00 2001 From: chartl Date: Sat, 18 Dec 2010 19:04:52 +0000 Subject: [PATCH] PAC sets the refseq file to optional. Additional ipf for expanding interval lists. Either there's heavy latency on the toro -- or it's not working properly yet (e.g. the system print in the same scope as the file print outputs a line, but no file shows up on the system) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4874 348d0f76-0448-11de-a6fe-93d51630548a --- .../ipf/intervals/ExpandIntervals.scala | 83 +++++++++++++++++++ .../ipf/intervals/IntersectIntervals.scala | 54 ++++++++++++ .../pipeline/PipelineArgumentCollection.scala | 2 +- 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100755 scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala create mode 100755 scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala new file mode 100755 index 000000000..ddc85eb59 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.queue.library.ipf + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline._ +import java.io.{PrintStream, File} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.text.XReadLines +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} + +// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created +class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, opType: String) extends InProcessFunction { + @Input(doc="The interval list to expand") val inList : File = in + @Input(doc="The reference sequence") val refDict : File = ref + @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start + @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size + @Output(doc="The output intervals file to write to") val outList : File = out + @Argument(doc="The output format for the intervals") val outTypeStr = opType + + val output : PrintStream = new PrintStream(outList) + val parser : GenomeLocParser = new GenomeLocParser(new FastaSequenceFile(ref,true)) + val xrl : XReadLines = new XReadLines(inList) + val outType = if ( outTypeStr.equals("INTERVALS") ) IntervalOutputType.INTERVALS else IntervalOutputType.BED + + var previous : GenomeLoc = _ + var current : GenomeLoc = _ + var next : GenomeLoc = _ + + def run = { + // todo -- there's got to be a special view that gives prevous, current, next + asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).foreach(expand(_)) + } + + def expand(loc : GenomeLoc) : Unit = { + if ( start > 0 ) { + previous = current + current = next + next = loc + if ( current == null ) { + return + } + val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt) + val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt) + if ( ok(new1) ) { + System.out.println("Printing! %s".format(repr(new1))) + output.print("%s%n".format(repr(new1))) + } + if ( ok(new2) ) { + System.out.println("Printing! %s".format(repr(new2))) + output.print("%s%n".format(repr(new2))) + } + previous = current + } else { + output.print("%s%n".format(repr(parser.createGenomeLoc(loc.getContig,loc.getStart-sizeInt,loc.getStop+sizeInt)))) + } + } + + def ok( loc : GenomeLoc ) : Boolean = { + //System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous))) + ( next == null || loc.distance(next) >= start) && (previous == null || loc.distance(previous) >= start) + } + + def repr(loc : GenomeLoc) : String = { + if ( loc == null ) return "null" + if ( outType == IntervalOutputType.INTERVALS ) { + return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) + } else if ( outType == IntervalOutputType.BED ) { + return "%s\t%d\t%d".format(loc.getContig,loc.getStart-1,loc.getStop) + } else { + return "FORMAT?" + } + } + + def parseGenomeInterval( s : String ) : GenomeLoc = { + val sp = s.split("\\s+") + if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt,sp(2).toInt) + } + + object IntervalOutputType extends Enumeration("INTERVALS","BED") { + type IntervalOutputType = Value + val INTERVALS,BED = Value + } +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala new file mode 100755 index 000000000..2b3c81b29 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.queue.library.ipf.intervals + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline._ +import java.io.{PrintStream, File} +import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} +import org.broadinstitute.sting.utils.text.XReadLines +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} + +abstract class IntersectIntervals(iVals: List[File], outFile: File) extends InProcessFunction { + /* @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals + @Output(doc="Output interval file to which to write") val output : File = outFile + @Argument(doc="List of contigs for the interval list (defaults to human b37 autosome); IN ORDER") + var contigs : List[String] = (new Range(1,22)).map(u => "%d".format(u)) ::: List("X","Y","MT") + @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false + + + val out : PrintStream = new PrintStream(output) + + def run = { + var dict : SAMSequenceDictionary = new SAMSequenceDictionary + contigs.map( u => new SAMSequenceRecord(u, Integer.MAX_VALUE ) ).foreach(u => dict.addSequence(u)) + val parser : GenomeLocParser = new GenomeLocParser(dict) + val locList : IntervalIterator = new IntervalIterator(intervals,parser,assumeSorted) + + } + + class IntervalIterator(ivals : List[File], parser: GenomeLocParser, sorted: Boolean) extends Iterable[GenomeLoc] { + val readers : List[XReadLinesBuffer] = ivals.map(f => new XReadLinesBuffer(new XReadLines(f), parser)) + val filesAreSorted : Boolean = sorted + + def hasNext = ! readers.filter( _.hasNext ).isEmpty + def next = if ( assumeSorted ) nextElement else nextSortedElement + def nextElement = + + + } + + class XReadLinesBuffer( x: XReadLines, p: GenomeLocParser) extends Iterable[GenomeLoc] { + val xrl: XReadLines = x + val parser : GenomeLocParser = p + var current : GenomeLoc = _ + + def hasNext : Boolean = curLine != null || x.hasNext + def next : String = { + if ( curLine == null && x.hasNext ) curLine = x.next + val retLine = curLine + if ( x.hasNext ) curLine = x.next else curLine = null + retLine + } + + } */ + +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala index e7c741cef..8d9fc68e4 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala @@ -15,7 +15,7 @@ class PipelineArgumentCollection { @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false) var trigger: File = _ - @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable") + @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false) var refseqTable: File = _ @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)