PAC sets the refseq file to optional.

Additional ipf for expanding interval lists. Either there's heavy latency on the toro -- or it's not working properly yet (e.g. the system print in the same scope as the file print outputs a line, but no file shows up on the system)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4874 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-12-18 19:04:52 +00:00
parent f0ab7b849a
commit 6db86ae0c6
3 changed files with 138 additions and 1 deletions

View File

@ -0,0 +1,83 @@
package org.broadinstitute.sting.queue.library.ipf
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline._
import java.io.{PrintStream, File}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, opType: String) extends InProcessFunction {
@Input(doc="The interval list to expand") val inList : File = in
@Input(doc="The reference sequence") val refDict : File = ref
@Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
@Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
@Output(doc="The output intervals file to write to") val outList : File = out
@Argument(doc="The output format for the intervals") val outTypeStr = opType
val output : PrintStream = new PrintStream(outList)
val parser : GenomeLocParser = new GenomeLocParser(new FastaSequenceFile(ref,true))
val xrl : XReadLines = new XReadLines(inList)
val outType = if ( outTypeStr.equals("INTERVALS") ) IntervalOutputType.INTERVALS else IntervalOutputType.BED
var previous : GenomeLoc = _
var current : GenomeLoc = _
var next : GenomeLoc = _
def run = {
// todo -- there's got to be a special view that gives prevous, current, next
asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).foreach(expand(_))
}
def expand(loc : GenomeLoc) : Unit = {
if ( start > 0 ) {
previous = current
current = next
next = loc
if ( current == null ) {
return
}
val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt)
val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt)
if ( ok(new1) ) {
System.out.println("Printing! %s".format(repr(new1)))
output.print("%s%n".format(repr(new1)))
}
if ( ok(new2) ) {
System.out.println("Printing! %s".format(repr(new2)))
output.print("%s%n".format(repr(new2)))
}
previous = current
} else {
output.print("%s%n".format(repr(parser.createGenomeLoc(loc.getContig,loc.getStart-sizeInt,loc.getStop+sizeInt))))
}
}
def ok( loc : GenomeLoc ) : Boolean = {
//System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
( next == null || loc.distance(next) >= start) && (previous == null || loc.distance(previous) >= start)
}
def repr(loc : GenomeLoc) : String = {
if ( loc == null ) return "null"
if ( outType == IntervalOutputType.INTERVALS ) {
return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
} else if ( outType == IntervalOutputType.BED ) {
return "%s\t%d\t%d".format(loc.getContig,loc.getStart-1,loc.getStop)
} else {
return "FORMAT?"
}
}
def parseGenomeInterval( s : String ) : GenomeLoc = {
val sp = s.split("\\s+")
if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt,sp(2).toInt)
}
object IntervalOutputType extends Enumeration("INTERVALS","BED") {
type IntervalOutputType = Value
val INTERVALS,BED = Value
}
}

View File

@ -0,0 +1,54 @@
package org.broadinstitute.sting.queue.library.ipf.intervals
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline._
import java.io.{PrintStream, File}
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
import org.broadinstitute.sting.utils.text.XReadLines
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
abstract class IntersectIntervals(iVals: List[File], outFile: File) extends InProcessFunction {
/* @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
@Output(doc="Output interval file to which to write") val output : File = outFile
@Argument(doc="List of contigs for the interval list (defaults to human b37 autosome); IN ORDER")
var contigs : List[String] = (new Range(1,22)).map(u => "%d".format(u)) ::: List("X","Y","MT")
@Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
val out : PrintStream = new PrintStream(output)
def run = {
var dict : SAMSequenceDictionary = new SAMSequenceDictionary
contigs.map( u => new SAMSequenceRecord(u, Integer.MAX_VALUE ) ).foreach(u => dict.addSequence(u))
val parser : GenomeLocParser = new GenomeLocParser(dict)
val locList : IntervalIterator = new IntervalIterator(intervals,parser,assumeSorted)
}
class IntervalIterator(ivals : List[File], parser: GenomeLocParser, sorted: Boolean) extends Iterable[GenomeLoc] {
val readers : List[XReadLinesBuffer] = ivals.map(f => new XReadLinesBuffer(new XReadLines(f), parser))
val filesAreSorted : Boolean = sorted
def hasNext = ! readers.filter( _.hasNext ).isEmpty
def next = if ( assumeSorted ) nextElement else nextSortedElement
def nextElement =
}
class XReadLinesBuffer( x: XReadLines, p: GenomeLocParser) extends Iterable[GenomeLoc] {
val xrl: XReadLines = x
val parser : GenomeLocParser = p
var current : GenomeLoc = _
def hasNext : Boolean = curLine != null || x.hasNext
def next : String = {
if ( curLine == null && x.hasNext ) curLine = x.next
val retLine = curLine
if ( x.hasNext ) curLine = x.next else curLine = null
retLine
}
} */
}

View File

@ -15,7 +15,7 @@ class PipelineArgumentCollection {
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
var trigger: File = _
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable",required=false)
var refseqTable: File = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)