Fix expand intervals to do the right thing:

- No more duplicate intervals
 - Truncation at intervals that already exist, e.g.

exists:      |--------|           |-------|
new:               |---------|
fixed:                 |-----|

note that weird instances like:

exists:           |-|        |-|                  |-|
new:           |---------------------|
fixed:                          |----|

e.g. you're truncated to the nearest interval on whatever side. In general many behaviors could happen in this instance, this is the one currently implemented.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5323 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-02-26 04:19:01 +00:00
parent fd5d1f9cfc
commit b089d35b21
2 changed files with 65 additions and 29 deletions

View File

@ -1,5 +1,6 @@
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFExtractIntervals, VCFExtractSamples, VCFSimpleMerge, VCFExtractSites}
import org.broadinstitute.sting.queue.library.ipf.SortByRef
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
@ -13,6 +14,10 @@ class QTools extends QScript {
@Argument(doc="reference file",shortName="ref",required=false) var ref : File = _
@Argument(doc="The samples to extract",shortName="sm",required=false) var samples : String = _
@Argument(doc="Keep filtered sites when merging or extracting?",shortName="kf",required=false) var keepFilters : Boolean = false
@Argument(doc="Input interval list (not used with VCF tools)",shortName="il",required=false) var intervalList : File = _
@Argument(doc="Reference fai file",shortName="fai",required=false) var fai : File = _
@Argument(doc="interval list expand start",shortName="il_start",required=false) var ilStart : Int = 1
@Argument(doc="interval list expand size",shortName="il_size",required=false) var ilSize : Int = 50
// todo -- additional arguments or argument collection
def script = {
@ -35,6 +40,10 @@ class QTools extends QScript {
if ( qtool.equals("SortByRef") ) {
runSortByRef
}
if ( qtool.equals("ExpandTargets") ) {
runExpandTargets
}
}
def runVCFExtractSites = {
@ -65,4 +74,9 @@ class QTools extends QScript {
var sbr : SortByRef = new SortByRef(inVCF,new File(ref.getAbsolutePath+".fai"),output)
add(sbr)
}
def runExpandTargets = {
var ets : ExpandIntervals = new ExpandIntervals(intervalList,ilStart,ilSize,output,fai,"INTERVALS","INTERVALS")
add(ets)
}
}

View File

@ -7,6 +7,7 @@ import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import collection.immutable.TreeSet
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
@ -30,45 +31,66 @@ class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ip
var first : Boolean = true
var lastTwo : (GenomeLoc,GenomeLoc) = _
var intervalCache : TreeSet[GenomeLoc] = _
val LINES_TO_CACHE : Int = 1000
def run = {
first = true
lastTwo = null
output = new PrintStream(outList)
intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
})
parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
xrl = new XReadLines(inList)
offsetIn = if (isBed(inType)) 1 else 0
offsetOut = if (isBed(outType)) 1 else 0
asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).sliding(3).map(a => a.toList).foreach(a => expand(a(0),a(1),a(2)))
expand(lastTwo._1,lastTwo._2,null)
offsetOut = if( isBed(outType)) 1 else 0
var line : String = xrl.next
while ( line.startsWith("@") ) {
line = xrl.next
}
var prevLoc: GenomeLoc = null
var curLoc: GenomeLoc = null
var nextLoc : GenomeLoc = parseGenomeInterval(line)
var linesProcessed : Int = 1
while ( prevLoc != null || curLoc != null || nextLoc != null ) {
prevLoc = curLoc
curLoc = nextLoc
nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
if ( curLoc != null ) {
intervalCache += refine(expandLeft(curLoc),prevLoc)
intervalCache += refine(expandRight(curLoc),nextLoc)
}
linesProcessed += 1
if ( linesProcessed % LINES_TO_CACHE == 0 ) {
val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
intervalCache = intervalCache -- toPrint
toPrint.foreach(u => output.print("%s%n".format(repr(u))))
}
//System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
}
intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
output.close()
}
def expand(previous: GenomeLoc, current: GenomeLoc, next: GenomeLoc) : Unit = {
if ( first ) {
first = false
expand(null,previous,current)
}
lastTwo = (current,next)
if ( start > 0 ) {
val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt)
val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt)
if ( ok(new1,previous,next) ) {
//System.out.println("Printing! %s".format(repr(new1)))
output.print("%s%n".format(repr(new1)))
}
if ( ok(new2,previous,next) ) {
//System.out.println("Printing! %s".format(repr(new2)))
output.print("%s%n".format(repr(new2)))
}
} else {
output.print("%s%n".format(repr(parser.createGenomeLoc(current.getContig,current.getStart-sizeInt,current.getStop+sizeInt))))
}
def expandLeft(g: GenomeLoc) : GenomeLoc = {
parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
}
def ok( loc : GenomeLoc, prev: GenomeLoc, next: GenomeLoc ) : Boolean = {
//System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start) && (!loc.equals(prev) && !loc.equals(next))
def expandRight(g: GenomeLoc) : GenomeLoc = {
parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
}
def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
if ( borderG == null || ! newG.overlapsP(borderG) ) {
newG
} else {
if ( newG.getStart < borderG.getStart ) {
parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
} else {
parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
}
}
}
def repr(loc : GenomeLoc) : String = {