Fix expand intervals to do the right thing:
- No more duplicate intervals - Truncation at intervals that already exist, e.g. exists: |--------| |-------| new: |---------| fixed: |-----| note that weird instances like: exists: |-| |-| |-| new: |---------------------| fixed: |----| e.g. you're truncated to the nearest interval on whatever side. In general many behaviors could happen in this instance, this is the one currently implemented. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5323 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fd5d1f9cfc
commit
b089d35b21
|
|
@ -1,5 +1,6 @@
|
|||
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFExtractIntervals, VCFExtractSamples, VCFSimpleMerge, VCFExtractSites}
|
||||
import org.broadinstitute.sting.queue.library.ipf.SortByRef
|
||||
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import collection.JavaConversions._
|
||||
|
||||
|
|
@ -13,6 +14,10 @@ class QTools extends QScript {
|
|||
@Argument(doc="reference file",shortName="ref",required=false) var ref : File = _
|
||||
@Argument(doc="The samples to extract",shortName="sm",required=false) var samples : String = _
|
||||
@Argument(doc="Keep filtered sites when merging or extracting?",shortName="kf",required=false) var keepFilters : Boolean = false
|
||||
@Argument(doc="Input interval list (not used with VCF tools)",shortName="il",required=false) var intervalList : File = _
|
||||
@Argument(doc="Reference fai file",shortName="fai",required=false) var fai : File = _
|
||||
@Argument(doc="interval list expand start",shortName="il_start",required=false) var ilStart : Int = 1
|
||||
@Argument(doc="interval list expand size",shortName="il_size",required=false) var ilSize : Int = 50
|
||||
// todo -- additional arguments or argument collection
|
||||
|
||||
def script = {
|
||||
|
|
@ -35,6 +40,10 @@ class QTools extends QScript {
|
|||
if ( qtool.equals("SortByRef") ) {
|
||||
runSortByRef
|
||||
}
|
||||
|
||||
if ( qtool.equals("ExpandTargets") ) {
|
||||
runExpandTargets
|
||||
}
|
||||
}
|
||||
|
||||
def runVCFExtractSites = {
|
||||
|
|
@ -65,4 +74,9 @@ class QTools extends QScript {
|
|||
var sbr : SortByRef = new SortByRef(inVCF,new File(ref.getAbsolutePath+".fai"),output)
|
||||
add(sbr)
|
||||
}
|
||||
|
||||
def runExpandTargets = {
|
||||
var ets : ExpandIntervals = new ExpandIntervals(intervalList,ilStart,ilSize,output,fai,"INTERVALS","INTERVALS")
|
||||
add(ets)
|
||||
}
|
||||
}
|
||||
|
|
@ -7,6 +7,7 @@ import collection.JavaConversions._
|
|||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
import collection.immutable.TreeSet
|
||||
|
||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
|
||||
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
|
||||
|
|
@ -30,45 +31,66 @@ class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ip
|
|||
var first : Boolean = true
|
||||
var lastTwo : (GenomeLoc,GenomeLoc) = _
|
||||
|
||||
var intervalCache : TreeSet[GenomeLoc] = _
|
||||
val LINES_TO_CACHE : Int = 1000
|
||||
|
||||
def run = {
|
||||
first = true
|
||||
lastTwo = null
|
||||
output = new PrintStream(outList)
|
||||
intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
|
||||
def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
|
||||
})
|
||||
parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
|
||||
xrl = new XReadLines(inList)
|
||||
offsetIn = if (isBed(inType)) 1 else 0
|
||||
offsetOut = if (isBed(outType)) 1 else 0
|
||||
asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).sliding(3).map(a => a.toList).foreach(a => expand(a(0),a(1),a(2)))
|
||||
expand(lastTwo._1,lastTwo._2,null)
|
||||
offsetOut = if( isBed(outType)) 1 else 0
|
||||
var line : String = xrl.next
|
||||
while ( line.startsWith("@") ) {
|
||||
line = xrl.next
|
||||
}
|
||||
var prevLoc: GenomeLoc = null
|
||||
var curLoc: GenomeLoc = null
|
||||
var nextLoc : GenomeLoc = parseGenomeInterval(line)
|
||||
var linesProcessed : Int = 1
|
||||
while ( prevLoc != null || curLoc != null || nextLoc != null ) {
|
||||
prevLoc = curLoc
|
||||
curLoc = nextLoc
|
||||
nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
|
||||
if ( curLoc != null ) {
|
||||
intervalCache += refine(expandLeft(curLoc),prevLoc)
|
||||
intervalCache += refine(expandRight(curLoc),nextLoc)
|
||||
}
|
||||
linesProcessed += 1
|
||||
if ( linesProcessed % LINES_TO_CACHE == 0 ) {
|
||||
val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
|
||||
intervalCache = intervalCache -- toPrint
|
||||
toPrint.foreach(u => output.print("%s%n".format(repr(u))))
|
||||
}
|
||||
//System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
|
||||
}
|
||||
|
||||
intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
|
||||
|
||||
output.close()
|
||||
}
|
||||
|
||||
def expand(previous: GenomeLoc, current: GenomeLoc, next: GenomeLoc) : Unit = {
|
||||
if ( first ) {
|
||||
first = false
|
||||
expand(null,previous,current)
|
||||
}
|
||||
lastTwo = (current,next)
|
||||
|
||||
if ( start > 0 ) {
|
||||
val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt)
|
||||
val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt)
|
||||
if ( ok(new1,previous,next) ) {
|
||||
//System.out.println("Printing! %s".format(repr(new1)))
|
||||
output.print("%s%n".format(repr(new1)))
|
||||
}
|
||||
if ( ok(new2,previous,next) ) {
|
||||
//System.out.println("Printing! %s".format(repr(new2)))
|
||||
output.print("%s%n".format(repr(new2)))
|
||||
}
|
||||
} else {
|
||||
output.print("%s%n".format(repr(parser.createGenomeLoc(current.getContig,current.getStart-sizeInt,current.getStop+sizeInt))))
|
||||
}
|
||||
def expandLeft(g: GenomeLoc) : GenomeLoc = {
|
||||
parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
|
||||
}
|
||||
|
||||
def ok( loc : GenomeLoc, prev: GenomeLoc, next: GenomeLoc ) : Boolean = {
|
||||
//System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
|
||||
( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start) && (!loc.equals(prev) && !loc.equals(next))
|
||||
def expandRight(g: GenomeLoc) : GenomeLoc = {
|
||||
parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
|
||||
}
|
||||
|
||||
def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
|
||||
if ( borderG == null || ! newG.overlapsP(borderG) ) {
|
||||
newG
|
||||
} else {
|
||||
if ( newG.getStart < borderG.getStart ) {
|
||||
parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
|
||||
} else {
|
||||
parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def repr(loc : GenomeLoc) : String = {
|
||||
|
|
|
|||
Loading…
Reference in New Issue