EXTREME interval processing. Still undergoing testing.
+ GroupIntervals allows user-defined scattering (e.g. take an interval list file, split it into k smaller interval list files by number of lines) + ExpandIntervals expands the intervals, either by widening them, or allowing the definition for nearby intervals (e.g. flanks starting 1bp before and after, ending 10bp after that) + IntersectIntervals takes n interval lists, writes 1 interval list that is the n-way intersection of all of them git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4885 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a829284d84
commit
61d5daa65c
|
|
@ -1,21 +1,83 @@
|
|||
import org.broadinstitute.sting.commandline.ArgumentCollection
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals
|
||||
import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions._
|
||||
|
||||
class expanded_targets extends QScript {
|
||||
@ArgumentCollection var args : PipelineArgumentCollection = new PipelineArgumentCollection
|
||||
@Argument(shortName="bait",doc="The list of baits associated with the target list",required=false) var baitFile : File = _
|
||||
@Argument(shortName="thisTrigger",doc="The trigger track to use",required=false) var thisTrigger : File = new File("/humgen/gsa-hphome1/chartl/projects/exome/expanded/triggers/joined.omni.hiseq.vcf")
|
||||
|
||||
def script = {
|
||||
|
||||
val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => {
|
||||
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"BED")
|
||||
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"INTERVALS")
|
||||
})
|
||||
|
||||
addAll(intervalExpands)
|
||||
|
||||
// intervalExpands.map(_.outList).foreach(makeCalls(_))
|
||||
val callFiles: List[File] = intervalExpands.map(_.outList).map(makeCalls(_,20))
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
def makeCalls(iList: File, scatterNo: Int): List[File] = {
|
||||
var scatters : List[(File,File)] = _
|
||||
|
||||
var eval : VariantEval = new VariantEval
|
||||
eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out)
|
||||
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf"))
|
||||
eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out)
|
||||
eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf"))
|
||||
eval.out = swapExt(iList,".interval_list",".eval")
|
||||
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV)
|
||||
|
||||
}
|
||||
|
||||
def makeCalls(iList: File) : File = {
|
||||
|
||||
var bams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u))
|
||||
|
||||
trait GATKArgs extends CommandLineGATK {
|
||||
this.reference_sequence = args.projectRef
|
||||
this.DBSNP = args.projectDBSNP
|
||||
this.intervals = List(iList)
|
||||
this.jarFile = args.gatkJar
|
||||
this.memoryLimit = Some(6)
|
||||
}
|
||||
|
||||
var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs
|
||||
rtc.out = swapExt(iList,".interval_list",".targets.txt")
|
||||
rtc.input_file = bams
|
||||
var clean : IndelRealigner = new IndelRealigner with GATKArgs
|
||||
clean.targetIntervals = rtc.out
|
||||
clean.out = swapExt(iList,".interval_list",".cleaned.bam")
|
||||
clean.input_file = bams
|
||||
clean.maxReads = Some(100000)
|
||||
var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
|
||||
call.scatterCount = 4
|
||||
call.input_file = List(clean.out)
|
||||
call.out = swapExt(iList,".interval_list",".raw.vcf")
|
||||
call.trig_emit_conf = Some(0.0)
|
||||
call.rodBind :+= new RodBind("trigger","vcf",thisTrigger)
|
||||
var filter : VariantFiltration = new VariantFiltration with GATKArgs
|
||||
filter.rodBind :+= new RodBind("variant","vcf",call.out)
|
||||
filter.filterExpression :+= "\"QD<5.0\""
|
||||
filter.filterName :+= "LowQualByDepth"
|
||||
filter.filterExpression :+= "\"SB>-0.10\""
|
||||
filter.filterName :+= "HighStrandBias"
|
||||
filter.out = swapExt(iList,".interval_list",".filtered.vcf")
|
||||
var callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
|
||||
callHiseq.input_file = List(new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"))
|
||||
callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out)
|
||||
callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf")
|
||||
callHiseq.trig_emit_conf = Some(0.0)
|
||||
|
||||
add(rtc,clean,call,filter,callHiseq)
|
||||
|
||||
return (filter.out,callHiSeq.out)
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,75 +9,93 @@ import net.sf.picard.reference.FastaSequenceFile
|
|||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
|
||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
|
||||
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, opType: String) extends InProcessFunction {
|
||||
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
|
||||
@Input(doc="The interval list to expand") val inList : File = in
|
||||
@Input(doc="The reference sequence") val refDict : File = ref
|
||||
@Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
|
||||
@Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
|
||||
@Output(doc="The output intervals file to write to") val outList : File = out
|
||||
@Argument(doc="The output format for the intervals") val outTypeStr = opType
|
||||
@Argument(doc="The input format for the intervals") val inTypeStr = ipType
|
||||
|
||||
val output : PrintStream = new PrintStream(outList)
|
||||
val parser : GenomeLocParser = new GenomeLocParser(new FastaSequenceFile(ref,true))
|
||||
val xrl : XReadLines = new XReadLines(inList)
|
||||
val outType = if ( outTypeStr.equals("INTERVALS") ) IntervalOutputType.INTERVALS else IntervalOutputType.BED
|
||||
var output : PrintStream = _
|
||||
var parser : GenomeLocParser = _
|
||||
var xrl : XReadLines = _
|
||||
val outType = IntervalFormatType.convert(outTypeStr)
|
||||
val inType = IntervalFormatType.convert(inTypeStr)
|
||||
|
||||
var previous : GenomeLoc = _
|
||||
var current : GenomeLoc = _
|
||||
var next : GenomeLoc = _
|
||||
var offsetIn : Int = 0
|
||||
var offsetOut : Int = 0
|
||||
|
||||
var first : Boolean = true
|
||||
var lastTwo : (GenomeLoc,GenomeLoc) = _
|
||||
|
||||
def run = {
|
||||
// todo -- there's got to be a special view that gives prevous, current, next
|
||||
asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).foreach(expand(_))
|
||||
first = true
|
||||
lastTwo = null
|
||||
output = new PrintStream(outList)
|
||||
parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
|
||||
xrl = new XReadLines(inList)
|
||||
offsetIn = if (isBed(inType)) 1 else 0
|
||||
offsetOut = if (isBed(outType)) 1 else 0
|
||||
asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).sliding(3).map(a => a.toList).foreach(a => expand(a(0),a(1),a(2)))
|
||||
expand(lastTwo._1,lastTwo._2,null)
|
||||
output.close()
|
||||
}
|
||||
|
||||
def expand(loc : GenomeLoc) : Unit = {
|
||||
def expand(previous: GenomeLoc, current: GenomeLoc, next: GenomeLoc) : Unit = {
|
||||
if ( first ) {
|
||||
first = false
|
||||
expand(null,previous,current)
|
||||
}
|
||||
lastTwo = (current,next)
|
||||
|
||||
if ( start > 0 ) {
|
||||
previous = current
|
||||
current = next
|
||||
next = loc
|
||||
if ( current == null ) {
|
||||
return
|
||||
}
|
||||
val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt)
|
||||
val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt)
|
||||
if ( ok(new1) ) {
|
||||
if ( ok(new1,previous,next) ) {
|
||||
//System.out.println("Printing! %s".format(repr(new1)))
|
||||
output.print("%s%n".format(repr(new1)))
|
||||
}
|
||||
if ( ok(new2) ) {
|
||||
if ( ok(new2,previous,next) ) {
|
||||
//System.out.println("Printing! %s".format(repr(new2)))
|
||||
output.print("%s%n".format(repr(new2)))
|
||||
}
|
||||
previous = current
|
||||
} else {
|
||||
output.print("%s%n".format(repr(parser.createGenomeLoc(loc.getContig,loc.getStart-sizeInt,loc.getStop+sizeInt))))
|
||||
output.print("%s%n".format(repr(parser.createGenomeLoc(current.getContig,current.getStart-sizeInt,current.getStop+sizeInt))))
|
||||
}
|
||||
}
|
||||
|
||||
def ok( loc : GenomeLoc ) : Boolean = {
|
||||
def ok( loc : GenomeLoc, prev: GenomeLoc, next: GenomeLoc ) : Boolean = {
|
||||
//System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
|
||||
( next == null || loc.distance(next) >= start) && (previous == null || loc.distance(previous) >= start)
|
||||
( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start)
|
||||
}
|
||||
|
||||
def repr(loc : GenomeLoc) : String = {
|
||||
if ( loc == null ) return "null"
|
||||
if ( outType == IntervalOutputType.INTERVALS ) {
|
||||
if ( outType == IntervalFormatType.INTERVALS ) {
|
||||
return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
|
||||
} else if ( outType == IntervalOutputType.BED ) {
|
||||
return "%s\t%d\t%d".format(loc.getContig,loc.getStart-1,loc.getStop)
|
||||
} else {
|
||||
return "FORMAT?"
|
||||
return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
|
||||
}
|
||||
}
|
||||
|
||||
def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
|
||||
t == IntervalFormatType.BED
|
||||
}
|
||||
|
||||
def parseGenomeInterval( s : String ) : GenomeLoc = {
|
||||
val sp = s.split("\\s+")
|
||||
if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt,sp(2).toInt)
|
||||
// todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
|
||||
if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
|
||||
}
|
||||
|
||||
object IntervalOutputType extends Enumeration("INTERVALS","BED") {
|
||||
type IntervalOutputType = Value
|
||||
val INTERVALS,BED = Value
|
||||
object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
|
||||
type IntervalFormatType = Value
|
||||
val INTERVALS,BED,TDF = Value
|
||||
|
||||
def convert(s : String) : IntervalFormatType = {
|
||||
if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import java.io.{PrintStream, File}
|
||||
|
||||
/**
|
||||
* Utility for explicitly grouping intervals together in sets of size num
|
||||
* @Author chartl
|
||||
*/
|
||||
|
||||
class GroupIntervals(iList: File, num: Int, head: Boolean, dir: Option[String]) extends InProcessFunction {
|
||||
@Input(doc="Interval list to split into groups") var intervalList : File = iList
|
||||
@Output(doc="The output interval lists created") var outputList : List[File] = Nil
|
||||
@Argument(doc="The number of groups wanted") var nGroups : Int = num
|
||||
@Argument(doc="The base location you want the files") var directory : Option[String] = dir
|
||||
@Argument(doc="Has header (@ lines)") var hasHeader : Boolean = head
|
||||
|
||||
var written : Int = 0
|
||||
var header : List[String] = Nil
|
||||
|
||||
def run = {
|
||||
written = 0
|
||||
header = Nil
|
||||
asScalaIterator(new XReadLines(intervalList)).filter(u => ! isHeader(u)).grouped(num).foreach(n => write(n))
|
||||
}
|
||||
|
||||
def write(lines : Seq[String]) = {
|
||||
val oFile : File = if ( dir.isEmpty) new File(intervalList.getAbsolutePath+".group%d".format(written))else new File(dir.get+"/"+intervalList.getAbsolutePath+".group%d".format(written))
|
||||
outputList :+= oFile
|
||||
val output : PrintStream = new PrintStream(oFile)
|
||||
header.foreach( u => output.print("%s%n".format(u)))
|
||||
lines.foreach(u => output.print("%s%n".format(u)))
|
||||
output.close()
|
||||
}
|
||||
|
||||
def isHeader(s : String) : Boolean = {
|
||||
if ( hasHeader && s.startsWith("@") ) {
|
||||
header :+= s
|
||||
return true
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,54 +1,70 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import java.io.{PrintStream, File}
|
||||
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
|
||||
abstract class IntersectIntervals(iVals: List[File], outFile: File) extends InProcessFunction {
|
||||
/* @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
|
||||
class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
|
||||
@Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
|
||||
@Output(doc="Output interval file to which to write") val output : File = outFile
|
||||
@Argument(doc="List of contigs for the interval list (defaults to human b37 autosome); IN ORDER")
|
||||
var contigs : List[String] = (new Range(1,22)).map(u => "%d".format(u)) ::: List("X","Y","MT")
|
||||
@Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
|
||||
@Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed
|
||||
|
||||
|
||||
val out : PrintStream = new PrintStream(output)
|
||||
var outStream : PrintStream = _
|
||||
var contigs : List[String] = Nil
|
||||
var dict : SAMSequenceDictionary = _
|
||||
var parser : GenomeLocParser = _
|
||||
|
||||
def run = {
|
||||
var dict : SAMSequenceDictionary = new SAMSequenceDictionary
|
||||
contigs.map( u => new SAMSequenceRecord(u, Integer.MAX_VALUE ) ).foreach(u => dict.addSequence(u))
|
||||
val parser : GenomeLocParser = new GenomeLocParser(dict)
|
||||
val locList : IntervalIterator = new IntervalIterator(intervals,parser,assumeSorted)
|
||||
|
||||
outStream = new PrintStream(output)
|
||||
dict = new SAMSequenceDictionary
|
||||
// note: memory hog
|
||||
val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
|
||||
sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
|
||||
parser = new GenomeLocParser(dict)
|
||||
sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
|
||||
outStream.close()
|
||||
}
|
||||
|
||||
class IntervalIterator(ivals : List[File], parser: GenomeLocParser, sorted: Boolean) extends Iterable[GenomeLoc] {
|
||||
val readers : List[XReadLinesBuffer] = ivals.map(f => new XReadLinesBuffer(new XReadLines(f), parser))
|
||||
val filesAreSorted : Boolean = sorted
|
||||
|
||||
def hasNext = ! readers.filter( _.hasNext ).isEmpty
|
||||
def next = if ( assumeSorted ) nextElement else nextSortedElement
|
||||
def nextElement =
|
||||
|
||||
|
||||
def writeOut(g : GenomeLoc) : Unit = {
|
||||
outStream.print("%s%n".format(g.toString))
|
||||
}
|
||||
|
||||
class XReadLinesBuffer( x: XReadLines, p: GenomeLocParser) extends Iterable[GenomeLoc] {
|
||||
val xrl: XReadLines = x
|
||||
val parser : GenomeLocParser = p
|
||||
var current : GenomeLoc = _
|
||||
|
||||
def hasNext : Boolean = curLine != null || x.hasNext
|
||||
def next : String = {
|
||||
if ( curLine == null && x.hasNext ) curLine = x.next
|
||||
val retLine = curLine
|
||||
if ( x.hasNext ) curLine = x.next else curLine = null
|
||||
retLine
|
||||
def parse(s : String) : (String,Int,Int) = {
|
||||
if ( s.contains(":") ) {
|
||||
val split1 = s.split(":")
|
||||
val split2 = split1(1).split("-")
|
||||
return (split1(0),split2(0).toInt,split2(1).toInt)
|
||||
} else {
|
||||
val split = s.split("\\s+")
|
||||
return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
|
||||
}
|
||||
}
|
||||
|
||||
} */
|
||||
def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
|
||||
parser.createGenomeLoc(coords._1,coords._2,coords._3)
|
||||
}
|
||||
|
||||
def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
|
||||
if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
|
||||
a.init :+ (a.last :+ b)
|
||||
} else {
|
||||
a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
|
||||
}
|
||||
}
|
||||
|
||||
def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
|
||||
if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for
|
||||
null
|
||||
}
|
||||
u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
|
||||
}
|
||||
|
||||
def range(a: Int, b: Int) : Range = new Range(a,b+1,1)
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue