diff --git a/scala/qscript/chartl/expanded_targets.q b/scala/qscript/chartl/expanded_targets.q index ca8de6a00..abe08dddc 100755 --- a/scala/qscript/chartl/expanded_targets.q +++ b/scala/qscript/chartl/expanded_targets.q @@ -1,21 +1,83 @@ import org.broadinstitute.sting.commandline.ArgumentCollection +import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ class expanded_targets extends QScript { @ArgumentCollection var args : PipelineArgumentCollection = new PipelineArgumentCollection @Argument(shortName="bait",doc="The list of baits associated with the target list",required=false) var baitFile : File = _ + @Argument(shortName="thisTrigger",doc="The trigger track to use",required=false) var thisTrigger : File = new File("/humgen/gsa-hphome1/chartl/projects/exome/expanded/triggers/joined.omni.hiseq.vcf") def script = { val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => { - new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"BED") + new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"INTERVALS","INTERVALS") }) addAll(intervalExpands) - // intervalExpands.map(_.outList).foreach(makeCalls(_)) + val callFiles: List[List[File]] = intervalExpands.map(_.outList).map(makeCalls(_,20)) } -} \ No newline at end of file + + def makeCalls(iList: File, scatterNo: Int): List[File] = { + var scatters : List[(File,File)] = Nil + + var eval : VariantEval = new VariantEval + eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out) + 
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf")) + eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out) + eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf")) + eval.out = swapExt(iList,".interval_list",".eval") + eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) + + } + + def makeCalls(iList: File) : (File,File) = { + + var bams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u)) + + trait GATKArgs extends CommandLineGATK { + this.reference_sequence = args.projectRef + this.DBSNP = args.projectDBSNP + this.intervals = List(iList) + this.jarFile = args.gatkJar + this.memoryLimit = Some(6) + } + + var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs + rtc.out = swapExt(iList,".interval_list",".targets.txt") + rtc.input_file = bams + var clean : IndelRealigner = new IndelRealigner with GATKArgs + clean.targetIntervals = rtc.out + clean.out = swapExt(iList,".interval_list",".cleaned.bam") + clean.input_file = bams + clean.maxReads = Some(100000) + var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs + call.scatterCount = 4 + call.input_file = List(clean.out) + call.out = swapExt(iList,".interval_list",".raw.vcf") + call.trig_emit_conf = Some(0.0) + call.rodBind :+= new RodBind("trigger","vcf",thisTrigger) + var filter : VariantFiltration = new VariantFiltration with GATKArgs + filter.rodBind :+= new RodBind("variant","vcf",call.out) + filter.filterExpression :+= "\"QD<5.0\"" + filter.filterName :+= "LowQualByDepth" + filter.filterExpression :+= "\"SB>-0.10\"" + filter.filterName :+= "HighStrandBias" + filter.out = swapExt(iList,".interval_list",".filtered.vcf") + var 
callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs + callHiseq.input_file = List(new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam")) + callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out) + callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf") + callHiseq.trig_emit_conf = Some(0.0) + + add(rtc,clean,call,filter,callHiseq) + + return (filter.out,callHiSeq.out) + + } +} diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala index 7526990b8..27179e771 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala @@ -9,75 +9,93 @@ import net.sf.picard.reference.FastaSequenceFile import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} // todo -- this is unsafe. 
Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, opType: String) extends InProcessFunction { +class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { @Input(doc="The interval list to expand") val inList : File = in @Input(doc="The reference sequence") val refDict : File = ref @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size @Output(doc="The output intervals file to write to") val outList : File = out @Argument(doc="The output format for the intervals") val outTypeStr = opType + @Argument(doc="The input format for the intervals") val inTypeStr = ipType - val output : PrintStream = new PrintStream(outList) - val parser : GenomeLocParser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - val xrl : XReadLines = new XReadLines(inList) - val outType = if ( outTypeStr.equals("INTERVALS") ) IntervalOutputType.INTERVALS else IntervalOutputType.BED + var output : PrintStream = _ + var parser : GenomeLocParser = _ + var xrl : XReadLines = _ + val outType = IntervalFormatType.convert(outTypeStr) + val inType = IntervalFormatType.convert(inTypeStr) - var previous : GenomeLoc = _ - var current : GenomeLoc = _ - var next : GenomeLoc = _ + var offsetIn : Int = 0 + var offsetOut : Int = 0 + + var first : Boolean = true + var lastTwo : (GenomeLoc,GenomeLoc) = _ def run = { - // todo -- there's got to be a special view that gives prevous, current, next - asScalaIterable(xrl).filter( ! 
_.startsWith("@")).map(parseGenomeInterval(_)).foreach(expand(_)) + first = true + lastTwo = null + output = new PrintStream(outList) + parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) + xrl = new XReadLines(inList) + offsetIn = if (isBed(inType)) 1 else 0 + offsetOut = if (isBed(outType)) 1 else 0 + asScalaIterable(xrl).filter( ! _.startsWith("@")).map(parseGenomeInterval(_)).sliding(3).map(a => a.toList).foreach(a => if ( a.size == 3 ) expand(a(0),a(1),a(2))) + if ( lastTwo != null ) expand(lastTwo._1,lastTwo._2,null) + output.close() } - def expand(loc : GenomeLoc) : Unit = { + def expand(previous: GenomeLoc, current: GenomeLoc, next: GenomeLoc) : Unit = { + if ( first ) { + first = false + expand(null,previous,current) + } + lastTwo = (current,next) + if ( start > 0 ) { - previous = current - current = next - next = loc - if ( current == null ) { - return - } val new1 = parser.createGenomeLoc(current.getContig,current.getStart-startInt-sizeInt,current.getStart-startInt) val new2 = parser.createGenomeLoc(current.getContig,current.getStop+startInt,current.getStop+startInt+sizeInt) - if ( ok(new1) ) { + if ( ok(new1,previous,next) ) { //System.out.println("Printing! %s".format(repr(new1))) output.print("%s%n".format(repr(new1))) } - if ( ok(new2) ) { + if ( ok(new2,previous,next) ) { //System.out.println("Printing! 
%s".format(repr(new2))) output.print("%s%n".format(repr(new2))) } - previous = current } else { - output.print("%s%n".format(repr(parser.createGenomeLoc(loc.getContig,loc.getStart-sizeInt,loc.getStop+sizeInt)))) + output.print("%s%n".format(repr(parser.createGenomeLoc(current.getContig,current.getStart-sizeInt,current.getStop+sizeInt)))) } } - def ok( loc : GenomeLoc ) : Boolean = { + def ok( loc : GenomeLoc, prev: GenomeLoc, next: GenomeLoc ) : Boolean = { //System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous))) - ( next == null || loc.distance(next) >= start) && (previous == null || loc.distance(previous) >= start) + ( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start) } def repr(loc : GenomeLoc) : String = { if ( loc == null ) return "null" - if ( outType == IntervalOutputType.INTERVALS ) { + if ( outType == IntervalFormatType.INTERVALS ) { return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else if ( outType == IntervalOutputType.BED ) { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-1,loc.getStop) } else { - return "FORMAT?" 
+ return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop) } } + def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { + t == IntervalFormatType.BED + } + def parseGenomeInterval( s : String ) : GenomeLoc = { val sp = s.split("\\s+") - if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt,sp(2).toInt) + // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted + if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt) - object IntervalOutputType extends Enumeration("INTERVALS","BED") { - type IntervalOutputType = Value - val INTERVALS,BED = Value + object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { + type IntervalFormatType = Value + val INTERVALS,BED,TDF = Value + + def convert(s : String) : IntervalFormatType = { + if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} + } } } \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala new file mode 100755 index 000000000..c1cf38274 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.queue.library.ipf.intervals + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Argument, Output, Input} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.text.XReadLines +import java.io.{PrintStream, File} + +/** + * Utility for explicitly grouping intervals together in sets of size num + * @Author chartl + */ + +class GroupIntervals(iList: File, num: Int, head: Boolean, dir: Option[String]) extends InProcessFunction { + @Input(doc="The interval list to split into 
groups") var intervalList : File = iList + @Output(doc="The output interval lists created") var outputList : List[File] = Nil + @Argument(doc="The number of groups wanted") var nGroups : Int = num + @Argument(doc="The base location you want the files") var directory : Option[String] = dir + @Argument(doc="Has header (@ lines)") var hasHeader : Boolean = head + + var written : Int = 0 + var header : List[String] = Nil + + def run = { + written = 0 + header = Nil + asScalaIterator(new XReadLines(intervalList)).filter(u => ! isHeader(u)).grouped(num).foreach(n => write(n)) + } + + def write(lines : Seq[String]) = { + val oFile : File = if ( dir.isEmpty) new File(intervalList.getAbsolutePath+".group%d".format(written))else new File(dir.get+"/"+intervalList.getName+".group%d".format(written)) + outputList :+= oFile; written += 1 + val output : PrintStream = new PrintStream(oFile) + header.foreach( u => output.print("%s%n".format(u))) + lines.foreach(u => output.print("%s%n".format(u))) + output.close() + } + + def isHeader(s : String) : Boolean = { + if ( hasHeader && s.startsWith("@") ) { + header :+= s + return true + } else { + return false + } + } + +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala index 2b3c81b29..e929477a1 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala @@ -1,54 +1,70 @@ package org.broadinstitute.sting.queue.library.ipf.intervals import org.broadinstitute.sting.queue.function.InProcessFunction +import collection.JavaConversions._ import org.broadinstitute.sting.commandline._ import java.io.{PrintStream, File} import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} import 
org.broadinstitute.sting.utils.text.XReadLines import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -abstract class IntersectIntervals(iVals: List[File], outFile: File) extends InProcessFunction { - /* @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals +class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { + @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals @Output(doc="Output interval file to which to write") val output : File = outFile - @Argument(doc="List of contigs for the interval list (defaults to human b37 autosome); IN ORDER") - var contigs : List[String] = (new Range(1,22)).map(u => "%d".format(u)) ::: List("X","Y","MT") @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false + @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - val out : PrintStream = new PrintStream(output) + var outStream : PrintStream = _ + var contigs : List[String] = Nil + var dict : SAMSequenceDictionary = _ + var parser : GenomeLocParser = _ def run = { - var dict : SAMSequenceDictionary = new SAMSequenceDictionary - contigs.map( u => new SAMSequenceRecord(u, Integer.MAX_VALUE ) ).foreach(u => dict.addSequence(u)) - val parser : GenomeLocParser = new GenomeLocParser(dict) - val locList : IntervalIterator = new IntervalIterator(intervals,parser,assumeSorted) - + outStream = new PrintStream(output) + dict = new SAMSequenceDictionary + // note: memory hog + val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex + sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) + parser = new GenomeLocParser(dict) + sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => 
(newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) + outStream.close() } - class IntervalIterator(ivals : List[File], parser: GenomeLocParser, sorted: Boolean) extends Iterable[GenomeLoc] { - val readers : List[XReadLinesBuffer] = ivals.map(f => new XReadLinesBuffer(new XReadLines(f), parser)) - val filesAreSorted : Boolean = sorted - - def hasNext = ! readers.filter( _.hasNext ).isEmpty - def next = if ( assumeSorted ) nextElement else nextSortedElement - def nextElement = - - + def writeOut(g : GenomeLoc) : Unit = { + outStream.print("%s%n".format(g.toString)) } - class XReadLinesBuffer( x: XReadLines, p: GenomeLocParser) extends Iterable[GenomeLoc] { - val xrl: XReadLines = x - val parser : GenomeLocParser = p - var current : GenomeLoc = _ - - def hasNext : Boolean = curLine != null || x.hasNext - def next : String = { - if ( curLine == null && x.hasNext ) curLine = x.next - val retLine = curLine - if ( x.hasNext ) curLine = x.next else curLine = null - retLine + def parse(s : String) : (String,Int,Int) = { + if ( s.contains(":") ) { + val split1 = s.split(":") + val split2 = split1(1).split("-") + return (split1(0),split2(0).toInt,split2(1).toInt) + } else { + val split = s.split("\\s+") + return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt ) } + } - } */ + def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { + parser.createGenomeLoc(coords._1,coords._2,coords._3) + } + + def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { + if ( a.isEmpty ) { List(List(b)) } else if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { + a.init :+ (a.last :+ b) + } else { + a :+ ( a.last.dropWhile(u => ! 
u._1.overlapsP(b._1)) :+ b) + } + } + + def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { + if ( u.map(h => h._2).distinct.size != intervals.size ) { // if all sources not accounted for + return null + } + u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) + } + + def range(a: Int, b: Int) : Range = new Range(a,b+1,1) }