Graph structure must be known at compile time. Removing GroupIntervals until a future point where in-process-functions can predict their output based on inputs [though this is probably forever: the inputs may not exist at compile time!]

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4886 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-12-20 21:22:58 +00:00
parent 61d5daa65c
commit fc33901810
2 changed files with 74 additions and 82 deletions

View File

@@ -1,6 +1,7 @@
import org.broadinstitute.sting.commandline.ArgumentCollection
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals
import org.broadinstitute.sting.queue.library.ipf.intervals.GroupIntervals
import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.utils.text.XReadLines
@@ -14,52 +15,91 @@ class expanded_targets extends QScript {
def script = {
val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => {
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"INTERVALS")
new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"TSV","INTERVALS")
})
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.jarFile = args.gatkJar
}
val userDir = System.getProperty("user.dir")
addAll(intervalExpands)
val callFiles: List[File] = intervalExpands.map(_.outList).map(makeCalls(_,20))
val cleanIntervals : ExpandIntervals = new ExpandIntervals(args.projectIntervals,1,210,new File(userDir+"/"+args.projectName+"_expanded_full.interval_list"),args.projectRef,"TSV","INTERVALS")
add(cleanIntervals)
val uncleanBams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u))
val realign : List[RealignerTargetCreator] = uncleanBams.map(u => {
var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs
rtc.out = swapExt(userDir,u,".bam",".expanded.targets.txt")
rtc.input_file :+= u.getAbsoluteFile
rtc.intervals :+= cleanIntervals.outList
rtc
})
val clean : List[IndelRealigner] = realign.map( u => {
var clean : IndelRealigner = new IndelRealigner with GATKArgs
clean.targetIntervals = u.out
clean.input_file = u.input_file
clean.memoryLimit = Some(4)
clean.out = new File(userDir+"/"+swapExt(u.out,".bam",".expanded.targets.bam").getName)
clean
})
addAll(realign)
addAll(clean)
val callFiles: List[File] = intervalExpands.map(u => makeCalls(u.outList,20,clean.map(h => h.out)))
}
def makeCalls(iList: File, scatterNo: Int): List[File] = {
var scatters : List[(File,File)] = _
def makeCalls(iList: File, scatterNo: Int, bams: List[File]): File = {
var scatters : GroupIntervals = new GroupIntervals(iList,20,true,Some(System.getProperty("user.dir")))
var filtered : List[(File,File)] = scatters.outputList.zipWithIndex.map(v => reallyMakeCalls(v._1,bams,v._2))
var gatherStandard : VcfGatherFunction = new VcfGatherFunction
gatherStandard.gatherParts = filtered.map(u => u._1)
gatherStandard.originalOutput = swapExt(iList,".interval_list",".filtered.calls.vcf")
var gatherHiseq : VcfGatherFunction = new VcfGatherFunction
gatherHiseq.gatherParts = filtered.map(u => u._2)
gatherHiseq.originalOutput = swapExt(iList,".interval_list",".filtered.hiseq.vcf")
var eval : VariantEval = new VariantEval
eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out)
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf"))
eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out)
eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf"))
eval.out = swapExt(iList,".interval_list",".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV)
}
def makeCalls(iList: File) : File = {
var bams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u))
add(scatters)
add(gatherStandard)
add(gatherHiseq)
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.intervals = List(iList)
this.jarFile = args.gatkJar
this.memoryLimit = Some(6)
}
var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs
rtc.out = swapExt(iList,".interval_list",".targets.txt")
rtc.input_file = bams
var clean : IndelRealigner = new IndelRealigner with GATKArgs
clean.targetIntervals = rtc.out
clean.out = swapExt(iList,".interval_list",".cleaned.bam")
clean.input_file = bams
clean.maxReads = Some(100000)
var eval : VariantEval = new VariantEval with GATKArgs
eval.rodBind :+= new RodBind("evalInterval","vcf",gatherStandard.originalOutput)
eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf"))
eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",gatherHiseq.originalOutput)
eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf"))
eval.out = swapExt(iList,".interval_list",".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV)
eval.out
}
def reallyMakeCalls(iList: File, bams : List[File], scatterNo: Int) : (File,File) = {
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = args.projectRef
this.DBSNP = args.projectDBSNP
this.jarFile = args.gatkJar
this.intervals :+= iList
}
var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
call.scatterCount = 4
call.input_file = List(clean.out)
call.out = swapExt(iList,".interval_list",".raw.vcf")
call.input_file = bams
call.out = swapExt(iList,".interval_list",".scatter%d.raw.vcf".format(scatterNo))
call.trig_emit_conf = Some(0.0)
call.rodBind :+= new RodBind("trigger","vcf",thisTrigger)
var filter : VariantFiltration = new VariantFiltration with GATKArgs
@@ -68,16 +108,16 @@ class expanded_targets extends QScript {
filter.filterName :+= "LowQualByDepth"
filter.filterExpression :+= "\"SB>-0.10\""
filter.filterName :+= "HighStrandBias"
filter.out = swapExt(iList,".interval_list",".filtered.vcf")
filter.out = swapExt(iList,".interval_list",".scatter%d.filtered.vcf".format(scatterNo))
var callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs
callHiseq.input_file = List(new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"))
callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out)
callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf")
callHiseq.out = swapExt(iList,".interval_list",".scatter%d.hiSeq.genotypes.vcf".format(scatterNo))
callHiseq.trig_emit_conf = Some(0.0)
add(rtc,clean,call,filter,callHiseq)
add(call,filter,callHiseq)
return (filter.out,callHiSeq.out)
return (filter.out,callHiseq.out)
}
}

View File

@@ -1,48 +0,0 @@
package org.broadinstitute.sting.queue.library.ipf.intervals
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
import java.io.{PrintStream, File}
/**
 * Utility for explicitly grouping the intervals of an interval list into
 * files of (at most) `num` intervals each, replaying any header (@) lines
 * into every group file.
 *
 * NOTE(review): outputList is only populated when run() executes, so the
 * grouped files are not known at Queue graph-construction time — confirm
 * this is acceptable for callers that read outputList eagerly.
 * @author chartl
 */
class GroupIntervals(iList: File, num: Int, head: Boolean, dir: Option[String]) extends InProcessFunction {
  @Input(doc="Interval list to split into groups") var intervalList : File = iList
  @Output(doc="The output interval lists created") var outputList : List[File] = Nil
  // Fixed doc text: grouped(num) produces groups of SIZE num, not num groups.
  @Argument(doc="The number of intervals per output group") var nGroups : Int = num
  @Argument(doc="The base location you want the files") var directory : Option[String] = dir
  @Argument(doc="Has header (@ lines)") var hasHeader : Boolean = head
  var written : Int = 0           // count of group files written so far; used to name outputs
  var header : List[String] = Nil // header (@) lines captured from the input, echoed into each group

  def run = {
    written = 0
    header = Nil
    // filter is lazy, so all header lines at the top of the file pass through
    // isHeader (and are captured) before the first group is materialized
    asScalaIterator(new XReadLines(intervalList)).filter(u => ! isHeader(u)).grouped(num).foreach(n => write(n))
  }

  /** Writes one group of interval lines (preceded by any captured header) to its own file. */
  def write(lines : Seq[String]) = {
    // Bug fix: when a target directory is given, join it with getName rather than
    // getAbsolutePath — concatenating two absolute paths produced a bogus location.
    val oFile : File =
      if ( dir.isEmpty ) new File(intervalList.getAbsolutePath+".group%d".format(written))
      else new File(dir.get+"/"+intervalList.getName+".group%d".format(written))
    outputList :+= oFile
    val output : PrintStream = new PrintStream(oFile)
    header.foreach( u => output.print("%s%n".format(u)))
    lines.foreach(u => output.print("%s%n".format(u)))
    output.close()
    // Bug fix: advance the counter so each group gets a distinct file name;
    // previously every group overwrote ".group0".
    written += 1
  }

  /** Returns true for header (@) lines; side effect: accumulates them for write() to replay. */
  def isHeader(s : String) : Boolean = {
    if ( hasHeader && s.startsWith("@") ) {
      header :+= s
      return true
    } else {
      return false
    }
  }
}