diff --git a/scala/qscript/chartl/expanded_targets.q b/scala/qscript/chartl/expanded_targets.q index abe08dddc..ee8f4012f 100755 --- a/scala/qscript/chartl/expanded_targets.q +++ b/scala/qscript/chartl/expanded_targets.q @@ -1,6 +1,7 @@ import org.broadinstitute.sting.commandline.ArgumentCollection import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals +import org.broadinstitute.sting.queue.library.ipf.intervals.GroupIntervals import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.utils.text.XReadLines @@ -14,52 +15,91 @@ class expanded_targets extends QScript { def script = { val intervalExpands : List[ExpandIntervals] = (new Range(0,40,1)).toList.map( u => { - new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"INTERVALS") + new ExpandIntervals(args.projectIntervals,1+5*u,5,new File(System.getProperty("user.dir")+"/"+args.projectName+"_expanded_%d_%d.interval_list".format(1+5*u,6+5*u)),args.projectRef,"TSV","INTERVALS") }) + trait GATKArgs extends CommandLineGATK { + this.reference_sequence = args.projectRef + this.DBSNP = args.projectDBSNP + this.jarFile = args.gatkJar + } + + val userDir = System.getProperty("user.dir") + addAll(intervalExpands) - val callFiles: List[File] = intervalExpands.map(_.outList).map(makeCalls(_,20)) + val cleanIntervals : ExpandIntervals = new ExpandIntervals(args.projectIntervals,1,210,new File(userDir+"/"+args.projectName+"_expanded_full.interval_list"),args.projectRef,"TSV","INTERVALS") + + add(cleanIntervals) + + val uncleanBams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u)) + val realign : List[RealignerTargetCreator] = uncleanBams.map(u => { + var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs + rtc.out = swapExt(userDir,u,".bam",".expanded.targets.txt") + rtc.input_file :+= u.getAbsoluteFile + rtc.intervals :+= cleanIntervals.outList + rtc + }) + val clean : List[IndelRealigner] = realign.map( u => { + var clean : IndelRealigner = new IndelRealigner with GATKArgs + clean.targetIntervals = u.out + clean.input_file = u.input_file + clean.memoryLimit = Some(4) + clean.out = new File(userDir+"/"+swapExt(u.out,".bam",".expanded.targets.bam").getName) + clean + }) + + addAll(realign) + addAll(clean) + + val callFiles: List[File] = intervalExpands.map(u => makeCalls(u.outList,20,clean.map(h => h.out))) } - def makeCalls(iList: File, scatterNo: Int): List[File] = { - var scatters : List[(File,File)] = _ + def makeCalls(iList: File, scatterNo: Int, bams: List[File]): File = { + var scatters : GroupIntervals = new GroupIntervals(iList,20,true,Some(System.getProperty("user.dir"))) + var filtered : List[(File,File)] = scatters.outputList.zipWithIndex.map(v => reallyMakeCalls(v._1,bams,v._2)) + var gatherStandard : VcfGatherFunction = new VcfGatherFunction + gatherStandard.gatherParts = filtered.map(u => u._1) + gatherStandard.originalOutput = swapExt(iList,".interval_list",".filtered.calls.vcf") + var gatherHiseq : VcfGatherFunction = new VcfGatherFunction + gatherHiseq.gatherParts = filtered.map(u => u._2) + gatherHiseq.originalOutput = swapExt(iList,".interval_list",".filtered.hiseq.vcf") - var eval : VariantEval = new VariantEval - eval.rodBind :+= new RodBind("evalInterval","vcf",filter.out) - eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf")) - eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",callHiseq.out) - eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf")) - eval.out = swapExt(iList,".interval_list",".eval") - eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) - - } - - def makeCalls(iList: File) : File = { - - var bams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u)) + add(scatters) + add(gatherStandard) + add(gatherHiseq) trait GATKArgs extends CommandLineGATK { this.reference_sequence = args.projectRef this.DBSNP = args.projectDBSNP - this.intervals = List(iList) this.jarFile = args.gatkJar - this.memoryLimit = Some(6) } - var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs - rtc.out = swapExt(iList,".interval_list",".targets.txt") - rtc.input_file = bams - var clean : IndelRealigner = new IndelRealigner with GATKArgs - clean.targetIntervals = rtc.out - clean.out = swapExt(iList,".interval_list",".cleaned.bam") - clean.input_file = bams - clean.maxReads = Some(100000) + + var eval : VariantEval = new VariantEval with GATKArgs + eval.rodBind :+= new RodBind("evalInterval","vcf",gatherStandard.originalOutput) + eval.rodBind :+= new RodBind("compHiSeq","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf")) + eval.rodBind :+= new RodBind("compHiSeq_atSites","vcf",gatherHiseq.originalOutput) + eval.rodBind :+= new RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni_2.5_764_samples.b37.deduped.annot.vcf")) + eval.out = swapExt(iList,".interval_list",".eval") + eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) + + eval.out + } + + def reallyMakeCalls(iList: File, bams : List[File], scatterNo: Int) : (File,File) = { + + trait GATKArgs extends CommandLineGATK { + this.reference_sequence = args.projectRef + this.DBSNP = args.projectDBSNP + this.jarFile = args.gatkJar + this.intervals :+= iList + } + var call : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs - call.scatterCount = 4 - call.input_file = List(clean.out) - call.out = swapExt(iList,".interval_list",".raw.vcf") + call.input_file = bams + call.out = swapExt(iList,".interval_list",".scatter%d.raw.vcf".format(scatterNo)) call.trig_emit_conf = Some(0.0) call.rodBind :+= new RodBind("trigger","vcf",thisTrigger) var filter : VariantFiltration = new VariantFiltration with GATKArgs @@ -68,16 +108,16 @@ class expanded_targets extends QScript { filter.filterName :+= "LowQualByDepth" filter.filterExpression :+= "\"SB>-0.10\"" filter.filterName :+= "HighStrandBias" - filter.out = swapExt(iList,".interval_list",".filtered.vcf") + filter.out = swapExt(iList,".interval_list",".scatter%d.filtered.vcf".format(scatterNo)) var callHiseq : UnifiedGenotyper = new UnifiedGenotyper with GATKArgs callHiseq.input_file = List(new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam")) callHiseq.rodBind :+= new RodBind("trigger","vcf",filter.out) - callHiseq.out = swapExt(iList,".interval_list",".hiSeq.genotypes.vcf") + callHiseq.out = swapExt(iList,".interval_list",".scatter%d.hiSeq.genotypes.vcf".format(scatterNo)) callHiseq.trig_emit_conf = Some(0.0) - add(rtc,clean,call,filter,callHiseq) + add(call,filter,callHiseq) - return (filter.out,callHiSeq.out) + return (filter.out,callHiseq.out) } } diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala deleted file mode 100755 index c1cf38274..000000000 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/GroupIntervals.scala +++ /dev/null @@ -1,48 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline.{Argument, Output, Input} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import java.io.{PrintStream, File} - -/** - * Utility for explicitly grouping intervals together in sets of size num - * @Author chartl - */ - -class GroupIntervals(iList: File, num: Int, head: Boolean, dir: Option[String]) extends InProcessFunction { - @Input(doc="Interval list to split into groups") var intervalList : File = iList - @Output(doc="The output interval lists created") var outputList : List[File] = Nil - @Argument(doc="The number of groups wanted") var nGroups : Int = num - @Argument(doc="The base location you want the files") var directory : Option[String] = dir - @Argument(doc="Has header (@ lines)") var hasHeader : Boolean = head - - var written : Int = 0 - var header : List[String] = Nil - - def run = { - written = 0 - header = Nil - asScalaIterator(new XReadLines(intervalList)).filter(u => ! isHeader(u)).grouped(num).foreach(n => write(n)) - } - - def write(lines : Seq[String]) = { - val oFile : File = if ( dir.isEmpty) new File(intervalList.getAbsolutePath+".group%d".format(written))else new File(dir.get+"/"+intervalList.getAbsolutePath+".group%d".format(written)) - outputList :+= oFile - val output : PrintStream = new PrintStream(oFile) - header.foreach( u => output.print("%s%n".format(u))) - lines.foreach(u => output.print("%s%n".format(u))) - output.close() - } - - def isHeader(s : String) : Boolean = { - if ( hasHeader && s.startsWith("@") ) { - header :+= s - return true - } else { - return false - } - } - -} \ No newline at end of file