From 851b3e71f98d2a6f3fd337a96412b9915a661707 Mon Sep 17 00:00:00 2001 From: chartl Date: Mon, 14 Feb 2011 23:52:05 +0000 Subject: [PATCH] Major revision of the batch merge script. All sites are now used, hooks for some UG settings, no longer reliant on the pipeline management library (pipeline libs are probably going to go away -- nobody uses them) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5241 348d0f76-0448-11de-a6fe-93d51630548a --- scala/qscript/playground/BatchMerge.q | 68 ++++++++++++++++++- .../library/clf/vcf/VCFExtractIntervals.scala | 4 +- .../library/ipf/vcf/VCFExtractIntervals.scala | 2 +- .../library/ipf/vcf/VCFExtractSites.scala | 21 ++++-- .../library/ipf/vcf/VCFSimpleMerge.scala | 6 +- .../sting/queue/pipeline/VariantCalling.scala | 48 +++++++++++++ 6 files changed, 136 insertions(+), 13 deletions(-) diff --git a/scala/qscript/playground/BatchMerge.q b/scala/qscript/playground/BatchMerge.q index 7eb693a34..56867b0ed 100755 --- a/scala/qscript/playground/BatchMerge.q +++ b/scala/qscript/playground/BatchMerge.q @@ -1,9 +1,12 @@ import java.io.{FileReader, BufferedReader} +import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals} import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling} import org.broadinstitute.sting.queue.{QException, QScript} import collection.JavaConversions._ +import org.broadinstitute.sting.utils.baq.BAQ import org.broadinstitute.sting.utils.text.XReadLines import org.broadinstitute.sting.utils.yaml.YamlUtils @@ -15,13 +18,72 @@ class batchMergePipeline extends QScript { @Argument(doc="sting dir",shortName="sting") var stingDir: String = _ @Argument(doc="reference file",shortName="ref") var ref: File = _ @Argument(doc="batched output",shortName="batch") var batchOut: File = _ + //@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false + @Hidden @Argument(doc="Min base q",shortName="mbq",required=false) var mbq : Int = 20 + @Hidden @Argument(doc="Min map q",shortName="mmq",required=false) var mmq : Int = 20 + @Hidden @Argument(doc="Max mismatching bases",shortName="mmb",required=false) var mmb : Int = 3 + @Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary",shortName="baqp",required=false) var baq : Int = -1 def script = { var vcfs : List[File] = extractFileEntries(vcfList) var bams : List[File] = extractFileEntries(bamList) - var pmLib = new ProjectManagement(stingDir) - addAll(pmLib.MergeBatches(vcfs,bams,batchOut,ref,20)) + + trait ExtractArgs extends VCFExtractSites { + this.keepFilters = true + this.keepInfo = false + this.keepQual = false + } + + var getVariantAlleles : List[VCFExtractSites] = vcfs.map( u => new VCFExtractSites(u, swapExt(batchOut.getParent,u,".vcf",".alleles.vcf")) with ExtractArgs) + var combineVCFs : VCFSimpleMerge = new VCFSimpleMerge + combineVCFs.vcfs = getVariantAlleles.map(u => u.outVCF) + combineVCFs.fai = new File(ref.getAbsolutePath+".fai") + combineVCFs.outVCF = swapExt(batchOut,".vcf",".pf.alleles.vcf") + var extractIntervals : VCFExtractIntervals = new VCFExtractIntervals(combineVCFs.outVCF,swapExt(combineVCFs.outVCF,".vcf",".intervals.list"),true) + addAll(getVariantAlleles) + add(combineVCFs,extractIntervals) + + trait CalcLikelihoodArgs extends UGCalcLikelihoods { + this.reference_sequence = batchMerge.ref + this.max_mismatches_in_40bp_window = Some(batchMerge.mmb) + this.min_base_quality_score = Some(batchMerge.mbq) + this.min_mapping_quality_score = Some(batchMerge.mmq) + if ( batchMerge.baq >= 0 ) { + this.baqGapOpenPenalty = Some(batchMerge.baq) + this.baq = Some(BAQ.CalculationMode.CALCULATE_AS_NECESSARY) + } + this.intervals :+= extractIntervals.listOut + this.alleleVCF = combineVCFs.outVCF + this.output_all_callable_bases = true + this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") + this.memoryLimit = Some(4) + this.scatterCount = 60 + } + + def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = { + var ugcl = new UGCalcLikelihoods with CalcLikelihoodArgs + ugcl.input_file ++= bams._1 + ugcl.out = new File("MBatch%d.likelihoods.vcf".format(bams._2)) + return ugcl + } + + var calcs: List[UGCalcLikelihoods] = bams.grouped(20).toList.zipWithIndex.map(u => newUGCL(u)) + addAll(calcs) + + trait CallVariantsArgs extends UGCallVariants { + this.output_all_callable_bases = true + this.reference_sequence = batchMerge.ref + this.intervals :+= extractIntervals.listOut + this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") + this.scatterCount = 30 + this.memoryLimit = Some(8) + } + + var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs + cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) ) + cVars.out = batchOut + add(cVars) } def extractFileEntries(in: File): List[File] = { diff --git a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala index ffb4476d5..c072d219a 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala @@ -14,8 +14,8 @@ class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extend @Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly def commandLine = { - if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) - else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) + if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) + else "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) } } \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 49d070e75..398fc37fb 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -24,7 +24,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e def vcf2int( vcfLine: String ) : Unit = { var spline = vcfLine.split("\t") - if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || keepFilters) ) { + if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || spline(6).equals(".") || keepFilters) ) { out.print("%s:%s%n".format(spline(0),spline(1))) } } diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala index f4d64d12d..da69761a9 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala @@ -16,17 +16,26 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction { def lineMap( line: String ) : String = { if ( line.startsWith("##") ) { return line } - val spline = line.split("\t",9) + var spline = line.split("\t",9) if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) } if ( spline(6) == "PASS" || keepFilters ) { - if ( ! keepInfo ) { - spline(7) = "." + var buf = new StringBuffer(spline.slice(0,5).reduceLeft(_ + "\t" + _ )) + if ( keepQual ) { + buf.append("\t%s".format(spline(5))) + } else { + buf.append("\t.") } - if ( ! keepQual ) { - spline(5) = "." + + buf.append("\t.") + + if ( keepInfo ) { + buf.append("\t%s".format(spline(7))) + } else { + buf.append("\t.") } - return spline.slice(0,8).reduceLeft( _ + "\t" + _ ) + + return buf.toString } return "" diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala index 94b79454f..c193b63e8 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala @@ -66,9 +66,13 @@ class VCFSimpleMerge extends InProcessFunction { val glp : GenomeLocParser = new GenomeLocParser(ssd) + var last = "" while ( ! xrls.forall( u => ! u.hasNext ) ) { val first = xrls.filter( u => u.hasNext).reduceLeft( (a,b) => if ( genomeLoc(a,glp).isBefore(genomeLoc(b,glp))) a else b ) - w.println(first.next) + if ( ! first.peek.equals(last) ) { + w.println(first.peek) + } + last = first.next } w.close diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala index cf845a1b5..a614954a4 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala @@ -2,11 +2,14 @@ package org.broadinstitute.sting.queue.pipeline import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ import java.io.File +import scala.collection.JavaConversions._ import org.broadinstitute.sting.datasources.pipeline.Pipeline import org.broadinstitute.sting.gatk.DownsampleType import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.utils.text.XReadLines +import scala.collection.mutable.HashMap class VariantCalling(attribs: Pipeline,gatkJar: File) { vc => @@ -370,3 +373,48 @@ class VariantCalling(attribs: Pipeline,gatkJar: File) { new File(file.getName.stripSuffix(oldExtension) + newExtension) } + +object VariantCalling { + + /** + * @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line + * @Doc - Constructs a UG object based on the settings in the header of the input VCF + * @Returns an instance of the UG object with all information propagated + */ + def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = { + var myUG : UnifiedGenotyper = new UnifiedGenotyper + var xrl : XReadLines = new XReadLines(vcf) + val header_limit = 1000 + var line = "" + var line_no = 1 + while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) { + line = xrl.next + line_no += 1 + } + var ugLine : String = line + if ( ! ugLine.startsWith("##UnifiedGenotyper") ) { + return myUG // nothing to add + } + + var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ") + var ugMap = new HashMap[String,String] + def addEntry( s : String ) : Unit = { + val sps = s.split("=") + val k = sps(0) + val v = sps(1) + val kv = (k,v) + ugMap += kv + } + ugTokens.foreach( addEntry(_) ) + + myUG.reference_sequence = new File(ugMap("reference_sequence")) + myUG.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq"))) + myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble) + myUG.DBSNP = new File(ugMap("DBSNP")) + + // todo -- more args + + return myUG + + } +}