diff --git a/scala/qscript/playground/BatchMerge.q b/scala/qscript/playground/BatchMerge.q index 7eb693a34..56867b0ed 100755 --- a/scala/qscript/playground/BatchMerge.q +++ b/scala/qscript/playground/BatchMerge.q @@ -1,9 +1,12 @@ import java.io.{FileReader, BufferedReader} +import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.datasources.pipeline.Pipeline -import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals} import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling} import org.broadinstitute.sting.queue.{QException, QScript} import collection.JavaConversions._ +import org.broadinstitute.sting.utils.baq.BAQ import org.broadinstitute.sting.utils.text.XReadLines import org.broadinstitute.sting.utils.yaml.YamlUtils @@ -15,13 +18,72 @@ class batchMergePipeline extends QScript { @Argument(doc="sting dir",shortName="sting") var stingDir: String = _ @Argument(doc="reference file",shortName="ref") var ref: File = _ @Argument(doc="batched output",shortName="batch") var batchOut: File = _ + //@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false + @Hidden @Argument(doc="Min base q",shortName="mbq",required=false) var mbq : Int = 20 + @Hidden @Argument(doc="Min map q",shortName="mmq",required=false) var mmq : Int = 20 + @Hidden @Argument(doc="Max mismatching bases",shortName="mmb",required=false) var mmb : Int = 3 + @Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary",shortName="baqp",required=false) var baq : Int = -1 def script = { var vcfs : List[File] = extractFileEntries(vcfList) var bams : List[File] = extractFileEntries(bamList) - var pmLib = new ProjectManagement(stingDir) - addAll(pmLib.MergeBatches(vcfs,bams,batchOut,ref,20)) + + trait ExtractArgs extends VCFExtractSites { + this.keepFilters = true + this.keepInfo = false + this.keepQual = false + } + + var getVariantAlleles : List[VCFExtractSites] = vcfs.map( u => new VCFExtractSites(u, swapExt(batchOut.getParent,u,".vcf",".alleles.vcf")) with ExtractArgs) + var combineVCFs : VCFSimpleMerge = new VCFSimpleMerge + combineVCFs.vcfs = getVariantAlleles.map(u => u.outVCF) + combineVCFs.fai = new File(ref.getAbsolutePath+".fai") + combineVCFs.outVCF = swapExt(batchOut,".vcf",".pf.alleles.vcf") + var extractIntervals : VCFExtractIntervals = new VCFExtractIntervals(combineVCFs.outVCF,swapExt(combineVCFs.outVCF,".vcf",".intervals.list"),true) + addAll(getVariantAlleles) + add(combineVCFs,extractIntervals) + + trait CalcLikelihoodArgs extends UGCalcLikelihoods { + this.reference_sequence = batchMerge.ref + this.max_mismatches_in_40bp_window = Some(batchMerge.mmb) + this.min_base_quality_score = Some(batchMerge.mbq) + this.min_mapping_quality_score = Some(batchMerge.mmq) + if ( batchMerge.baq >= 0 ) { + this.baqGapOpenPenalty = Some(batchMerge.baq) + this.baq = Some(BAQ.CalculationMode.CALCULATE_AS_NECESSARY) + } + this.intervals :+= extractIntervals.listOut + this.alleleVCF = combineVCFs.outVCF + this.output_all_callable_bases = true + this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") + this.memoryLimit = Some(4) + this.scatterCount = 60 + } + + def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = { + var ugcl = new UGCalcLikelihoods with CalcLikelihoodArgs + ugcl.input_file ++= bams._1 + ugcl.out = new File("MBatch%d.likelihoods.vcf".format(bams._2)) + return ugcl + } + + var calcs: List[UGCalcLikelihoods] = bams.grouped(20).toList.zipWithIndex.map(u => newUGCL(u)) + addAll(calcs) + + trait CallVariantsArgs extends UGCallVariants { + this.output_all_callable_bases = true + this.reference_sequence = batchMerge.ref + this.intervals :+= extractIntervals.listOut + this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar") + this.scatterCount = 30 + this.memoryLimit = Some(8) + } + + var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs + cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) ) + cVars.out = batchOut + add(cVars) } def extractFileEntries(in: File): List[File] = { diff --git a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala index ffb4476d5..c072d219a 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala @@ -14,8 +14,8 @@ class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extend @Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly def commandLine = { - if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) - else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) + if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) + else "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath) } } \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 49d070e75..398fc37fb 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -24,7 +24,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e def vcf2int( vcfLine: String ) : Unit = { var spline = vcfLine.split("\t") - if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || keepFilters) ) { + if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || spline(6).equals(".") || keepFilters) ) { out.print("%s:%s%n".format(spline(0),spline(1))) } } diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala index f4d64d12d..da69761a9 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala @@ -16,17 +16,26 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction { def lineMap( line: String ) : String = { if ( line.startsWith("##") ) { return line } - val spline = line.split("\t",9) + var spline = line.split("\t",9) if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) } if ( spline(6) == "PASS" || keepFilters ) { - if ( ! keepInfo ) { - spline(7) = "." + var buf = new StringBuffer(spline.slice(0,5).reduceLeft(_ + "\t" + _ )) + if ( keepQual ) { + buf.append("\t%s".format(spline(5))) + } else { + buf.append("\t.") } - if ( ! keepQual ) { - spline(5) = "." + + buf.append("\t.") + + if ( keepInfo ) { + buf.append("\t%s".format(spline(7))) + } else { + buf.append("\t.") } - return spline.slice(0,8).reduceLeft( _ + "\t" + _ ) + + return buf.toString } return "" diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala index 94b79454f..c193b63e8 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala @@ -66,9 +66,13 @@ class VCFSimpleMerge extends InProcessFunction { val glp : GenomeLocParser = new GenomeLocParser(ssd) + var last = "" while ( ! xrls.forall( u => ! u.hasNext ) ) { val first = xrls.filter( u => u.hasNext).reduceLeft( (a,b) => if ( genomeLoc(a,glp).isBefore(genomeLoc(b,glp))) a else b ) - w.println(first.next) + if ( ! first.peek.equals(last) ) { + w.println(first.peek) + } + last = first.next } w.close diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala index cf845a1b5..a614954a4 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala @@ -2,11 +2,14 @@ package org.broadinstitute.sting.queue.pipeline import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ import java.io.File +import scala.collection.JavaConversions._ import org.broadinstitute.sting.datasources.pipeline.Pipeline import org.broadinstitute.sting.gatk.DownsampleType import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.utils.text.XReadLines +import scala.collection.mutable.HashMap class VariantCalling(attribs: Pipeline,gatkJar: File) { vc => @@ -370,3 +373,48 @@ class VariantCalling(attribs: Pipeline,gatkJar: File) { new File(file.getName.stripSuffix(oldExtension) + newExtension) } + +object VariantCalling { + + /** + * @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line + * @Doc - Constructs a UG object based on the settings in the header of the input VCF + * @Returns an instance of the UG object with all information propagated + */ + def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = { + var myUG : UnifiedGenotyper = new UnifiedGenotyper + var xrl : XReadLines = new XReadLines(vcf) + val header_limit = 1000 + var line = "" + var line_no = 1 + while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) { + line = xrl.next + line_no += 1 + } + var ugLine : String = line + if ( ! ugLine.startsWith("##UnifiedGenotyper") ) { + return myUG // nothing to add + } + + var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ") + var ugMap = new HashMap[String,String] + def addEntry( s : String ) : Unit = { + val sps = s.split("=") + val k = sps(0) + val v = sps(1) + val kv = (k,v) + ugMap += kv + } + ugTokens.foreach( addEntry(_) ) + + myUG.reference_sequence = new File(ugMap("reference_sequence")) + myUG.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq"))) + myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble) + myUG.DBSNP = new File(ugMap("DBSNP")) + + // todo -- more args + + return myUG + + } +}