Major revision of the batch merge script. All sites are now used, hooks for some UG settings, no longer reliant on the pipeline management library (pipeline libs are probably going to go away -- nobody uses them)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5241 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-02-14 23:52:05 +00:00
parent d6e3f2eba6
commit 851b3e71f9
6 changed files with 136 additions and 13 deletions

View File

@ -1,9 +1,12 @@
import java.io.{FileReader, BufferedReader}
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals}
import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling}
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.baq.BAQ
import org.broadinstitute.sting.utils.text.XReadLines
import org.broadinstitute.sting.utils.yaml.YamlUtils
@ -15,13 +18,72 @@ class batchMergePipeline extends QScript {
@Argument(doc="sting dir",shortName="sting") var stingDir: String = _
@Argument(doc="reference file",shortName="ref") var ref: File = _
@Argument(doc="batched output",shortName="batch") var batchOut: File = _
//@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false
@Hidden @Argument(doc="Min base q",shortName="mbq",required=false) var mbq : Int = 20
@Hidden @Argument(doc="Min map q",shortName="mmq",required=false) var mmq : Int = 20
@Hidden @Argument(doc="Max mismatching bases",shortName="mmb",required=false) var mmb : Int = 3
@Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary",shortName="baqp",required=false) var baq : Int = -1
def script = {
var vcfs : List[File] = extractFileEntries(vcfList)
var bams : List[File] = extractFileEntries(bamList)
var pmLib = new ProjectManagement(stingDir)
addAll(pmLib.MergeBatches(vcfs,bams,batchOut,ref,20))
trait ExtractArgs extends VCFExtractSites {
this.keepFilters = true
this.keepInfo = false
this.keepQual = false
}
var getVariantAlleles : List[VCFExtractSites] = vcfs.map( u => new VCFExtractSites(u, swapExt(batchOut.getParent,u,".vcf",".alleles.vcf")) with ExtractArgs)
var combineVCFs : VCFSimpleMerge = new VCFSimpleMerge
combineVCFs.vcfs = getVariantAlleles.map(u => u.outVCF)
combineVCFs.fai = new File(ref.getAbsolutePath+".fai")
combineVCFs.outVCF = swapExt(batchOut,".vcf",".pf.alleles.vcf")
var extractIntervals : VCFExtractIntervals = new VCFExtractIntervals(combineVCFs.outVCF,swapExt(combineVCFs.outVCF,".vcf",".intervals.list"),true)
addAll(getVariantAlleles)
add(combineVCFs,extractIntervals)
trait CalcLikelihoodArgs extends UGCalcLikelihoods {
this.reference_sequence = batchMerge.ref
this.max_mismatches_in_40bp_window = Some(batchMerge.mmb)
this.min_base_quality_score = Some(batchMerge.mbq)
this.min_mapping_quality_score = Some(batchMerge.mmq)
if ( batchMerge.baq >= 0 ) {
this.baqGapOpenPenalty = Some(batchMerge.baq)
this.baq = Some(BAQ.CalculationMode.CALCULATE_AS_NECESSARY)
}
this.intervals :+= extractIntervals.listOut
this.alleleVCF = combineVCFs.outVCF
this.output_all_callable_bases = true
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
this.memoryLimit = Some(4)
this.scatterCount = 60
}
def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = {
var ugcl = new UGCalcLikelihoods with CalcLikelihoodArgs
ugcl.input_file ++= bams._1
ugcl.out = new File("MBatch%d.likelihoods.vcf".format(bams._2))
return ugcl
}
var calcs: List[UGCalcLikelihoods] = bams.grouped(20).toList.zipWithIndex.map(u => newUGCL(u))
addAll(calcs)
trait CallVariantsArgs extends UGCallVariants {
this.output_all_callable_bases = true
this.reference_sequence = batchMerge.ref
this.intervals :+= extractIntervals.listOut
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
this.scatterCount = 30
this.memoryLimit = Some(8)
}
var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs
cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) )
cVars.out = batchOut
add(cVars)
}
def extractFileEntries(in: File): List[File] = {

View File

@ -14,8 +14,8 @@ class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extend
@Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly
def commandLine = {
if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
else "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
}
}

View File

@ -24,7 +24,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e
def vcf2int( vcfLine: String ) : Unit = {
var spline = vcfLine.split("\t")
if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || keepFilters) ) {
if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || spline(6).equals(".") || keepFilters) ) {
out.print("%s:%s%n".format(spline(0),spline(1)))
}
}

View File

@ -16,17 +16,26 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction {
def lineMap( line: String ) : String = {
if ( line.startsWith("##") ) { return line }
val spline = line.split("\t",9)
var spline = line.split("\t",9)
if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) }
if ( spline(6) == "PASS" || keepFilters ) {
if ( ! keepInfo ) {
spline(7) = "."
var buf = new StringBuffer(spline.slice(0,5).reduceLeft(_ + "\t" + _ ))
if ( keepQual ) {
buf.append("\t%s".format(spline(5)))
} else {
buf.append("\t.")
}
if ( ! keepQual ) {
spline(5) = "."
buf.append("\t.")
if ( keepInfo ) {
buf.append("\t%s".format(spline(7)))
} else {
buf.append("\t.")
}
return spline.slice(0,8).reduceLeft( _ + "\t" + _ )
return buf.toString
}
return ""

View File

@ -66,9 +66,13 @@ class VCFSimpleMerge extends InProcessFunction {
val glp : GenomeLocParser = new GenomeLocParser(ssd)
var last = ""
while ( ! xrls.forall( u => ! u.hasNext ) ) {
val first = xrls.filter( u => u.hasNext).reduceLeft( (a,b) => if ( genomeLoc(a,glp).isBefore(genomeLoc(b,glp))) a else b )
w.println(first.next)
if ( ! first.peek.equals(last) ) {
w.println(first.peek)
}
last = first.next
}
w.close

View File

@ -2,11 +2,14 @@ package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.util._
import java.io.File
import scala.collection.JavaConversions._
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.text.XReadLines
import scala.collection.mutable.HashMap
class VariantCalling(attribs: Pipeline,gatkJar: File) {
vc =>
@ -370,3 +373,48 @@ class VariantCalling(attribs: Pipeline,gatkJar: File) {
new File(file.getName.stripSuffix(oldExtension) + newExtension)
}
object VariantCalling {
/**
* @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line
* @Doc - Constructs a UG object based on the settings in the header of the input VCF
* @Returns an instance of the UG object with all information propagated
*/
def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = {
var myUG : UnifiedGenotyper = new UnifiedGenotyper
var xrl : XReadLines = new XReadLines(vcf)
val header_limit = 1000
var line = ""
var line_no = 1
while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) {
line = xrl.next
line_no += 1
}
var ugLine : String = line
if ( ! ugLine.startsWith("##UnifiedGenotyper") ) {
return myUG // nothing to add
}
var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ")
var ugMap = new HashMap[String,String]
def addEntry( s : String ) : Unit = {
val sps = s.split("=")
val k = sps(0)
val v = sps(1)
val kv = (k,v)
ugMap += kv
}
ugTokens.foreach( addEntry(_) )
myUG.reference_sequence = new File(ugMap("reference_sequence"))
myUG.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq")))
myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble)
myUG.DBSNP = new File(ugMap("DBSNP"))
// todo -- more args
return myUG
}
}