Major revision of the batch merge script. All sites are now used, hooks for some UG settings, no longer reliant on the pipeline management library (pipeline libs are probably going to go away -- nobody uses them)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5241 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d6e3f2eba6
commit
851b3e71f9
|
|
@ -1,9 +1,12 @@
|
|||
import java.io.{FileReader, BufferedReader}
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals}
|
||||
import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling}
|
||||
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.baq.BAQ
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
|
||||
|
|
@ -15,13 +18,72 @@ class batchMergePipeline extends QScript {
|
|||
@Argument(doc="sting dir",shortName="sting") var stingDir: String = _
|
||||
@Argument(doc="reference file",shortName="ref") var ref: File = _
|
||||
@Argument(doc="batched output",shortName="batch") var batchOut: File = _
|
||||
//@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false
|
||||
@Hidden @Argument(doc="Min base q",shortName="mbq",required=false) var mbq : Int = 20
|
||||
@Hidden @Argument(doc="Min map q",shortName="mmq",required=false) var mmq : Int = 20
|
||||
@Hidden @Argument(doc="Max mismatching bases",shortName="mmb",required=false) var mmb : Int = 3
|
||||
@Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary",shortName="baqp",required=false) var baq : Int = -1
|
||||
|
||||
def script = {
|
||||
|
||||
var vcfs : List[File] = extractFileEntries(vcfList)
|
||||
var bams : List[File] = extractFileEntries(bamList)
|
||||
var pmLib = new ProjectManagement(stingDir)
|
||||
addAll(pmLib.MergeBatches(vcfs,bams,batchOut,ref,20))
|
||||
|
||||
trait ExtractArgs extends VCFExtractSites {
|
||||
this.keepFilters = true
|
||||
this.keepInfo = false
|
||||
this.keepQual = false
|
||||
}
|
||||
|
||||
var getVariantAlleles : List[VCFExtractSites] = vcfs.map( u => new VCFExtractSites(u, swapExt(batchOut.getParent,u,".vcf",".alleles.vcf")) with ExtractArgs)
|
||||
var combineVCFs : VCFSimpleMerge = new VCFSimpleMerge
|
||||
combineVCFs.vcfs = getVariantAlleles.map(u => u.outVCF)
|
||||
combineVCFs.fai = new File(ref.getAbsolutePath+".fai")
|
||||
combineVCFs.outVCF = swapExt(batchOut,".vcf",".pf.alleles.vcf")
|
||||
var extractIntervals : VCFExtractIntervals = new VCFExtractIntervals(combineVCFs.outVCF,swapExt(combineVCFs.outVCF,".vcf",".intervals.list"),true)
|
||||
addAll(getVariantAlleles)
|
||||
add(combineVCFs,extractIntervals)
|
||||
|
||||
trait CalcLikelihoodArgs extends UGCalcLikelihoods {
|
||||
this.reference_sequence = batchMerge.ref
|
||||
this.max_mismatches_in_40bp_window = Some(batchMerge.mmb)
|
||||
this.min_base_quality_score = Some(batchMerge.mbq)
|
||||
this.min_mapping_quality_score = Some(batchMerge.mmq)
|
||||
if ( batchMerge.baq >= 0 ) {
|
||||
this.baqGapOpenPenalty = Some(batchMerge.baq)
|
||||
this.baq = Some(BAQ.CalculationMode.CALCULATE_AS_NECESSARY)
|
||||
}
|
||||
this.intervals :+= extractIntervals.listOut
|
||||
this.alleleVCF = combineVCFs.outVCF
|
||||
this.output_all_callable_bases = true
|
||||
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
|
||||
this.memoryLimit = Some(4)
|
||||
this.scatterCount = 60
|
||||
}
|
||||
|
||||
def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = {
|
||||
var ugcl = new UGCalcLikelihoods with CalcLikelihoodArgs
|
||||
ugcl.input_file ++= bams._1
|
||||
ugcl.out = new File("MBatch%d.likelihoods.vcf".format(bams._2))
|
||||
return ugcl
|
||||
}
|
||||
|
||||
var calcs: List[UGCalcLikelihoods] = bams.grouped(20).toList.zipWithIndex.map(u => newUGCL(u))
|
||||
addAll(calcs)
|
||||
|
||||
trait CallVariantsArgs extends UGCallVariants {
|
||||
this.output_all_callable_bases = true
|
||||
this.reference_sequence = batchMerge.ref
|
||||
this.intervals :+= extractIntervals.listOut
|
||||
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
|
||||
this.scatterCount = 30
|
||||
this.memoryLimit = Some(8)
|
||||
}
|
||||
|
||||
var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs
|
||||
cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) )
|
||||
cVars.out = batchOut
|
||||
add(cVars)
|
||||
}
|
||||
|
||||
def extractFileEntries(in: File): List[File] = {
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@ class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extend
|
|||
@Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly
|
||||
|
||||
def commandLine = {
|
||||
if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
|
||||
else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
|
||||
if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
|
||||
else "grep -v \\\\# %s | awk '{print $1\":\"$2}' | uniq > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -24,7 +24,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e
|
|||
|
||||
def vcf2int( vcfLine: String ) : Unit = {
|
||||
var spline = vcfLine.split("\t")
|
||||
if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || keepFilters) ) {
|
||||
if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || spline(6).equals(".") || keepFilters) ) {
|
||||
out.print("%s:%s%n".format(spline(0),spline(1)))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,17 +16,26 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction {
|
|||
|
||||
def lineMap( line: String ) : String = {
|
||||
if ( line.startsWith("##") ) { return line }
|
||||
val spline = line.split("\t",9)
|
||||
var spline = line.split("\t",9)
|
||||
if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) }
|
||||
|
||||
if ( spline(6) == "PASS" || keepFilters ) {
|
||||
if ( ! keepInfo ) {
|
||||
spline(7) = "."
|
||||
var buf = new StringBuffer(spline.slice(0,5).reduceLeft(_ + "\t" + _ ))
|
||||
if ( keepQual ) {
|
||||
buf.append("\t%s".format(spline(5)))
|
||||
} else {
|
||||
buf.append("\t.")
|
||||
}
|
||||
if ( ! keepQual ) {
|
||||
spline(5) = "."
|
||||
|
||||
buf.append("\t.")
|
||||
|
||||
if ( keepInfo ) {
|
||||
buf.append("\t%s".format(spline(7)))
|
||||
} else {
|
||||
buf.append("\t.")
|
||||
}
|
||||
return spline.slice(0,8).reduceLeft( _ + "\t" + _ )
|
||||
|
||||
return buf.toString
|
||||
}
|
||||
|
||||
return ""
|
||||
|
|
|
|||
|
|
@ -66,9 +66,13 @@ class VCFSimpleMerge extends InProcessFunction {
|
|||
|
||||
val glp : GenomeLocParser = new GenomeLocParser(ssd)
|
||||
|
||||
var last = ""
|
||||
while ( ! xrls.forall( u => ! u.hasNext ) ) {
|
||||
val first = xrls.filter( u => u.hasNext).reduceLeft( (a,b) => if ( genomeLoc(a,glp).isBefore(genomeLoc(b,glp))) a else b )
|
||||
w.println(first.next)
|
||||
if ( ! first.peek.equals(last) ) {
|
||||
w.println(first.peek)
|
||||
}
|
||||
last = first.next
|
||||
}
|
||||
|
||||
w.close
|
||||
|
|
|
|||
|
|
@ -2,11 +2,14 @@ package org.broadinstitute.sting.queue.pipeline
|
|||
import org.broadinstitute.sting.commandline._
|
||||
import org.broadinstitute.sting.queue.util._
|
||||
import java.io.File
|
||||
import scala.collection.JavaConversions._
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
class VariantCalling(attribs: Pipeline,gatkJar: File) {
|
||||
vc =>
|
||||
|
|
@ -370,3 +373,48 @@ class VariantCalling(attribs: Pipeline,gatkJar: File) {
|
|||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
||||
object VariantCalling {
|
||||
|
||||
/**
|
||||
* @Input - a VCF file (presumably with ##UnifiedGenotyper=" ... line
|
||||
* @Doc - Constructs a UG object based on the settings in the header of the input VCF
|
||||
* @Returns an instance of the UG object with all information propagated
|
||||
*/
|
||||
def unifiedGenotyperFromVCF( vcf : File ) : UnifiedGenotyper = {
|
||||
var myUG : UnifiedGenotyper = new UnifiedGenotyper
|
||||
var xrl : XReadLines = new XReadLines(vcf)
|
||||
val header_limit = 1000
|
||||
var line = ""
|
||||
var line_no = 1
|
||||
while ( ! line.startsWith("##UnifiedGenotyper=") && line_no < header_limit) {
|
||||
line = xrl.next
|
||||
line_no += 1
|
||||
}
|
||||
var ugLine : String = line
|
||||
if ( ! ugLine.startsWith("##UnifiedGenotyper") ) {
|
||||
return myUG // nothing to add
|
||||
}
|
||||
|
||||
var ugTokens = ugLine.stripPrefix("##UnifiedGenotyper=\"").stripSuffix("\"").split("[^,;:] ")
|
||||
var ugMap = new HashMap[String,String]
|
||||
def addEntry( s : String ) : Unit = {
|
||||
val sps = s.split("=")
|
||||
val k = sps(0)
|
||||
val v = sps(1)
|
||||
val kv = (k,v)
|
||||
ugMap += kv
|
||||
}
|
||||
ugTokens.foreach( addEntry(_) )
|
||||
|
||||
myUG.reference_sequence = new File(ugMap("reference_sequence"))
|
||||
myUG.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.valueOf(ugMap("baq")))
|
||||
myUG.baqGapOpenPenalty = Some(ugMap("baqGapOpenPenalty").toDouble)
|
||||
myUG.DBSNP = new File(ugMap("DBSNP"))
|
||||
|
||||
// todo -- more args
|
||||
|
||||
return myUG
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue