// gatk-3.8/scala/qscript/playground/BatchMerge.q
//
// 117 lines
// 5.4 KiB
// Plaintext
// Executable File

import java.io.{FileReader, BufferedReader}
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.walkers.genotyper.{GenotypeLikelihoodsCalculationModel, UnifiedGenotyperEngine}
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals}
import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling}
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.baq.BAQ
import org.broadinstitute.sting.utils.text.XReadLines
import org.broadinstitute.sting.utils.yaml.YamlUtils
/**
 * Queue script that re-genotypes a set of BAMs at the union of sites found in
 * a set of input VCFs.
 *
 * Stages, in the order they are added to the pipeline:
 *  1. one VCFExtractSites job per input VCF (keepFilters/keepInfo/keepQual off),
 *  2. a VCFSimpleMerge of those site files, plus a VCFExtractIntervals job
 *     that writes an interval list from the merged file,
 *  3. UGCalcLikelihoods over the BAMs in groups of 20, restricted to those
 *     intervals and genotyping the merged alleles,
 *  4. a single UGCallVariants over all likelihood files, written to `batchOut`,
 *  5. a CombineVariants over the original input VCFs.
 */
class batchMergePipeline extends QScript {
  // Self alias: the argument traits below read script fields through it, which
  // matters where the walker has a field of the same name (e.g. `baq`).
  batchMerge =>

  @Argument(doc="VCF list", shortName="vcfs") var vcfList: File = _
  @Argument(doc="bam list", shortName="bams") var bamList: File = _
  @Argument(doc="sting dir", shortName="sting") var stingDir: String = _
  @Argument(doc="reference file", shortName="ref") var ref: File = _
  @Argument(doc="batched output", shortName="batch") var batchOut: File = _
  //@Argument(doc="read UG settings from header",shortName="ugh") var ugSettingsFromHeader : Boolean = false
  @Hidden @Argument(doc="Min base q", shortName="mbq", required=false) var mbq: Int = 20
  @Hidden @Argument(doc="Min map q", shortName="mmq", required=false) var mmq: Int = 20
  @Hidden @Argument(doc="Max mismatching bases", shortName="mmb", required=false) var mmb: Int = 3
  @Hidden @Argument(doc="baq gap open penalty, using sets baq to calc when necessary", shortName="baqp", required=false) var baq: Int = -1
  @Hidden @Argument(doc="VCFs are indels", shortName="indel") var indelMode: Boolean = false

  /** Builds every job of the pipeline and registers it with Queue. */
  def script = {
    val inputVcfs: List[File] = extractFileEntries(vcfList)
    val inputBams: List[File] = extractFileEntries(bamList)

    // Fresh File pointing at the GATK jar under the sting directory.
    def stingJar: File = new File(stingDir + "/dist/GenomeAnalysisTK.jar")

    // --- Stage 1: per-VCF site extraction ----------------------------------
    trait ExtractArgs extends VCFExtractSites {
      this.keepQual = false
      this.keepInfo = false
      this.keepFilters = false
    }
    val siteExtractors: List[VCFExtractSites] = inputVcfs.map { vcf =>
      // Site files are named after the input VCF and placed in batchOut's parent directory.
      val sitesOut = swapExt(batchOut.getParent, vcf, ".vcf", ".alleles.vcf")
      new VCFExtractSites(vcf, sitesOut) with ExtractArgs
    }

    // --- Stage 2: merge the site files and derive the interval list --------
    val siteMerge = new VCFSimpleMerge
    siteMerge.outVCF = swapExt(batchOut, ".vcf", ".pf.alleles.vcf")
    siteMerge.fai = new File(ref.getAbsolutePath + ".fai")
    siteMerge.vcfs = siteExtractors.map(_.outVCF)

    val siteIntervals = new VCFExtractIntervals(
      siteMerge.outVCF,
      swapExt(siteMerge.outVCF, ".vcf", ".intervals.list"),
      true)

    addAll(siteExtractors)
    add(siteMerge, siteIntervals)

    // --- Stage 3: batched likelihood calculation ---------------------------
    trait CalcLikelihoodArgs extends UGCalcLikelihoods {
      this.jarFile = stingJar
      this.memoryLimit = 4
      this.scatterCount = 60

      this.reference_sequence = batchMerge.ref
      this.intervals :+= siteIntervals.listOut
      this.alleleVCF = siteMerge.outVCF

      this.max_mismatches_in_40bp_window = batchMerge.mmb
      this.min_base_quality_score = batchMerge.mbq
      this.min_mapping_quality_score = batchMerge.mmq
      // The default of -1 leaves the walker's BAQ settings untouched.
      if ( batchMerge.baq >= 0 ) {
        this.baqGapOpenPenalty = batchMerge.baq
        this.baq = BAQ.CalculationMode.CALCULATE_AS_NECESSARY
      }

      this.output_mode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES
      this.genotyping_mode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
      if ( batchMerge.indelMode ) {
        this.genotype_likelihoods_model = GenotypeLikelihoodsCalculationModel.Model.DINDEL
      }
    }

    // One job per group of up to 20 BAMs; the group index names the output file.
    val calcs: List[UGCalcLikelihoods] =
      inputBams.grouped(20).toList.zipWithIndex.map { case (bamBatch, batchIndex) =>
        val likelihoods: UGCalcLikelihoods = new UGCalcLikelihoods with CalcLikelihoodArgs
        likelihoods.input_file ++= bamBatch
        likelihoods.out = new File("MBatch%d.likelihoods.vcf".format(batchIndex))
        likelihoods
      }
    addAll(calcs)

    // --- Stage 4: call variants from all likelihood files ------------------
    trait CallVariantsArgs extends UGCallVariants {
      this.jarFile = stingJar
      this.memoryLimit = 8
      this.scatterCount = 30

      this.reference_sequence = batchMerge.ref
      this.intervals :+= siteIntervals.listOut

      this.output_mode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES
      this.genotyping_mode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
      if ( batchMerge.indelMode ) {
        this.genotype_likelihoods_model = GenotypeLikelihoodsCalculationModel.Model.DINDEL
      }
    }
    val caller: UGCallVariants = new UGCallVariants with CallVariantsArgs
    caller.rodBind ++= calcs.map { calc =>
      // Binding name is "variant" followed by the likelihood file name without ".vcf".
      val bindingName = "variant" + calc.out.getName.replace(".vcf", "")
      new RodBind(bindingName, "vcf", calc.out)
    }
    caller.out = batchOut
    add(caller)

    // --- Stage 5: combine the original input VCFs --------------------------
    trait CombineVariantsArgs extends CombineVariants {
      this.jarFile = stingJar
      this.memoryLimit = 4
      this.scatterCount = 10

      this.reference_sequence = batchMerge.ref
      this.intervals :+= siteIntervals.listOut
    }
    val combiner: CombineVariants = new CombineVariants with CombineVariantsArgs
    combiner.out = swapExt(batchOut, ".vcf", ".variant.combined.vcf")
    combiner.rodBind ++= inputVcfs.map(vcf => new RodBind(vcf.getName, "vcf", vcf))
    add(combiner)
  }

  /**
   * Reads a list file and turns each of its lines into a File.
   *
   * @param in text file with one path per line
   * @return the paths, in file order, wrapped as File objects
   */
  def extractFileEntries(in: File): List[File] = {
    val paths: List[String] = new XReadLines(in).readLines.toList
    paths.map(path => new File(path))
  }
}