From a1b7d283752f1e37247b15f285b7ef7b166ccaf5 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 7 Apr 2011 20:03:48 +0000 Subject: [PATCH] Initial VQSR full search script git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5591 348d0f76-0448-11de-a6fe-93d51630548a --- .../oneoffs/chartl/Exome_VQSR_FullSearch.q | 224 ++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100755 scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q diff --git a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q new file mode 100755 index 000000000..8e6c9a1b7 --- /dev/null +++ b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q @@ -0,0 +1,224 @@ +import collection.mutable.HashMap +import java.io.{PrintWriter, PrintStream} +import org.broadinstitute.sting.commandline.ArgumentSource +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction} +import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.utils.interval.IntervalSetRule +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions + +class Exome_VQSR_FullSearch extends QScript { + qScript => + + // COMMAND LINE ARGUMENTS + + // VARIABLES USED + val SCRIPT_BASE_NAME = "Exome_VQSR_Search" + val UG_CALL_THRESH = 10.0; + val VQSR_CALL_THRESH = List(10.0,20.0,30.0,40.0,50.0) + val VQSR_ANNOTATIONS_TO_USE = List(List("QD","SB"),List("QD","SB","HRun"),List("QD","SB","HaplotypeScore"),List("QD","SB","HRun","HaplotypeScore")) + val HM3_SITES = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf") + val OMNI_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") + val AXIOM_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf") + val DBSNP_129 = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") + val SENSITIVITY = (new Range(0,19)).map(u => 90.0 + 0.5*u).toList + val RECALIBRATE_TOGETHER = List(true,false) + val VQSR_HAPMAP_PRIOR = "15.0" + val VQSR_OMNI_PRIOR = "12.0" + + var VQSR_RODBINDS : Map[String,List[RodBind]] = new HashMap[String,List[RodBind]] + val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s" + val VQSR_DBSNP_TAG = "known=true,training=false,truth=false" + + + for ( tf <- List( (true,false),(false,true),(true,true)) ) { + var mrb: List[RodBind] = Nil + val ext = (if ( tf._1 ) "HT" else "HF") + (if ( tf._2 ) "OT" else "OF") + val hmSt = if ( tf._1 ) "true" else "false" + val omSt = if ( tf._2 ) "true" else "false" + mrb :+= RodBind("dbsnp","VCF",DBSNP_129,VQSR_DBSNP_TAG) + mrb :+= RodBind("HapMap3","VCF",HM3_SITES,VQSR_TAG_FT.format(hmSt,VQSR_HAPMAP_PRIOR)) + mrb :+= RodBind("Omni","VCF",OMNI_CHIP,VQSR_TAG_FT.format(omSt,VQSR_OMNI_PRIOR)) + VQSR_RODBINDS :+= (ext,mrb) + } + + val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList + val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list") + val EXPAND_INTS = 40 + val HAND_FILTERED : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf") + val GATK_JAR : File = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + + def script() { + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= INTS + this.jarFile = GATK_JAR + this.reference_sequence = REF + this.memoryLimit = Some(4) + } + + val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS") + ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out") + + if (EXPAND_INTS > 0) { + add(ei) + } + + trait ExpandedIntervals extends CommandLineGATK { + if (EXPAND_INTS > 0) { + this.intervals :+= ei.outList + } + } + + val callSNPsAndIndels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals + callSNPsAndIndels.analysisName = base+"_calls" + callSNPsAndIndels.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out") + callSNPsAndIndels.memoryLimit = Some(6) + callSNPsAndIndels.downsample_to_coverage = Some(600) + callSNPsAndIndels.input_file = bamFiles + callSNPsAndIndels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP_129) + callSNPsAndIndels.out = new File(SCRIPT_BASE_NAME+".rawCalls.vcf") + callSNPsAndIndels.stand_call_conf = Some(UG_CALL_THRESH) + + callSNPsAndIndels.scatterCount = qscript.num_var_scatter_jobs + callSNPsAndIndels.setupScatterFunction = { + case scatter: ScatterFunction => + scatter.commandDirectory = new File("SnpCalls/ScatterGather") + scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out") + } + callSNPsAndIndels.setupCloneFunction = { + case (clone: CloneFunction, index: Int) => + clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index)) + clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index)) + } + callSNPsAndIndels.setupGatherFunction = { + case (gather: GatherFunction, source: ArgumentSource) => + gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName)) + gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName)) + } + + add(callSNPsAndIndels) + + class ExtractSNPs(in: File, out: File) extends InProcessFunction { + + @Input(doc="foo") + var inputVCF : File = in + @Output(doc="foo") + var outputVCF : File = out + + def canPrint(line: String) : Boolean = { + if ( line.startsWith("#") ) { true } + val spline = line.split("\t",5) + if ( spline.apply(3).size() > 1 || spline.apply(4).size() > 1 ) { false } + true + } + + def run() { + val outWriter = new PrintWriter(new PrintStream(outputVCF)) + asScalaIterator(new XReadLines(inputVCF)).foreach(u => { + if ( canPrint(u) ) { + outWriter.println(u) + } + }) + } + } + + val extractSNPs : ExtractSNPs = new ExtractSNPs(callSNPsAndIndels.out,new File(base+".snpCalls.vcf")) + add(extractSNPs) + + def getPath(annoList: List[String], jointRecal: Boolean) : File = { + new File("VQSR/%s/%s".format( annoList.reduceLeft( _ + "." + _ ) , if(jointRecal) "together" else "separate") ) + } + + for ( annotations <- VQSR_ANNOTATIONS_TO_USE ) { + for ( recalTogether <- RECALIBRATE_TOGETHER ) { + val directory = getPath(annotations,recalTogether) + for ( call_thresh <- VQSR_CALL_THRESH ) { + var filterQual = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals + filterQual.rodBind :+= new RodBind(extractSNPs.outputVCF) + filterQual.filterExpression :+= "QUAL < %.1f".format(VQSR_CALL_THRESH) + filterQual.filterName :+= "LowQual" + filterQual.commandDirectory = directory + filterQual.out = new File(directory.getAbsolutePath+"/"+SCRIPT_BASE_NAME+".filterQual%.1f.vcf".format(VQSR_CALL_THRESH)) + add(filterQual) + for ( vqsr_rb <- VQSR_RODBINDS.iterator ) { + trait VQSR_Args extends ContrastiveRecalibrator { + this.allPoly = true + this.analysisName = "VQSR_%s_%s_%d".format( annotations.reduceLeft( _ + "." + _), if ( recalTogether ) "true" else "false", call_thresh) + this.commandDirectory = directory + this.use_annotation ++= annotations + this.tranche ++= SENSITIVITY + this.rodBind :+= RodBind("inputData","VCF",filterQual.out) + this.rodBind ++= vqsr_rb._2 + } + val nameFormat = SCRIPT_BASE_NAME+".%1f.%s.".format(call_thresh,vqsr_rb._1)+"%s." + if ( recalTogether ) { + var vqsr = new ContrastiveRecalibrator with VQSR_Args with ExpandedIntervals with CommandLineGATKArgs + vqsr.tranchesFile = new File(nameFormat.format("both")+"tranche") + vqsr.recalFile = new File(nameFormat.format("both")+"recal") + add(vqsr) + addAll(eval(vqsr)) + } else { + var exons = new ContrastiveRecalibrator with VQSR_Args with CommandLineGATKArgs + exons.tranchesFile = new File(nameFormat.format("exons")+"tranche") + exons.recalFile = new File(nameFormat.format("exons")+"recal") + var flanks = new ContrastiveRecalibrator with VQSR_Args + flanks.intervals :+= ei.outList + flanks.jarFile = GATK_JAR + flanks.memoryLimit = Some(4) + flanks.reference_sequence = REF + flanks.tranchesFile = new File(nameFormat.format("flanks")+"tranche") + flanks.recalFile = new File(nameFormat.format("flanks")+"recal") + add(exons,flanks) + addAll(eval(exons)) + addAll(eval(flanks)) + } + } + } + } + } + } + + // want to apply and eval + def eval(recal: ContrastiveRecalibrator) : List[QFunction] = { + var functions : List[QFunction] = Nil + trait ImplicitArgs extends CommandLineGATK { + this.jarFile = recal.jarFile + this.reference_sequence = recal.reference_sequence + this.commandDirectory = recal.commandDirectory + this.intervals ++= recal.intervals + } + + trait ApplyArgs extends ApplyRecalibration with ImplicitArgs { + this.tranchesFile = recal.tranchesFile + this.recalFile = recal.recalFile + } + + trait EvalArgs extends VariantEval with ImplicitArgs { + this.stratificationModule = List("Novelty") + this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance") + this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129) + this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP) + } + + for ( sens <- SENSITIVITY ) { + var cut = new ApplyRecalibration with ApplyArgs + cut.analysisName = recal.analysisName+".cut%.1f".format(sens) + val vcfExt = ".%.1f.vcf".format(sens) + cut.out = swapExt(cut.recalFile,".recal",vcfExt) + cut.ts_filter_level = Some(sens) + functions :+= cut + + var eval = new VariantEval with EvalArgs + eval.analysisName = cut.analysisName+".eval" + eval.out = swapExt(cut.out,".vcf",".eval") + eval.rodBind :+= RodBind("evalContrastive","VCF",cut.out) + functions :+= eval + } + + functions + } +} \ No newline at end of file