Committing the base qscript I used to make the Phase1 Project Consensus. Does per-population cleaning and simplifyBAM, and then per-analysis-panel calling with genotype given alleles. Combines info fields using the panel with max AC.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5764 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-04 20:13:26 +00:00 · 2011-05-04 20:13:26 +00:00 · b5ab2274f6
parent b4654b0f47
commit b5ab2274f6
1 changed files with 179 additions and 0 deletions
--- a/scala/qscript/oneoffs/rpoplin/Phase1ProjectConsensus.scala
+++ b/scala/qscript/oneoffs/rpoplin/Phase1ProjectConsensus.scala
@ -0,0 +1,179 @@
+import net.sf.picard.reference.FastaSequenceFile
+import org.broadinstitute.sting.datasources.pipeline.Pipeline
+import org.broadinstitute.sting.gatk.DownsampleType
+import org.broadinstitute.sting.queue.extensions.gatk._
+import org.broadinstitute.sting.queue.extensions.samtools._
+import org.broadinstitute.sting.queue.{QException, QScript}
+import collection.JavaConversions._
+import org.broadinstitute.sting.utils.yaml.YamlUtils
+import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
+
+class Phase1ProjectConsensus extends QScript {
+  qscript =>
+
+  @Input(doc="path to GATK jar", shortName="gatk", required=true)
+  var gatkJar: File = _
+
+  @Input(doc="output path", shortName="outputDir", required=true)
+  var outputDir: String = _
+
+  @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true)
+  var outputTmpDir: String = _
+
+  private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
+  private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
+  private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf"
+  val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566,155270560)
+  val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI")
+  private val alleles: String = "/humgen/1kg/processing/production_wgs_phase1/consensus/ALL.phase1.wgs.union.pass.sites.vcf"
+
+  private var pipeline: Pipeline = _
+
+  trait CommandLineGATKArgs extends CommandLineGATK {
+    this.jarFile = qscript.gatkJar
+    this.reference_sequence = qscript.reference
+    this.memoryLimit = Some(3)
+  }
+
+  class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int, val chr: String) {
+    val rawVCFsnps = new File(qscript.outputDir + "/calls/chr" + chr + "/" + baseName + "/" + baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf")
+
+    val callSnps = new UnifiedGenotyper with CommandLineGATKArgs
+    callSnps.out = rawVCFsnps
+    callSnps.dcov = 50
+    callSnps.stand_call_conf = 4.0
+    callSnps.stand_emit_conf = 4.0
+    callSnps.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE
+    callSnps.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" +baseName + ".phase1.chr" + chr + "." + jobNumber + ".raw.snps"
+    callSnps.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP
+    callSnps.genotyping_mode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
+    //callSnps.out_mode = org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES
+    callSnps.rodBind :+= RodBind("alleles", "VCF", qscript.alleles)
+    callSnps.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP )
+    callSnps.sites_only = true
+  }
+
+  class Chromosome(val inputChr: Int) {
+    var chr: String = inputChr.toString
+    if(inputChr == 23) { chr = "X" }
+
+    val combine = new CombineVariants with CommandLineGATKArgs
+    val chrVCF = new File(qscript.outputDir + "/calls/" + "combined.phase1.chr" + chr + ".raw.snps.vcf")
+    combine.out = chrVCF
+    combine.intervalsString :+= chr
+  }
+
+  def script = {
+
+    for(chr <- List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23)) {
+      val chrObject = new Chromosome(chr)
+      val basesPerJob: Int = 3000000
+      val lastBase: Int = qscript.chromosomeLength(chr - 1)
+      var start: Int = 1
+      var stop: Int = start - 1 + basesPerJob
+      if( stop > lastBase ) { stop = lastBase }
+      var jobNumber: Int = 1
+      while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) {
+        if( chr != 23 ) {
+          callThisChunk("%d:%d-%d".format(chr, start, stop), jobNumber, chr, chrObject)
+        } else {
+          callThisChunk("X:%d-%d".format(start, stop), jobNumber, chr, chrObject)
+        }
+        start += basesPerJob
+        stop += basesPerJob
+        if( stop > lastBase ) { stop = lastBase }
+        jobNumber += 1
+      }
+      add(chrObject.combine)
+    }
+  }
+
+
+  def callThisChunk(interval: String, jobNumber: Int, inputChr: Int, chrObject: Chromosome) = {
+
+    var chr: String = inputChr.toString
+    if(inputChr == 23) { chr = "X" }
+
+    val AFRadmix = new AnalysisPanel("AFR.admix", List("LWK","YRI","ASW","CLM","PUR"), jobNumber, chr)
+    val AMRadmix = new AnalysisPanel("AMR.admix", List("MXL","CLM","PUR","ASW"), jobNumber, chr)
+    val EURadmix = new AnalysisPanel("EUR.admix", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber, chr)
+    val ASNadmix = new AnalysisPanel("ASN.admix", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber, chr)
+    val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW"), jobNumber, chr)
+    val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR"), jobNumber, chr)
+    val EUR = new AnalysisPanel("EUR", List("CEU","FIN","GBR","TSI","IBS"), jobNumber, chr)
+    val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT"), jobNumber, chr)
+    val ALL = new AnalysisPanel("ALL", List("LWK","YRI","ASW","MXL","CLM","PUR","CEU","FIN","GBR","TSI","IBS","CHB","CHS","JPT"), jobNumber, chr)
+
+    val analysisPanels = List(AFR, ASN, AMR, EUR, AFRadmix, ASNadmix, AMRadmix, EURadmix) //ALL
+
+    val combine = new CombineVariants with CommandLineGATKArgs
+    val combinedChunk = new File(qscript.outputDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps.vcf")
+
+    combine.out = combinedChunk
+    combine.jobName = qscript.outputTmpDir + "/calls/chr" + chr + "/" + "combined.phase1.chr" + chr + "." + jobNumber + ".raw.snps"
+    combine.intervalsString :+= interval
+    combine.mergeInfoWithMaxAC = true
+    combine.priority = "AFR.admix,AMR.admix,EUR.admix,ASN.admix,AFR,AMR,EUR,ASN" //ALL,
+
+    for( population <- qscript.populations ) {
+      val baseTmpName: String = qscript.outputTmpDir + "/calls/chr" + chr + "/" + population + ".phase1.chr" + chr + "." + jobNumber.toString + "."
+      val bamList: File = new File("/humgen/1kg/processing/production_wgs_phase1/bam_lists/%s.list".format(population))
+      val targetIntervals: File = new File(baseTmpName + "target.intervals")
+
+      // 1.) Create cleaning targets
+      val target = new RealignerTargetCreator with CommandLineGATKArgs
+      target.memoryLimit = 4
+      target.input_file :+= bamList
+      target.intervalsString :+= interval
+      target.out = targetIntervals
+      target.mismatchFraction = 0.0
+      target.maxIntervalSize = 700
+      target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
+      target.jobName = baseTmpName + "target"
+      //target.isIntermediate = true
+      target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP )
+
+      // 2.) Clean without SW
+      val clean = new IndelRealigner with CommandLineGATKArgs
+      val cleanedBam = new File(baseTmpName + "cleaned.bam")
+      clean.memoryLimit = 6
+      clean.input_file :+= bamList
+      clean.intervalsString :+= interval
+      clean.targetIntervals = targetIntervals
+      clean.out = cleanedBam
+      clean.doNotUseSW = true
+      clean.maxInMemory = 8000
+      clean.maxReads = 100
+      clean.greedy = 80
+      clean.LOD = 7.0
+      clean.knownsOnly = true
+      clean.maxConsensuses = 10
+      clean.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF
+      clean.simplifyBAM = true
+      clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
+      clean.jobName = baseTmpName + "clean"
+      //clean.isIntermediate = true
+      clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP )
+
+      add(target, clean)
+
+      for( a <- analysisPanels ) {
+        for( p <- a.pops) {
+          if( p == population ) {
+            a.callSnps.input_file :+= cleanedBam
+          }
+        }
+      }
+    }
+
+    for( a <- analysisPanels ) {
+      a.callSnps.intervalsString :+= interval
+      if(a.baseName == "ALL") { a.callSnps.memoryLimit = 4 }
+      add(a.callSnps)
+      combine.rodBind :+= RodBind(a.baseName, "VCF", a.callSnps.out)
+    }
+
+    add(combine)
+    chrObject.combine.rodBind :+= RodBind("ALL" + jobNumber.toString, "VCF", combine.out)
+  }
+}