By popular demand I'm adding the qscript I used to do the 660 bamfile 1000G calling for ASHG. It does cleaning, BAQing, and merging in 3mb chunks genome-wide then calls SNPs on those temporary bams.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4866 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-12-17 18:49:03 +00:00
parent 4a54f3f230
commit 095fc1922a
1 changed files with 192 additions and 0 deletions

View File

@ -0,0 +1,192 @@
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class ASHGcalling extends QScript {
qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true)
var gatkJar: File = _
@Input(doc="the chromosome to process", shortName="chr", required=true)
var chr: Int = _
@Input(doc="output path", shortName="outputDir", required=false)
var outputDir: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/"
@Input(doc="base output filename", shortName="baseName", required=false)
var baseName: String = "ALL.august"
@Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false)
var outputTmpDir: String = "/humgen/gsa-hpprojects/august_cleaned_bams"
private val tmpDir: File = new File("/broad/shptmp/rpoplin/")
private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
private val targetIntervals: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/knownIndels.intervals")
private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed"
val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566)
val populations = List("YRI","LWK","ASW","PUR","CEU","TSI","GBR","FIN","MXL","CHB","CHS","JPT")
private var pipeline: Pipeline = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.gatkJar
this.reference_sequence = qscript.reference
this.memoryLimit = Some(2)
this.DBSNP = qscript.dbSNP
this.jobTempDir = qscript.tmpDir
}
class SamtoolsBaqFunction extends CommandLineFunction {
@Input(doc="The input BAM file") var in_bam: File = _
@Output(doc="The output BAM file") var out_bam: File = _
def commandLine = "/humgen/gsa-scr1/rpoplin/samtools/samtools calmd -br %s %s > %s".format(in_bam.getAbsolutePath, qscript.reference, out_bam.getAbsolutePath)
}
class DeleteMeFunction extends CommandLineFunction {
@Input(doc="The file to be deleted") var me: File = _
@Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _
def commandLine = "rm -f %s".format(me.getAbsolutePath)
}
class DeleteMeAllFunction extends CommandLineFunction {
@Input(doc="The file to be deleted") var me: File = _
@Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _
def commandLine = "rm -f %s*".format(me.getAbsolutePath)
}
def script = {
val basesPerJob: Int = 3000000
val lastBase: Int = qscript.chromosomeLength(qscript.chr - 1)
var start: Int = 1
var stop: Int = start - 1 + basesPerJob
if( stop > lastBase ) { stop = lastBase }
var jobNumber: Int = 1
while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) {
callThisChunk("%d:%d-%d".format(qscript.chr, start, stop), jobNumber)
start += basesPerJob
stop += basesPerJob
if( stop > lastBase ) { stop = lastBase }
jobNumber += 1
}
/* CombineVariants parses the 800+ genotypes per record and is way too slow. Combine the vcf files together using grep, cat, and sortByRef.pl outside of Queue
combineVariants = new CombineVariants with CommandLineGATKArgs
combineVariants.rodBind = vcfChunks
combineVariants.out = new TaggedFile(qscript.baseName + ".chr" + qscript.chr.toString + ".filtered.vcf", "vcf")
combineVariants.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
combineVariants.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED)
combineVariants.setKey = "null"
add(combineVariants)
*/
}
def callThisChunk(interval: String, jobNumber: Int) = {
val baseName: String = qscript.outputDir + "/chr" + qscript.chr.toString + "/" + qscript.baseName + ".chr" + qscript.chr.toString + "." + jobNumber.toString +"."
var call = new UnifiedGenotyperV2 with CommandLineGATKArgs
val rawCalls = new File(baseName + "raw.vcf")
for( population <- qscript.populations ) {
val baseTmpName: String = qscript.outputTmpDir + "/chr" + qscript.chr.toString + "/" + population + ".august.chr" + qscript.chr.toString + "." + jobNumber.toString +"."
val bamList: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/%s.chr%d.bam.list".format(population, qscript.chr))
// 1.) Clean at known indels
var clean = new IndelRealigner with CommandLineGATKArgs
val cleanedBam = new File(baseTmpName + "cleaned.bam")
clean.memoryLimit = Some(4)
clean.input_file :+= bamList
clean.intervalsString :+= interval
clean.targetIntervals = qscript.targetIntervals
clean.out = cleanedBam
clean.rodBind :+= RodBind("indels", "VCF", qscript.dindelCalls)
clean.knownsOnly = true
clean.LOD = Some(1.0)
clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
clean.compress = Some(2)
clean.jobName = baseName + population + ".clean"
//clean.stripBam = true
//clean.fileSystemUsage = "indium"
// 2.) Apply BAQ calculation
var baq = new SamtoolsBaqFunction
val baqedBam = new File(baseTmpName + "cleaned.baq.bam")
baq.memoryLimit = Some(4)
baq.in_bam = cleanedBam
baq.out_bam = baqedBam
baq.jobName = baseName + population + ".baq"
//baq.fileSystemUsage = "iodine"
// 3a.) Delete cleaned bam
var deleteClean = new DeleteMeFunction
deleteClean.me = cleanedBam
deleteClean.trigger = baqedBam
deleteClean.jobName = baseName + population + ".deleteClean"
//deleteClean.fileSystemUsage = "iodine"
// 3b.) Index BAQ'ed bam
var index = new SamtoolsIndexFunction
index.bamFile = baqedBam
index.jobName = baseName + population + ".index"
//index.fileSystemUsage = "iodine"
// 5a.) Delete BAQ'ed bam and index
//var deleteBaq = new DeleteMeAllFunction
//deleteBaq.me = baqedBam
//deleteBaq.trigger = rawCalls
//deleteBaq.jobName = baseName + population + ".deleteBaq"
//deleteBaq.fileSystemUsage = "iodine"
call.input_file :+= baqedBam
//add(clean, baq, deleteClean, index, deleteBaq)
add(clean, baq, deleteClean, index)
}
// 4.) Call with UGv2
call.memoryLimit = Some(4)
call.intervalsString :+= interval
call.out = rawCalls
call.dcov = Some(50)
call.standard_min_confidence_threshold_for_calling = Some(50)
call.standard_min_confidence_threshold_for_emitting = Some(30)
call.min_mapping_quality_score = Some(20)
call.min_base_quality_score = Some(20)
call.pnrm = Some(org.broadinstitute.sting.playground.gatk.walkers.genotyper.AlleleFrequencyCalculationModel.Model.GRID_SEARCH)
call.jobName = baseName + "call"
//call.fileSystemUsage = "iodine"
// 5b.) Filter near indels and HARD_TO_VALIDATE
var filter = new VariantFiltration with CommandLineGATKArgs
val filteredCalls = new File(baseName + "filtered.vcf")
filter.memoryLimit = Some(1)
filter.out = filteredCalls
filter.intervalsString :+= interval
filter.variantVCF = rawCalls
filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask)
filter.maskName = "InDel"
filter.filterName ++= List("HARD_TO_VALIDATE")
filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"")
filter.jobName = baseName + "filter"
//filter.fileSystemUsage = "indium"
// 6.) Delete raw calls and index
var deleteRawCalls = new DeleteMeAllFunction
deleteRawCalls.me = rawCalls
deleteRawCalls.trigger = filteredCalls
deleteRawCalls.jobName = baseName + "deleteRawCalls"
//deleteRawCalls.fileSystemUsage = "indium"
add(call, filter, deleteRawCalls)
}
}