By popular demand I'm adding the qscript I used to do the 660 bamfile 1000G calling for ASHG. It does cleaning, BAQing, and merging in 3mb chunks genome-wide then calls SNPs on those temporary bams.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4866 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a54f3f230
commit
095fc1922a
|
|
@ -0,0 +1,192 @@
|
|||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
||||
|
||||
class ASHGcalling extends QScript {
|
||||
qscript =>
|
||||
|
||||
@Input(doc="path to GATK jar", shortName="gatk", required=true)
|
||||
var gatkJar: File = _
|
||||
|
||||
@Input(doc="the chromosome to process", shortName="chr", required=true)
|
||||
var chr: Int = _
|
||||
|
||||
@Input(doc="output path", shortName="outputDir", required=false)
|
||||
var outputDir: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/"
|
||||
|
||||
@Input(doc="base output filename", shortName="baseName", required=false)
|
||||
var baseName: String = "ALL.august"
|
||||
|
||||
@Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false)
|
||||
var outputTmpDir: String = "/humgen/gsa-hpprojects/august_cleaned_bams"
|
||||
|
||||
private val tmpDir: File = new File("/broad/shptmp/rpoplin/")
|
||||
private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||
private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
|
||||
private val targetIntervals: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/knownIndels.intervals")
|
||||
private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
|
||||
private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed"
|
||||
val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566)
|
||||
val populations = List("YRI","LWK","ASW","PUR","CEU","TSI","GBR","FIN","MXL","CHB","CHS","JPT")
|
||||
|
||||
private var pipeline: Pipeline = _
|
||||
|
||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||
this.jarFile = qscript.gatkJar
|
||||
this.reference_sequence = qscript.reference
|
||||
this.memoryLimit = Some(2)
|
||||
this.DBSNP = qscript.dbSNP
|
||||
this.jobTempDir = qscript.tmpDir
|
||||
}
|
||||
|
||||
class SamtoolsBaqFunction extends CommandLineFunction {
|
||||
@Input(doc="The input BAM file") var in_bam: File = _
|
||||
@Output(doc="The output BAM file") var out_bam: File = _
|
||||
def commandLine = "/humgen/gsa-scr1/rpoplin/samtools/samtools calmd -br %s %s > %s".format(in_bam.getAbsolutePath, qscript.reference, out_bam.getAbsolutePath)
|
||||
}
|
||||
|
||||
class DeleteMeFunction extends CommandLineFunction {
|
||||
@Input(doc="The file to be deleted") var me: File = _
|
||||
@Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _
|
||||
def commandLine = "rm -f %s".format(me.getAbsolutePath)
|
||||
}
|
||||
|
||||
class DeleteMeAllFunction extends CommandLineFunction {
|
||||
@Input(doc="The file to be deleted") var me: File = _
|
||||
@Input(doc="The file which must exist before we are allowed to delete") var trigger: File = _
|
||||
def commandLine = "rm -f %s*".format(me.getAbsolutePath)
|
||||
}
|
||||
|
||||
def script = {
|
||||
val basesPerJob: Int = 3000000
|
||||
val lastBase: Int = qscript.chromosomeLength(qscript.chr - 1)
|
||||
var start: Int = 1
|
||||
var stop: Int = start - 1 + basesPerJob
|
||||
if( stop > lastBase ) { stop = lastBase }
|
||||
var jobNumber: Int = 1
|
||||
while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) {
|
||||
callThisChunk("%d:%d-%d".format(qscript.chr, start, stop), jobNumber)
|
||||
start += basesPerJob
|
||||
stop += basesPerJob
|
||||
if( stop > lastBase ) { stop = lastBase }
|
||||
jobNumber += 1
|
||||
}
|
||||
|
||||
/* CombineVariants parses the 800+ genotypes per record and is way too slow. Combine the vcf files together using grep, cat, and sortByRef.pl outside of Queue
|
||||
combineVariants = new CombineVariants with CommandLineGATKArgs
|
||||
combineVariants.rodBind = vcfChunks
|
||||
combineVariants.out = new TaggedFile(qscript.baseName + ".chr" + qscript.chr.toString + ".filtered.vcf", "vcf")
|
||||
combineVariants.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
|
||||
combineVariants.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNSORTED)
|
||||
combineVariants.setKey = "null"
|
||||
add(combineVariants)
|
||||
*/
|
||||
}
|
||||
|
||||
def callThisChunk(interval: String, jobNumber: Int) = {
|
||||
|
||||
val baseName: String = qscript.outputDir + "/chr" + qscript.chr.toString + "/" + qscript.baseName + ".chr" + qscript.chr.toString + "." + jobNumber.toString +"."
|
||||
var call = new UnifiedGenotyperV2 with CommandLineGATKArgs
|
||||
val rawCalls = new File(baseName + "raw.vcf")
|
||||
|
||||
for( population <- qscript.populations ) {
|
||||
val baseTmpName: String = qscript.outputTmpDir + "/chr" + qscript.chr.toString + "/" + population + ".august.chr" + qscript.chr.toString + "." + jobNumber.toString +"."
|
||||
val bamList: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/%s.chr%d.bam.list".format(population, qscript.chr))
|
||||
|
||||
// 1.) Clean at known indels
|
||||
var clean = new IndelRealigner with CommandLineGATKArgs
|
||||
val cleanedBam = new File(baseTmpName + "cleaned.bam")
|
||||
clean.memoryLimit = Some(4)
|
||||
clean.input_file :+= bamList
|
||||
clean.intervalsString :+= interval
|
||||
clean.targetIntervals = qscript.targetIntervals
|
||||
clean.out = cleanedBam
|
||||
clean.rodBind :+= RodBind("indels", "VCF", qscript.dindelCalls)
|
||||
clean.knownsOnly = true
|
||||
clean.LOD = Some(1.0)
|
||||
clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
|
||||
clean.compress = Some(2)
|
||||
clean.jobName = baseName + population + ".clean"
|
||||
//clean.stripBam = true
|
||||
//clean.fileSystemUsage = "indium"
|
||||
|
||||
// 2.) Apply BAQ calculation
|
||||
var baq = new SamtoolsBaqFunction
|
||||
val baqedBam = new File(baseTmpName + "cleaned.baq.bam")
|
||||
baq.memoryLimit = Some(4)
|
||||
baq.in_bam = cleanedBam
|
||||
baq.out_bam = baqedBam
|
||||
baq.jobName = baseName + population + ".baq"
|
||||
//baq.fileSystemUsage = "iodine"
|
||||
|
||||
// 3a.) Delete cleaned bam
|
||||
var deleteClean = new DeleteMeFunction
|
||||
deleteClean.me = cleanedBam
|
||||
deleteClean.trigger = baqedBam
|
||||
deleteClean.jobName = baseName + population + ".deleteClean"
|
||||
//deleteClean.fileSystemUsage = "iodine"
|
||||
|
||||
// 3b.) Index BAQ'ed bam
|
||||
var index = new SamtoolsIndexFunction
|
||||
index.bamFile = baqedBam
|
||||
index.jobName = baseName + population + ".index"
|
||||
//index.fileSystemUsage = "iodine"
|
||||
|
||||
// 5a.) Delete BAQ'ed bam and index
|
||||
//var deleteBaq = new DeleteMeAllFunction
|
||||
//deleteBaq.me = baqedBam
|
||||
//deleteBaq.trigger = rawCalls
|
||||
//deleteBaq.jobName = baseName + population + ".deleteBaq"
|
||||
//deleteBaq.fileSystemUsage = "iodine"
|
||||
|
||||
call.input_file :+= baqedBam
|
||||
|
||||
//add(clean, baq, deleteClean, index, deleteBaq)
|
||||
add(clean, baq, deleteClean, index)
|
||||
}
|
||||
|
||||
// 4.) Call with UGv2
|
||||
call.memoryLimit = Some(4)
|
||||
call.intervalsString :+= interval
|
||||
call.out = rawCalls
|
||||
call.dcov = Some(50)
|
||||
call.standard_min_confidence_threshold_for_calling = Some(50)
|
||||
call.standard_min_confidence_threshold_for_emitting = Some(30)
|
||||
call.min_mapping_quality_score = Some(20)
|
||||
call.min_base_quality_score = Some(20)
|
||||
call.pnrm = Some(org.broadinstitute.sting.playground.gatk.walkers.genotyper.AlleleFrequencyCalculationModel.Model.GRID_SEARCH)
|
||||
call.jobName = baseName + "call"
|
||||
//call.fileSystemUsage = "iodine"
|
||||
|
||||
// 5b.) Filter near indels and HARD_TO_VALIDATE
|
||||
var filter = new VariantFiltration with CommandLineGATKArgs
|
||||
val filteredCalls = new File(baseName + "filtered.vcf")
|
||||
filter.memoryLimit = Some(1)
|
||||
filter.out = filteredCalls
|
||||
filter.intervalsString :+= interval
|
||||
filter.variantVCF = rawCalls
|
||||
filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask)
|
||||
filter.maskName = "InDel"
|
||||
filter.filterName ++= List("HARD_TO_VALIDATE")
|
||||
filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"")
|
||||
filter.jobName = baseName + "filter"
|
||||
//filter.fileSystemUsage = "indium"
|
||||
|
||||
// 6.) Delete raw calls and index
|
||||
var deleteRawCalls = new DeleteMeAllFunction
|
||||
deleteRawCalls.me = rawCalls
|
||||
deleteRawCalls.trigger = filteredCalls
|
||||
deleteRawCalls.jobName = baseName + "deleteRawCalls"
|
||||
//deleteRawCalls.fileSystemUsage = "indium"
|
||||
|
||||
add(call, filter, deleteRawCalls)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue