diff --git a/scala/qscript/rpoplin/Phase1Calling.scala b/scala/qscript/rpoplin/Phase1Calling.scala new file mode 100755 index 000000000..b6216e404 --- /dev/null +++ b/scala/qscript/rpoplin/Phase1Calling.scala @@ -0,0 +1,132 @@ +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.DownsampleType +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.samtools._ +import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType + +class Phase1Calling extends QScript { + qscript => + + @Input(doc="path to GATK jar", shortName="gatk", required=true) + var gatkJar: File = _ + + @Input(doc="the chromosome to process", shortName="chr", required=false) + var chr: Int = 20 + + @Input(doc="output path", shortName="outputDir", required=false) + var outputDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" + + @Input(doc="base output filename", shortName="baseName", required=false) + var baseName: String = "" + + @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) + var outputTmpDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" + + private val tmpDir: File = new File("/broad/shptmp/rpoplin/tmp/") + private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") + private val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" + private val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/AFR.dindel_august_release.20110110.sites.vcf.gz" + private val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/ASN.dindel_august_release.20110110.sites.vcf.gz" + private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" + private val dindelMask: String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed" + val hapmap = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + val g1k = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf" + val omni = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/764samples.deduped.b37.annot.vcf" + val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) + val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","JPT","LWK","MXL","PUR","TSI","YRI") + //val populations = List("EUR","AMR","ASN","AFR") + //val populations = List("FIN", "LWK") + private val intervals: String = "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.500Kb.hg19.intervals" + //val populations = List("ZZZ") // small set used for debugging + + private var pipeline: Pipeline = _ + + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = qscript.gatkJar + this.reference_sequence = qscript.reference + this.memoryLimit = Some(3) + this.jobTempDir = qscript.tmpDir + this.DBSNP = qscript.dbSNP + } + + def script = { + callThisChunk() // using scatter/gather capabilities of Queue so no need to for loop over 1Mb chunks of the chromosome + } + + def callThisChunk() = { + + val interval = "%d".format(qscript.chr) + for( population <- qscript.populations ) { + val baseName: String = qscript.outputDir + "/" + population + ".phase1.chr" + qscript.chr.toString + val bamList: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.bam.lists/%s.chr%d.list".format(population, qscript.chr)) + + val rawCalls = new File(baseName + ".raw.vcf") + val filteredCalls = new File(baseName + ".filtered.vcf") + val clusterFile = new File(baseName + ".omni.clusters") + val recalibratedCalls = new File(baseName + ".recal.vcf") + val tranchesFile = new File(baseName + ".ts.omni.tranches") + + var call = new UnifiedGenotyper with CommandLineGATKArgs + call.intervalsString ++= List(qscript.intervals) + call.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 + call.dcov = Some( 50 ) + call.stand_call_conf = Some( 4.0 ) + call.stand_emit_conf = Some( 4.0 ) + call.input_file :+= bamList + call.out = rawCalls + call.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) + call.analysisName = baseName + "_UG" + + var filter = new VariantFiltration with CommandLineGATKArgs + filter.intervalsString ++= List(qscript.intervals) + filter.scatterCount = 10 + filter.variantVCF = rawCalls + filter.out = filteredCalls + filter.filterName ++= List("HARD_TO_VALIDATE") + filter.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") + filter.analysisName = baseName + "_VF" + filter.rodBind :+= RodBind("mask", "Bed", qscript.dindelMask) + filter.maskName = "InDel" + + var gvc = new GenerateVariantClusters with CommandLineGATKArgs + gvc.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) + gvc.rodBind :+= RodBind("1kg", "VCF", qscript.omni) + gvc.rodBind :+= RodBind("input", "VCF", filteredCalls ) + gvc.clusterFile = clusterFile + gvc.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") + gvc.analysisName = baseName + "_GVC" + gvc.intervalsString ++= List(qscript.intervals) + gvc.qual = Some(100) // clustering parameters to be updated soon pending new experimentation results + gvc.std = Some(4.5) + gvc.mG = Some(6) + + var vr = new VariantRecalibrator with CommandLineGATKArgs + vr.rodBind :+= RodBind("1kg", "VCF", qscript.omni) + vr.rodBind :+= RodBind("hapmap", "VCF", qscript.hapmap) + vr.rodBind :+= RodBind("truthOmni", "VCF", qscript.omni) + vr.rodBind :+= RodBind("truthHapMap", "VCF", qscript.hapmap) + vr.rodBind :+= RodBind("input", "VCF", filteredCalls ) + vr.clusterFile = clusterFile + vr.analysisName = baseName + "_VR" + vr.intervalsString ++= List(qscript.intervals) + vr.ignoreFilter ++= List("HARD_TO_VALIDATE") + vr.target_titv = Some(2.3) + vr.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY) + vr.tranche ++= List("0.1", "1.0", "2.0", "3.0", "5.0", "10.0", "100.0") + vr.out = recalibratedCalls + vr.priorDBSNP = Some(10.0) + vr.priorHapMap = Some(12.0) + vr.prior1KG = Some(12.0) + vr.tranchesFile = tranchesFile + + add(call, filter, gvc, vr) + } + + } +} \ No newline at end of file diff --git a/scala/qscript/rpoplin/Phase1Cleaning.scala b/scala/qscript/rpoplin/Phase1Cleaning.scala new file mode 100755 index 000000000..913a58f0f --- /dev/null +++ b/scala/qscript/rpoplin/Phase1Cleaning.scala @@ -0,0 +1,99 @@ +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.DownsampleType +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.samtools._ +import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType + +class Phase1Cleaning extends QScript { + qscript => + + @Input(doc="path to GATK jar", shortName="gatk", required=true) + var gatkJar: File = _ + + @Input(doc="the chromosome to process", shortName="chr", required=false) + var chr: Int = 20 + + @Input(doc="output path", shortName="outputDir", required=false) + var outputDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" + + @Input(doc="base output filename", shortName="baseName", required=false) + var baseName: String = "" + + @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false) + var outputTmpDir: String = "/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams" + + private val tmpDir: File = new File("/broad/shptmp/rpoplin/tmp/") + private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.vcf") + private val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" + private val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/AFR.dindel_august_release.20110110.sites.vcf.gz" + private val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/ASN.dindel_august_release.20110110.sites.vcf.gz" + private val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110111_august_dindel_indel_calls/EUR.dindel_august_release.20110110.sites.vcf.gz" + val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566) + val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","JPT","LWK","MXL","PUR","TSI","YRI") + //val populations = List("ZZZ") // small set used for debugging + + private var pipeline: Pipeline = _ + + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = qscript.gatkJar + this.reference_sequence = qscript.reference + this.memoryLimit = Some(2) + this.jobTempDir = qscript.tmpDir + } + + def script = { + callThisChunk() // using scatter/gather capabilities of Queue so no need to for loop over 1Mb chunks of the chromosome + } + + + def callThisChunk() = { + + val interval = "%d".format(qscript.chr) + for( population <- qscript.populations ) { + val baseTmpName: String = qscript.outputTmpDir + "/" + population + ".phase1.chr" + qscript.chr.toString + "." + val bamList: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.bam.lists/%s.chr%d.list".format(population, qscript.chr)) + val targetIntervals: File = new File("/humgen/1kg/processing/allPopulations_chr20_phase1_release/perPop.cleaned.BAQed.bams/intervals/%s.chr%d.intervals".format(population, qscript.chr)) + + // 1.) Create cleaning targets + var target = new RealignerTargetCreator with CommandLineGATKArgs + target.memoryLimit = Some(4) + target.input_file :+= bamList + target.intervalsString :+= interval + target.out = targetIntervals + target.mismatchFraction = Some(0.0) + target.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) + target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelPilotCalls) + target.rodBind :+= RodBind("indels2", "VCF", qscript.dindelAFRCalls) + target.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) + target.rodBind :+= RodBind("indels4", "VCF", qscript.dindelASNCalls) + target.jobName = baseName + population + ".target" + + // 2.) Clean without SW + var clean = new IndelRealigner with CommandLineGATKArgs + val cleanedBam = new File(baseTmpName + "cleaned.bam") + clean.memoryLimit = Some(4) + clean.input_file :+= bamList + clean.intervalsString :+= interval + clean.targetIntervals = targetIntervals + clean.out = cleanedBam + clean.doNotUseSW = true + clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) + clean.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP) + clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelPilotCalls) + clean.rodBind :+= RodBind("indels2", "VCF", qscript.dindelAFRCalls) + clean.rodBind :+= RodBind("indels3", "VCF", qscript.dindelEURCalls) + clean.rodBind :+= RodBind("indels4", "VCF", qscript.dindelASNCalls) + clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true + clean.jobName = baseName + population + ".clean" + + add(target, clean) + } + + } +} \ No newline at end of file