gatk-3.8/scala/qscript/oneoffs/chartl/ExomeVQSR.q

220 lines
11 KiB
Plaintext
Executable File

import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.function.ListWriterFunction
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
class FullCallingPipeline extends QScript {
qscript =>
@Input(doc="path to GATK jar", shortName="G")
var gatkJar: File = _
@Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels). By default is set to 20.", shortName="varScatter", required=false)
var num_var_scatter_jobs = 20
@Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false)
var expandIntervals = 50
private var pipeline: Pipeline = _
private final val picardFixMatesClass = "net.sf.picard.sam.FixMateInformation"
val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList
val DBSNP : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list")
val BASE : String = "exon_vqsr"
val handFiltered : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf")
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals :+= INTS
this.jarFile = qscript.gatkJar
this.reference_sequence = REF
this.memoryLimit = Some(4)
}
// ------------ SETUP THE PIPELINE ----------- //
def script = {
endToEnd(BASE,"cleaned")
}
def endToEnd(base: String, bamType: String) = {
val bamFiles = BAM_FILES
val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS")
ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")
if (qscript.expandIntervals > 0) {
//add(ei)
}
trait ExpandedIntervals extends CommandLineGATK {
if (qscript.expandIntervals > 0) {
this.intervals :+= ei.outList
}
}
// Call indels
val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
indels.analysisName = base + "_indels"
indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out")
indels.memoryLimit = Some(6)
indels.downsample_to_coverage = Some(600)
indels.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL
indels.input_file = bamFiles
indels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
indels.out = new File("IndelCalls", base+".indels.vcf")
indels.scatterCount = qscript.num_var_scatter_jobs
indels.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("IndelCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
}
indels.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
}
indels.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
}
// Filter indels
val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
filteredIndels.analysisName = base + "_filteredIndels"
filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out")
filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter")
filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"")
filteredIndels.variantVCF = indels.out
filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf")
// Call snps
val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
snps.analysisName = base+"_snps"
snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
snps.memoryLimit = Some(6)
snps.downsample_to_coverage = Some(600)
snps.input_file = bamFiles
snps.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
snps.out = new File("SnpCalls", base+".snps.vcf")
snps.scatterCount = qscript.num_var_scatter_jobs
snps.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
}
snps.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
}
snps.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
}
// Filter snps at indels
val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
filteredSNPs.analysisName = base+"_filteredSNPs"
filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out")
filteredSNPs.clusterWindowSize = Some(10)
filteredSNPs.clusterSize = Some(3)
filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out)
filteredSNPs.variantVCF = snps.out
filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf")
// Mako de Clusters
val cr = new ContrastiveRecalibrator with CommandLineGATKArgs with ExpandedIntervals
cr.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
cr.rodBind :+= new RodBind("dbsnp","vcf",DBSNP,"known=true,training=false,truth=false,prior=8.0")
cr.rodBind :+= new RodBind("hapmap","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"),"known=false,training=true,truth=true,prior=15.0")
cr.rodBind :+= new RodBind("omni","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"),"known=false,training=true,truth=true,prior=12.0")
cr.allPoly = true
cr.use_annotation ++= List("HaplotypeScore","SB","QD","HRun")
cr.tranches_file = new File(base+".tranche")
cr.recal_file = new File(base+".contrastive.recal.table")
cr.tranche ++= List("99.9","99.5","99.25","98.0","97.75","97.65","97.5","97.3","97.2","97.1","98.0","97.5","97.0","96.75","96.5","96.0","95.5","95.0","94.75","94.5","94.25","94.0",
"93.75","93.5","93.25","93.0","92.75","92.5","92.25","92.0","91.0","90.0")
cr.analysisName = base+"_ContrastiveRecalibrator"
cr.memoryLimit = Some(32)
cr.num_threads = Some(6)
// Apply the Recalibration
val ar = new ApplyRecalibration with CommandLineGATKArgs with ExpandedIntervals
ar.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
ar.tranches_file = cr.tranches_file
ar.recal_file = cr.recal_file
ar.ts_filter_level = Some(91.75)
ar.out = new File(base+"_contrastive_recal.91.75.vcf")
ar.memoryLimit = Some(6)
// Variant eval the standard region
val stdEval = new VariantEval with CommandLineGATKArgs
stdEval.analysisName = base+"_VariantEval"
stdEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.std.out")
stdEval.noST = true
stdEval.noEV = true
stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
stdEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
stdEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
stdEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
stdEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
stdEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
stdEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
stdEval.out = swapExt(ar.out, ".vcf", ".eval")
stdEval.num_threads = Some(6)
// Variant eval the flanking region
val flanksEval = new VariantEval with CommandLineGATKArgs
flanksEval.analysisName = base+"_VariantEval"
flanksEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.flanks.out")
flanksEval.intervals = List(ei.outList)
flanksEval.noST = true
flanksEval.noEV = true
flanksEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
flanksEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
flanksEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
flanksEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
flanksEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
flanksEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
flanksEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
flanksEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
flanksEval.out = swapExt(ar.out, ".vcf", ".flanks.eval")
flanksEval.num_threads = Some(6)
// Make the bam list
val listOfBams = new File("Resources", base +".BamFiles.list")
val writeBamList = new ListWriterFunction
writeBamList.analysisName = base + "_BamList"
writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out")
writeBamList.inputFiles = bamFiles
writeBamList.listFile = listOfBams
//add(indels, filteredIndels, snps, filteredSNPs, stdEval, writeBamList,cr,ar)
add(ar,stdEval)
if (qscript.expandIntervals > 0) {
add(flanksEval)
}
}
}