Initial VQSR full search script
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5591 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cb3e8aec5e
commit
a1b7d28375
|
|
@ -0,0 +1,224 @@
|
|||
import collection.mutable.HashMap
|
||||
import java.io.{PrintWriter, PrintStream}
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.function.QFunction
|
||||
import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction}
|
||||
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions
|
||||
|
||||
class Exome_VQSR_FullSearch extends QScript {
|
||||
qScript =>
|
||||
|
||||
// COMMAND LINE ARGUMENTS
|
||||
|
||||
// VARIABLES USED
|
||||
val SCRIPT_BASE_NAME = "Exome_VQSR_Search"
|
||||
val UG_CALL_THRESH = 10.0;
|
||||
val VQSR_CALL_THRESH = List(10.0,20.0,30.0,40.0,50.0)
|
||||
val VQSR_ANNOTATIONS_TO_USE = List(List("QD","SB"),List("QD","SB","HRun"),List("QD","SB","HaplotypeScore"),List("QD","SB","HRun","HaplotypeScore"))
|
||||
val HM3_SITES = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf")
|
||||
val OMNI_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
|
||||
val AXIOM_CHIP = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf")
|
||||
val DBSNP_129 = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
||||
val SENSITIVITY = (new Range(0,19)).map(u => 90.0 + 0.5*u).toList
|
||||
val RECALIBRATE_TOGETHER = List(true,false)
|
||||
val VQSR_HAPMAP_PRIOR = "15.0"
|
||||
val VQSR_OMNI_PRIOR = "12.0"
|
||||
|
||||
var VQSR_RODBINDS : Map[String,List[RodBind]] = new HashMap[String,List[RodBind]]
|
||||
val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s"
|
||||
val VQSR_DBSNP_TAG = "known=true,training=false,truth=false"
|
||||
|
||||
|
||||
for ( tf <- List( (true,false),(false,true),(true,true)) ) {
|
||||
var mrb: List[RodBind] = Nil
|
||||
val ext = (if ( tf._1 ) "HT" else "HF") + (if ( tf._2 ) "OT" else "OF")
|
||||
val hmSt = if ( tf._1 ) "true" else "false"
|
||||
val omSt = if ( tf._2 ) "true" else "false"
|
||||
mrb :+= RodBind("dbsnp","VCF",DBSNP_129,VQSR_DBSNP_TAG)
|
||||
mrb :+= RodBind("HapMap3","VCF",HM3_SITES,VQSR_TAG_FT.format(hmSt,VQSR_HAPMAP_PRIOR))
|
||||
mrb :+= RodBind("Omni","VCF",OMNI_CHIP,VQSR_TAG_FT.format(omSt,VQSR_OMNI_PRIOR))
|
||||
VQSR_RODBINDS :+= (ext,mrb)
|
||||
}
|
||||
|
||||
val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList
|
||||
val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
|
||||
val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list")
|
||||
val EXPAND_INTS = 40
|
||||
val HAND_FILTERED : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf")
|
||||
val GATK_JAR : File = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
|
||||
|
||||
def script() {
|
||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||
this.intervals :+= INTS
|
||||
this.jarFile = GATK_JAR
|
||||
this.reference_sequence = REF
|
||||
this.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS")
|
||||
ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")
|
||||
|
||||
if (EXPAND_INTS > 0) {
|
||||
add(ei)
|
||||
}
|
||||
|
||||
trait ExpandedIntervals extends CommandLineGATK {
|
||||
if (EXPAND_INTS > 0) {
|
||||
this.intervals :+= ei.outList
|
||||
}
|
||||
}
|
||||
|
||||
val callSNPsAndIndels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
|
||||
callSNPsAndIndels.analysisName = base+"_calls"
|
||||
callSNPsAndIndels.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
|
||||
callSNPsAndIndels.memoryLimit = Some(6)
|
||||
callSNPsAndIndels.downsample_to_coverage = Some(600)
|
||||
callSNPsAndIndels.input_file = bamFiles
|
||||
callSNPsAndIndels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP_129)
|
||||
callSNPsAndIndels.out = new File(SCRIPT_BASE_NAME+".rawCalls.vcf")
|
||||
callSNPsAndIndels.stand_call_conf = Some(UG_CALL_THRESH)
|
||||
|
||||
callSNPsAndIndels.scatterCount = qscript.num_var_scatter_jobs
|
||||
callSNPsAndIndels.setupScatterFunction = {
|
||||
case scatter: ScatterFunction =>
|
||||
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
|
||||
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
|
||||
}
|
||||
callSNPsAndIndels.setupCloneFunction = {
|
||||
case (clone: CloneFunction, index: Int) =>
|
||||
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
|
||||
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
|
||||
}
|
||||
callSNPsAndIndels.setupGatherFunction = {
|
||||
case (gather: GatherFunction, source: ArgumentSource) =>
|
||||
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
|
||||
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
|
||||
}
|
||||
|
||||
add(callSNPsAndIndels)
|
||||
|
||||
class ExtractSNPs(in: File, out: File) extends InProcessFunction {
|
||||
|
||||
@Input(doc="foo")
|
||||
var inputVCF : File = in
|
||||
@Output(doc="foo")
|
||||
var outputVCF : File = out
|
||||
|
||||
def canPrint(line: String) : Boolean = {
|
||||
if ( line.startsWith("#") ) { true }
|
||||
val spline = line.split("\t",5)
|
||||
if ( spline.apply(3).size() > 1 || spline.apply(4).size() > 1 ) { false }
|
||||
true
|
||||
}
|
||||
|
||||
def run() {
|
||||
val outWriter = new PrintWriter(new PrintStream(outputVCF))
|
||||
asScalaIterator(new XReadLines(inputVCF)).foreach(u => {
|
||||
if ( canPrint(u) ) {
|
||||
outWriter.println(u)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
val extractSNPs : ExtractSNPs = new ExtractSNPs(callSNPsAndIndels.out,new File(base+".snpCalls.vcf"))
|
||||
add(extractSNPs)
|
||||
|
||||
def getPath(annoList: List[String], jointRecal: Boolean) : File = {
|
||||
new File("VQSR/%s/%s".format( annoList.reduceLeft( _ + "." + _ ) , if(jointRecal) "together" else "separate") )
|
||||
}
|
||||
|
||||
for ( annotations <- VQSR_ANNOTATIONS_TO_USE ) {
|
||||
for ( recalTogether <- RECALIBRATE_TOGETHER ) {
|
||||
val directory = getPath(annotations,recalTogether)
|
||||
for ( call_thresh <- VQSR_CALL_THRESH ) {
|
||||
var filterQual = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
|
||||
filterQual.rodBind :+= new RodBind(extractSNPs.outputVCF)
|
||||
filterQual.filterExpression :+= "QUAL < %.1f".format(VQSR_CALL_THRESH)
|
||||
filterQual.filterName :+= "LowQual"
|
||||
filterQual.commandDirectory = directory
|
||||
filterQual.out = new File(directory.getAbsolutePath+"/"+SCRIPT_BASE_NAME+".filterQual%.1f.vcf".format(VQSR_CALL_THRESH))
|
||||
add(filterQual)
|
||||
for ( vqsr_rb <- VQSR_RODBINDS.iterator ) {
|
||||
trait VQSR_Args extends ContrastiveRecalibrator {
|
||||
this.allPoly = true
|
||||
this.analysisName = "VQSR_%s_%s_%d".format( annotations.reduceLeft( _ + "." + _), if ( recalTogether ) "true" else "false", call_thresh)
|
||||
this.commandDirectory = directory
|
||||
this.use_annotation ++= annotations
|
||||
this.tranche ++= SENSITIVITY
|
||||
this.rodBind :+= RodBind("inputData","VCF",filterQual.out)
|
||||
this.rodBind ++= vqsr_rb._2
|
||||
}
|
||||
val nameFormat = SCRIPT_BASE_NAME+".%1f.%s.".format(call_thresh,vqsr_rb._1)+"%s."
|
||||
if ( recalTogether ) {
|
||||
var vqsr = new ContrastiveRecalibrator with VQSR_Args with ExpandedIntervals with CommandLineGATKArgs
|
||||
vqsr.tranchesFile = new File(nameFormat.format("both")+"tranche")
|
||||
vqsr.recalFile = new File(nameFormat.format("both")+"recal")
|
||||
add(vqsr)
|
||||
addAll(eval(vqsr))
|
||||
} else {
|
||||
var exons = new ContrastiveRecalibrator with VQSR_Args with CommandLineGATKArgs
|
||||
exons.tranchesFile = new File(nameFormat.format("exons")+"tranche")
|
||||
exons.recalFile = new File(nameFormat.format("exons")+"recal")
|
||||
var flanks = new ContrastiveRecalibrator with VQSR_Args
|
||||
flanks.intervals :+= ei.outList
|
||||
flanks.jarFile = GATK_JAR
|
||||
flanks.memoryLimit = Some(4)
|
||||
flanks.reference_sequence = REF
|
||||
flanks.tranchesFile = new File(nameFormat.format("flanks")+"tranche")
|
||||
flanks.recalFile = new File(nameFormat.format("flanks")+"recal")
|
||||
add(exons,flanks)
|
||||
addAll(eval(exons))
|
||||
addAll(eval(flanks))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// want to apply and eval
|
||||
def eval(recal: ContrastiveRecalibrator) : List[QFunction] = {
|
||||
var functions : List[QFunction] = Nil
|
||||
trait ImplicitArgs extends CommandLineGATK {
|
||||
this.jarFile = recal.jarFile
|
||||
this.reference_sequence = recal.reference_sequence
|
||||
this.commandDirectory = recal.commandDirectory
|
||||
this.intervals ++= recal.intervals
|
||||
}
|
||||
|
||||
trait ApplyArgs extends ApplyRecalibration with ImplicitArgs {
|
||||
this.tranchesFile = recal.tranchesFile
|
||||
this.recalFile = recal.recalFile
|
||||
}
|
||||
|
||||
trait EvalArgs extends VariantEval with ImplicitArgs {
|
||||
this.stratificationModule = List("Novelty")
|
||||
this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance")
|
||||
this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129)
|
||||
this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP)
|
||||
}
|
||||
|
||||
for ( sens <- SENSITIVITY ) {
|
||||
var cut = new ApplyRecalibration with ApplyArgs
|
||||
cut.analysisName = recal.analysisName+".cut%.1f".format(sens)
|
||||
val vcfExt = ".%.1f.vcf".format(sens)
|
||||
cut.out = swapExt(cut.recalFile,".recal",vcfExt)
|
||||
cut.ts_filter_level = Some(sens)
|
||||
functions :+= cut
|
||||
|
||||
var eval = new VariantEval with EvalArgs
|
||||
eval.analysisName = cut.analysisName+".eval"
|
||||
eval.out = swapExt(cut.out,".vcf",".eval")
|
||||
eval.rodBind :+= RodBind("evalContrastive","VCF",cut.out)
|
||||
functions :+= eval
|
||||
}
|
||||
|
||||
functions
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue