From 27d4b317fc0bb16d4a13b83a3be5a0e736c9d081 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 16 Jun 2011 12:56:04 +0000 Subject: [PATCH] Simple program that calls indels in CEU trio exomes and WGS can compared the results. Overall the indel calls really look good to me, given reasonably good input BAM files. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6006 348d0f76-0448-11de-a6fe-93d51630548a --- analysis/depristo/genotypeAccuracy/commands.R | 2 +- .../depristo/IndelCallerEvaluation.scala | 113 ++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100755 scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala diff --git a/analysis/depristo/genotypeAccuracy/commands.R b/analysis/depristo/genotypeAccuracy/commands.R index 73cd2c372..4c49c273d 100644 --- a/analysis/depristo/genotypeAccuracy/commands.R +++ b/analysis/depristo/genotypeAccuracy/commands.R @@ -15,7 +15,7 @@ if ( HAVE_RAW_DATA ) { + geom_abline(slope=1, linetype=2)) # + geom_smooth(se=T, size=1.5, aes(weight=Sum))) } else { - eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T) + eByComp = read.table("~/Dropbox/GSA members/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T) } #print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD")) diff --git a/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala b/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala new file mode 100755 index 000000000..30c577806 --- /dev/null +++ b/scala/qscript/oneoffs/depristo/IndelCallerEvaluation.scala @@ -0,0 +1,113 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction + +class IndelCallerEvaluation extends QScript { + val BUNDLE = "/humgen/gsa-hpprojects/GATK/bundle/current" + + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File(BUNDLE + "/b37/human_g1k_v37.fasta") + + @Argument(shortName = "bam", doc="BAM", required=true) + val bams: List[File] = null; + + @Argument(shortName = "intervals", doc="intervals", required=false) + val myIntervals: String = null; + + @Argument(shortName = "dcov", doc="dcov", required=false) + val DCOV: Int = 250; + + val dbSNP: File = new File(BUNDLE + "/b37/dbsnp_132.b37.vcf") + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.reference_sequence = referenceFile; + this.memoryLimit = 4 + + if ( intervals != null ) + this.intervalsString = List(myIntervals); + } + + trait CoFoJa extends JavaCommandLineFunction { + override def javaOpts = super.javaOpts // + " -javaagent:lib/cofoja.jar" + } + + def processOne(bam: File, gsaProduction: Boolean): File = { + val rawVCF = new Call(bam, gsaProduction) + add(rawVCF) + + val filterIndels = new FilterIndels(rawVCF.out) + add(filterIndels) + + // create a variant eval for us + add(new Eval(filterIndels.out)) + return filterIndels.out + } + + def script = { + for ( gsaProduction <- List(true, false)) { + val vcfs = bams.map(processOne(_, gsaProduction)) + + val combineCalls = new CombineVariants with UNIVERSAL_GATK_ARGS + for ( vcf <- vcfs ) + combineCalls.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + + combineCalls.filteredrecordsmergetype = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED + combineCalls.out = "combined" + productionString(gsaProduction) + ".vcf" + add(combineCalls) + + add(new ToTable(combineCalls.out)) + } + } + + class FilterIndels(@Input vcf: File) extends VariantFiltration with UNIVERSAL_GATK_ARGS { + this.variantVCF = vcf + this.filterName = List("Indel_QUAL", "Indel_SB", "Indel_QD") + this.filterExpression = List("\"QUAL<30.0\"", "\"SB>-1.0\"", "\"QD<2.0\"") + this.out = swapExt(vcf,".vcf",".filtered.vcf") + } + + class ToTable(@Input vcf: File) extends VariantsToTable with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("variant", "VCF", vcf) + this.fields = List("FILTER", "set") + this.out = swapExt(vcf,".vcf",".table") + this.raw = true + } + + class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("eval", "VCF", vcf) + if ( dbSNP.exists() ) + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.doNotUseAllStandardStratifications = true + this.doNotUseAllStandardModules = true + this.evalModule = List("CountVariants", "IndelStatistics", "CompOverlap") + this.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter", "JexlExpression") + this.out = swapExt(vcf,".vcf",".eval") + } + + def productionString(gsaProduction: Boolean): String = { + return if ( gsaProduction ) ".prod" else ".expt" + } + + class Call(@Input(doc="foo") bam: File, gsaProduction: Boolean) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { + @Output(doc="foo") var outVCF: File = swapExt(bam,".bam", productionString(gsaProduction) + ".indels.vcf") + this.input_file = List(bam) + this.stand_call_conf = 50.0 + this.stand_emit_conf = 50.0 + this.dcov = DCOV; + this.o = outVCF + + this.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL + this.GSA_PRODUCTION_ONLY = gsaProduction + + if ( dbSNP.exists() ) + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + } +} +