From d77f4ebe31a8f9e48165bd7ccfd3cd39f2ee25e1 Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 21 May 2011 14:02:30 +0000 Subject: [PATCH] CalibrateGenotypeLikelihoods now emits a molten data set with REF and ALT alleles, so that GL calibration can be evaluated as a function of the REF/ALT bases. DigestTable is a stand-alone Rscript that digests the multi-GB molten data table into a tiny table that shows reported vs. empirical GLs, as a function of a variety of features of the data, like REF/ALT, comp GT, eval GT, and GL itself. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5833 348d0f76-0448-11de-a6fe-93d51630548a --- analysis/depristo/genotypeAccuracy/commands.R | 64 +++++-------------- .../depristo/genotypeAccuracy/digestTable.R | 62 ++++++++++++++++++ .../walkers/CalibrateGenotypeLikelihoods.java | 14 ++-- 3 files changed, 86 insertions(+), 54 deletions(-) create mode 100644 analysis/depristo/genotypeAccuracy/digestTable.R diff --git a/analysis/depristo/genotypeAccuracy/commands.R b/analysis/depristo/genotypeAccuracy/commands.R index 4d25f6fc0..73cd2c372 100644 --- a/analysis/depristo/genotypeAccuracy/commands.R +++ b/analysis/depristo/genotypeAccuracy/commands.R @@ -2,60 +2,27 @@ require("lattice") require("ggplot2") require("splines") -READ_DATA = F - -if ( READ_DATA ) { - d = subset(read.table("~/Dropbox/Analysis/genotypeAccuracy/cgl.hiseq.table", header=T), rg != "ALL") - d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent")) - d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G" - d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper" - d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent" - print(summary(d$technology)) -#d = read.table("~/Desktop/broadLocal/GATK/trunk/foo", header=T) +ymax = xmax = 30 +HAVE_RAW_DATA = F +if ( HAVE_RAW_DATA ) { + inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table" + #inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz" + r <- digestTable(inputDataFile) + d = r$d + eByComp = r$eByComp + countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts)) + print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax)) + + geom_abline(slope=1, linetype=2)) + # + geom_smooth(se=T, size=1.5, aes(weight=Sum))) +} else { + eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T) } -#moltenCD = melt(d, id.vars=c("comp", "rg"), measure.vars=c("QofAAGivenD", "QofABGivenD", "QofBBGivenD")) -#moltenCD$log10value = round(-10*log10(1-10^moltenCD$value)) - -genotypeCounts <- function(x) { - #print(table(x$comp)) - type = unique(x$variable)[1] - #print(type) - t = addmargins(table(x$comp)) - return(t) -} - -addEmpiricalPofG <- function(d) { - r = c() - # - # TODO -- this is a really naive estimate of the accuracy, as it assumes the comp - # track is perfect. In reality the chip is at best Q30 accurate (replicate samples have - # level than this level of concordance). At low incoming confidence, we can effectively - # ignore this term but when the incoming Q is near or above Q30 this approximation clearly - # breaks down. - # - for ( i in 1:dim(d)[1] ) { - row = d[i,] - if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF - if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET - if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR - r = c(r, v / row$Sum) - } - - #print(length(r)) - d$EmpiricalPofG = r - d$EmpiricalPofGQ = round(-10*log10(1-r)) - return(d) -} - -eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts)) -countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts)) -print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD")) +#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD")) #print(subset(eByComp, EmpiricalPofGQ < Inf)) goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf) -ymax = xmax = 30 print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2)) print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax)) @@ -71,4 +38,3 @@ print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ + geom_abline(slope=1, linetype=2) + geom_smooth(se=T, size=1.5, aes(weight=Sum))) - diff --git a/analysis/depristo/genotypeAccuracy/digestTable.R b/analysis/depristo/genotypeAccuracy/digestTable.R new file mode 100644 index 000000000..1278abc47 --- /dev/null +++ b/analysis/depristo/genotypeAccuracy/digestTable.R @@ -0,0 +1,62 @@ +#!/bin/env Rscript + +require("ggplot2") + +args <- commandArgs(TRUE) +verbose = TRUE + +inputDataFile = args[1] +onCmdLine = ! is.na(inputDataFile) + +addEmpiricalPofG <- function(d) { + r = c() + # + # TODO -- this is a really naive estimate of the accuracy, as it assumes the comp + # track is perfect. In reality the chip is at best Q30 accurate (replicate samples have + # level than this level of concordance). At low incoming confidence, we can effectively + # ignore this term but when the incoming Q is near or above Q30 this approximation clearly + # breaks down. + # + for ( i in 1:dim(d)[1] ) { + row = d[i,] + if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF + if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET + if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR + r = c(r, v / row$Sum) + } + + #print(length(r)) + d$EmpiricalPofG = r + d$EmpiricalPofGQ = round(-10*log10(1-r)) + return(d) +} + +genotypeCounts <- function(x) { + type = unique(x$variable)[1] + t = addmargins(table(x$comp)) + return(t) +} + + +digestTable <- function(inputDataFile) { + d = subset(read.table(inputDataFile, header=T), rg != "ALL") + d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent")) + d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G" + d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper" + d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent" + print(summary(d$technology)) + + eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts)) + return(list(d=d, eByComp = eByComp)) + #countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts)) +} + +writeMyTable <- function(t, name) { + write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep="")) +} + +if ( onCmdLine ) { + r <- digestTable(inputDataFile) + writeMyTable(r$eByComp, "eByComp") +} + diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java index eacc7498e..db2d7fc31 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java @@ -97,6 +97,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker { String rgID, sample; GenotypeLikelihoods pl; + String ref, alt; VariantContext.Type siteType; Genotype.Type genotypeType; @@ -107,7 +108,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD"); - List fields = Arrays.asList("sample", "rg", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD"); + List fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD"); out.println(Utils.join("\t", fields)); double[] counts = new double[]{1, 1, 1}; @@ -260,8 +264,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker 1 ) { // tons of 1s, and not interesting - out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%d%n", - d.sample, d.rgID, d.siteType, d.pl.getAsString(), d.genotypeType.toString(), + out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n", + d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(), pGNames.get(i), q); } }