CalibrateGenotypeLikelihoods now emits a molten data set with REF and ALT alleles, so that GL calibration can be evaluated as a function of the REF/ALT bases. DigestTable is a stand-alone Rscript that digests the multi-GB molten data table into a tiny table that shows reported vs. empirical GLs, as a function of a variety of features of the data, like REF/ALT, comp GT, eval GT, and GL itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-05-21 14:02:30 +00:00
parent 6a49e8df34
commit d77f4ebe31
3 changed files with 86 additions and 54 deletions

View File

@ -2,60 +2,27 @@ require("lattice")
require("ggplot2")
require("splines")
READ_DATA = F
if ( READ_DATA ) {
d = subset(read.table("~/Dropbox/Analysis/genotypeAccuracy/cgl.hiseq.table", header=T), rg != "ALL")
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
print(summary(d$technology))
#d = read.table("~/Desktop/broadLocal/GATK/trunk/foo", header=T)
ymax = xmax = 30
HAVE_RAW_DATA = F
if ( HAVE_RAW_DATA ) {
inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
#inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
r <- digestTable(inputDataFile)
d = r$d
eByComp = r$eByComp
countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
+ geom_abline(slope=1, linetype=2))
# + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
} else {
eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
}
#moltenCD = melt(d, id.vars=c("comp", "rg"), measure.vars=c("QofAAGivenD", "QofABGivenD", "QofBBGivenD"))
#moltenCD$log10value = round(-10*log10(1-10^moltenCD$value))
genotypeCounts <- function(x) {
#print(table(x$comp))
type = unique(x$variable)[1]
#print(type)
t = addmargins(table(x$comp))
return(t)
}
addEmpiricalPofG <- function(d) {
r = c()
#
# TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
# track is perfect. In reality the chip is at best Q30 accurate (replicate samples have
# level than this level of concordance). At low incoming confidence, we can effectively
# ignore this term but when the incoming Q is near or above Q30 this approximation clearly
# breaks down.
#
for ( i in 1:dim(d)[1] ) {
row = d[i,]
if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
r = c(r, v / row$Sum)
}
#print(length(r))
d$EmpiricalPofG = r
d$EmpiricalPofGQ = round(-10*log10(1-r))
return(d)
}
eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
#print(subset(eByComp, EmpiricalPofGQ < Inf))
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
ymax = xmax = 30
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
@ -71,4 +38,3 @@ print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~
+ geom_abline(slope=1, linetype=2)
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))

View File

@ -0,0 +1,62 @@
#!/bin/env Rscript
require("ggplot2")
args <- commandArgs(TRUE)
verbose = TRUE
inputDataFile = args[1]
onCmdLine = ! is.na(inputDataFile)
addEmpiricalPofG <- function(d) {
r = c()
#
# TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
# track is perfect. In reality the chip is at best Q30 accurate (replicate samples have
# level than this level of concordance). At low incoming confidence, we can effectively
# ignore this term but when the incoming Q is near or above Q30 this approximation clearly
# breaks down.
#
for ( i in 1:dim(d)[1] ) {
row = d[i,]
if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
r = c(r, v / row$Sum)
}
#print(length(r))
d$EmpiricalPofG = r
d$EmpiricalPofGQ = round(-10*log10(1-r))
return(d)
}
genotypeCounts <- function(x) {
type = unique(x$variable)[1]
t = addmargins(table(x$comp))
return(t)
}
digestTable <- function(inputDataFile) {
d = subset(read.table(inputDataFile, header=T), rg != "ALL")
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
print(summary(d$technology))
eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
return(list(d=d, eByComp = eByComp))
#countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
}
writeMyTable <- function(t, name) {
write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep=""))
}
if ( onCmdLine ) {
r <- digestTable(inputDataFile)
writeMyTable(r$eByComp, "eByComp")
}

View File

@ -97,6 +97,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
public static class Datum implements Comparable<Datum> {
String rgID, sample;
GenotypeLikelihoods pl;
String ref, alt;
VariantContext.Type siteType;
Genotype.Type genotypeType;
@ -107,7 +108,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
return bySample != 0 ? bySample : byRG;
}
public Datum(String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
public Datum(String ref, String alt, String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
this.ref = ref;
this.alt = alt;
this.sample = sample;
this.rgID = rgID;
this.pl = pl;
@ -196,7 +199,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
Genotype rgGT = call.getGenotype(sample);
if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
Datum d = new Datum(sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
Datum d = new Datum(vcComp.getReference().getBaseString(), vcComp.getAlternateAllele(0).getBaseString(),
sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
data.values.add(d);
}
}
@ -242,7 +246,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
public void onTraversalDone(Data data) {
// print the header
List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
List<String> fields = Arrays.asList("sample", "rg", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
out.println(Utils.join("\t", fields));
double[] counts = new double[]{1, 1, 1};
@ -260,8 +264,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
for ( int i = 0; i < pGNames.size(); i++ ) {
int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
if ( q > 1 ) { // tons of 1s, and not interesting
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
d.sample, d.rgID, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
pGNames.get(i), q);
}
}