CalibrateGenotypeLikelihoods now emits a molten data set with REF and ALT alleles, so that GL calibration can be evaluated as a function of the REF/ALT bases. DigestTable is a stand-alone Rscript that digests the multi-GB molten data table into a tiny table that shows reported vs. empirical GLs, as a function of a variety of features of the data, like REF/ALT, comp GT, eval GT, and GL itself.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6a49e8df34
commit
d77f4ebe31
|
|
@ -2,60 +2,27 @@ require("lattice")
|
||||||
require("ggplot2")
|
require("ggplot2")
|
||||||
require("splines")
|
require("splines")
|
||||||
|
|
||||||
READ_DATA = F
|
ymax = xmax = 30
|
||||||
|
HAVE_RAW_DATA = F
|
||||||
if ( READ_DATA ) {
|
if ( HAVE_RAW_DATA ) {
|
||||||
d = subset(read.table("~/Dropbox/Analysis/genotypeAccuracy/cgl.hiseq.table", header=T), rg != "ALL")
|
inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
|
||||||
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
|
#inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
|
||||||
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
|
r <- digestTable(inputDataFile)
|
||||||
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
|
d = r$d
|
||||||
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
|
eByComp = r$eByComp
|
||||||
print(summary(d$technology))
|
countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
||||||
#d = read.table("~/Desktop/broadLocal/GATK/trunk/foo", header=T)
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||||
|
+ geom_abline(slope=1, linetype=2))
|
||||||
|
# + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||||
|
} else {
|
||||||
|
eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
|
||||||
}
|
}
|
||||||
|
|
||||||
#moltenCD = melt(d, id.vars=c("comp", "rg"), measure.vars=c("QofAAGivenD", "QofABGivenD", "QofBBGivenD"))
|
#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
|
||||||
#moltenCD$log10value = round(-10*log10(1-10^moltenCD$value))
|
|
||||||
|
|
||||||
genotypeCounts <- function(x) {
  # Tabulate the comparison-track genotype calls (x$comp) for one ddply
  # subgroup, with an overall "Sum" margin appended by addmargins().
  # The caller groups by (rg/technology, pGGivenDType, pGGivenD), so each
  # count row becomes one point on the reported-vs-empirical GL curve.
  # NOTE: the original also computed unique(x$variable)[1] into an unused
  # local; that dead code has been removed.
  addmargins(table(x$comp))
}
|
|
||||||
|
|
||||||
addEmpiricalPofG <- function(d) {
  # Add empirical genotype-accuracy columns to a genotype-count table.
  #
  # d: data frame with columns pGGivenDType (one of QofAAGivenD / QofABGivenD
  #    / QofBBGivenD), HOM_REF, HET, HOM_VAR and Sum (as produced by
  #    ddply(..., genotypeCounts)).
  # Returns d with two new columns:
  #   EmpiricalPofG  -- fraction of comp genotypes matching the GL category
  #   EmpiricalPofGQ -- the same, phred-scaled and rounded
  #
  # TODO -- this is a really naive estimate of the accuracy, as it assumes
  # the comp track is perfect. In reality the chip is at best Q30 accurate
  # (replicate samples have lower than this level of concordance). At low
  # incoming confidence we can effectively ignore this term, but when the
  # incoming Q is near or above Q30 this approximation clearly breaks down.
  n <- nrow(d)
  emp <- numeric(n)  # preallocate instead of growing with c() (was O(n^2))
  for (i in seq_len(n)) {  # seq_len: safe for a 0-row table (1:0 is not)
    row <- d[i, ]
    # Pick the count matching the GL category; unknown categories now yield
    # NA instead of silently reusing the previous row's value (old bug).
    v <- switch(as.character(row$pGGivenDType),
                QofAAGivenD = row$HOM_REF,
                QofABGivenD = row$HET,
                QofBBGivenD = row$HOM_VAR,
                NA_real_)
    emp[i] <- v / row$Sum
  }
  d$EmpiricalPofG <- emp
  d$EmpiricalPofGQ <- round(-10 * log10(1 - emp))
  d
}
|
|
||||||
|
|
||||||
eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
|
||||||
countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
|
|
||||||
print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
|
|
||||||
#print(subset(eByComp, EmpiricalPofGQ < Inf))
|
#print(subset(eByComp, EmpiricalPofGQ < Inf))
|
||||||
|
|
||||||
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
|
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
|
||||||
|
|
||||||
ymax = xmax = 30
|
|
||||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
|
||||||
|
|
||||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||||
|
|
@ -71,4 +38,3 @@ print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~
|
||||||
+ geom_abline(slope=1, linetype=2)
|
+ geom_abline(slope=1, linetype=2)
|
||||||
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/bin/env Rscript

# DigestTable: digests the multi-GB molten GL-calibration table emitted by
# CalibrateGenotypeLikelihoods into a small reported-vs-empirical GL summary.

require("ggplot2")

# Command-line arguments; args[1] is the path to the molten input table.
args <- commandArgs(TRUE)
verbose = TRUE

inputDataFile = args[1]
# TRUE when an input path was supplied; when FALSE the script only defines
# its functions and does no work (convenient when source()-ing interactively).
onCmdLine = ! is.na(inputDataFile)
|
||||||
|
|
||||||
|
addEmpiricalPofG <- function(d) {
  # Add empirical genotype-accuracy columns to a genotype-count table.
  #
  # d: data frame with columns pGGivenDType (one of QofAAGivenD / QofABGivenD
  #    / QofBBGivenD), HOM_REF, HET, HOM_VAR and Sum (as produced by
  #    ddply(..., genotypeCounts)).
  # Returns d with two new columns:
  #   EmpiricalPofG  -- fraction of comp genotypes matching the GL category
  #   EmpiricalPofGQ -- the same, phred-scaled and rounded
  #
  # TODO -- this is a really naive estimate of the accuracy, as it assumes
  # the comp track is perfect. In reality the chip is at best Q30 accurate
  # (replicate samples have lower than this level of concordance). At low
  # incoming confidence we can effectively ignore this term, but when the
  # incoming Q is near or above Q30 this approximation clearly breaks down.
  n <- nrow(d)
  emp <- numeric(n)  # preallocate instead of growing with c() (was O(n^2))
  for (i in seq_len(n)) {  # seq_len: safe for a 0-row table (1:0 is not)
    row <- d[i, ]
    # Pick the count matching the GL category; unknown categories now yield
    # NA instead of silently reusing the previous row's value (old bug).
    v <- switch(as.character(row$pGGivenDType),
                QofAAGivenD = row$HOM_REF,
                QofABGivenD = row$HET,
                QofBBGivenD = row$HOM_VAR,
                NA_real_)
    emp[i] <- v / row$Sum
  }
  d$EmpiricalPofG <- emp
  d$EmpiricalPofGQ <- round(-10 * log10(1 - emp))
  d
}
|
||||||
|
|
||||||
|
genotypeCounts <- function(x) {
  # Tabulate the comparison-track genotype calls (x$comp) for one ddply
  # subgroup, with an overall "Sum" margin appended by addmargins().
  # NOTE: the original also computed unique(x$variable)[1] into an unused
  # local; that dead code has been removed.
  addmargins(table(x$comp))
}
|
||||||
|
|
||||||
|
|
||||||
|
digestTable <- function(inputDataFile) {
  # Digest the molten GL table written by CalibrateGenotypeLikelihoods into a
  # compact per-read-group empirical accuracy summary.
  #
  # inputDataFile: path to the molten table (tab/space separated, headered).
  # Returns a list with:
  #   d       -- the raw table, minus the "ALL" aggregate read group
  #   eByComp -- empirical P(G) grouped by rg/technology/pGGivenDType/pGGivenD
  d <- subset(read.table(inputDataFile, header = TRUE), rg != "ALL")

  # Classify each read group into a sequencing technology by its rg-name
  # prefix. NOTE(review): factor(1, levels = <character levels>) yields NA,
  # so rows matching none of the patterns end up NA -- confirm intended.
  d$technology <- factor(1, levels = c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
  d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
  d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
  d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
  print(summary(d$technology))

  # ddply comes from plyr; genotypeCounts/addEmpiricalPofG are defined above.
  eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
  list(d = d, eByComp = eByComp)
}
|
||||||
|
|
||||||
|
writeMyTable <- function(t, name, prefix = inputDataFile) {
  # Persist a digest table next to the input data file.
  #
  # t:      the table/data frame to write.
  # name:   short tag appended to the file name.
  # prefix: output path prefix; defaults to the script-global inputDataFile
  #         (backward compatible) but can now be passed explicitly instead of
  #         relying on a global.
  # Output file is "<prefix>.<name>.tsv".
  write.table(t, file = paste(prefix, ".", name, ".tsv", sep = ""))
}
|
||||||
|
|
||||||
|
# Script entry point: runs only when an input file was supplied on the
# command line; digests it and writes the per-read-group summary next to the
# input file as "<input>.eByComp.tsv".
if ( onCmdLine ) {
  r <- digestTable(inputDataFile)
  writeMyTable(r$eByComp, "eByComp")
}
|
||||||
|
|
||||||
|
|
@ -97,6 +97,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
public static class Datum implements Comparable<Datum> {
|
public static class Datum implements Comparable<Datum> {
|
||||||
String rgID, sample;
|
String rgID, sample;
|
||||||
GenotypeLikelihoods pl;
|
GenotypeLikelihoods pl;
|
||||||
|
String ref, alt;
|
||||||
VariantContext.Type siteType;
|
VariantContext.Type siteType;
|
||||||
Genotype.Type genotypeType;
|
Genotype.Type genotypeType;
|
||||||
|
|
||||||
|
|
@ -107,7 +108,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
return bySample != 0 ? bySample : byRG;
|
return bySample != 0 ? bySample : byRG;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Datum(String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
|
public Datum(String ref, String alt, String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
|
||||||
|
this.ref = ref;
|
||||||
|
this.alt = alt;
|
||||||
this.sample = sample;
|
this.sample = sample;
|
||||||
this.rgID = rgID;
|
this.rgID = rgID;
|
||||||
this.pl = pl;
|
this.pl = pl;
|
||||||
|
|
@ -196,7 +199,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
Genotype rgGT = call.getGenotype(sample);
|
Genotype rgGT = call.getGenotype(sample);
|
||||||
|
|
||||||
if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
|
if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
|
||||||
Datum d = new Datum(sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
|
Datum d = new Datum(vcComp.getReference().getBaseString(), vcComp.getAlternateAllele(0).getBaseString(),
|
||||||
|
sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
|
||||||
data.values.add(d);
|
data.values.add(d);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -242,7 +246,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
public void onTraversalDone(Data data) {
|
public void onTraversalDone(Data data) {
|
||||||
// print the header
|
// print the header
|
||||||
List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
|
List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
|
||||||
List<String> fields = Arrays.asList("sample", "rg", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
|
List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
|
||||||
out.println(Utils.join("\t", fields));
|
out.println(Utils.join("\t", fields));
|
||||||
|
|
||||||
double[] counts = new double[]{1, 1, 1};
|
double[] counts = new double[]{1, 1, 1};
|
||||||
|
|
@ -260,8 +264,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
for ( int i = 0; i < pGNames.size(); i++ ) {
|
for ( int i = 0; i < pGNames.size(); i++ ) {
|
||||||
int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
|
int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
|
||||||
if ( q > 1 ) { // tons of 1s, and not interesting
|
if ( q > 1 ) { // tons of 1s, and not interesting
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
|
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
|
||||||
d.sample, d.rgID, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
|
d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
|
||||||
pGNames.get(i), q);
|
pGNames.get(i), q);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue