CalibrateGenotypeLikelihoods now emits a molten data set with REF and ALT alleles, so that GL calibration can be evaluated as a function of the REF/ALT bases. DigestTable is a stand-alone Rscript that digests the multi-GB molten data table into a tiny table that shows reported vs. empirical GLs, as a function of a variety of features of the data, like REF/ALT, comp GT, eval GT, and GL itself.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6a49e8df34
commit
d77f4ebe31
|
|
@ -2,60 +2,27 @@ require("lattice")
|
||||||
require("ggplot2")
|
require("ggplot2")
|
||||||
require("splines")
|
require("splines")
|
||||||
|
|
||||||
READ_DATA = F
|
ymax = xmax = 30
|
||||||
|
HAVE_RAW_DATA = F
|
||||||
if ( READ_DATA ) {
|
if ( HAVE_RAW_DATA ) {
|
||||||
d = subset(read.table("~/Dropbox/Analysis/genotypeAccuracy/cgl.hiseq.table", header=T), rg != "ALL")
|
inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
|
||||||
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
|
#inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
|
||||||
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
|
r <- digestTable(inputDataFile)
|
||||||
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
|
d = r$d
|
||||||
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
|
eByComp = r$eByComp
|
||||||
print(summary(d$technology))
|
countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
||||||
#d = read.table("~/Desktop/broadLocal/GATK/trunk/foo", header=T)
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||||
|
+ geom_abline(slope=1, linetype=2))
|
||||||
|
# + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||||
|
} else {
|
||||||
|
eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
|
||||||
}
|
}
|
||||||
|
|
||||||
#moltenCD = melt(d, id.vars=c("comp", "rg"), measure.vars=c("QofAAGivenD", "QofABGivenD", "QofBBGivenD"))
|
#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
|
||||||
#moltenCD$log10value = round(-10*log10(1-10^moltenCD$value))
|
|
||||||
|
|
||||||
genotypeCounts <- function(x) {
  # Tabulate the comparison-track genotype calls (x$comp) for one ddply
  # subgroup, with an overall "Sum" margin appended by addmargins().
  # The caller groups by (rg/technology, pGGivenDType, pGGivenD), so each
  # count row becomes one point on the reported-vs-empirical GL curve.
  # NOTE: the original also computed unique(x$variable)[1] into an unused
  # local; that dead code has been removed.
  addmargins(table(x$comp))
}
|
|
||||||
|
|
||||||
addEmpiricalPofG <- function(d) {
  # Add empirical genotype-accuracy columns to a genotype-count table.
  #
  # d: data frame with columns pGGivenDType (one of QofAAGivenD / QofABGivenD
  #    / QofBBGivenD), HOM_REF, HET, HOM_VAR and Sum (as produced by
  #    ddply(..., genotypeCounts)).
  # Returns d with two new columns:
  #   EmpiricalPofG  -- fraction of comp genotypes matching the GL category
  #   EmpiricalPofGQ -- the same, phred-scaled and rounded
  #
  # TODO -- this is a really naive estimate of the accuracy, as it assumes
  # the comp track is perfect. In reality the chip is at best Q30 accurate
  # (replicate samples have lower than this level of concordance). At low
  # incoming confidence we can effectively ignore this term, but when the
  # incoming Q is near or above Q30 this approximation clearly breaks down.
  n <- nrow(d)
  emp <- numeric(n)  # preallocate instead of growing with c() (was O(n^2))
  for (i in seq_len(n)) {  # seq_len: safe for a 0-row table (1:0 is not)
    row <- d[i, ]
    # Pick the count matching the GL category; unknown categories now yield
    # NA instead of silently reusing the previous row's value (old bug).
    v <- switch(as.character(row$pGGivenDType),
                QofAAGivenD = row$HOM_REF,
                QofABGivenD = row$HET,
                QofBBGivenD = row$HOM_VAR,
                NA_real_)
    emp[i] <- v / row$Sum
  }
  d$EmpiricalPofG <- emp
  d$EmpiricalPofGQ <- round(-10 * log10(1 - emp))
  d
}
|
|
||||||
|
|
||||||
eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
|
||||||
countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
|
|
||||||
print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
|
|
||||||
#print(subset(eByComp, EmpiricalPofGQ < Inf))
|
#print(subset(eByComp, EmpiricalPofGQ < Inf))
|
||||||
|
|
||||||
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
|
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
|
||||||
|
|
||||||
ymax = xmax = 30
|
|
||||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
|
||||||
|
|
||||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
|
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||||
|
|
@ -71,4 +38,3 @@ print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~
|
||||||
+ geom_abline(slope=1, linetype=2)
|
+ geom_abline(slope=1, linetype=2)
|
||||||
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/bin/env Rscript

# DigestTable: digests the multi-GB molten GL-calibration table emitted by
# CalibrateGenotypeLikelihoods into a small reported-vs-empirical GL summary.

require("ggplot2")

# Command-line arguments; args[1] is the path to the molten input table.
args <- commandArgs(TRUE)
verbose = TRUE

inputDataFile = args[1]
# TRUE when an input path was supplied; when FALSE the script only defines
# its functions and does no work (convenient when source()-ing interactively).
onCmdLine = ! is.na(inputDataFile)
|
||||||
|
|
||||||
|
addEmpiricalPofG <- function(d) {
  # Add empirical genotype-accuracy columns to a genotype-count table.
  #
  # d: data frame with columns pGGivenDType (one of QofAAGivenD / QofABGivenD
  #    / QofBBGivenD), HOM_REF, HET, HOM_VAR and Sum (as produced by
  #    ddply(..., genotypeCounts)).
  # Returns d with two new columns:
  #   EmpiricalPofG  -- fraction of comp genotypes matching the GL category
  #   EmpiricalPofGQ -- the same, phred-scaled and rounded
  #
  # TODO -- this is a really naive estimate of the accuracy, as it assumes
  # the comp track is perfect. In reality the chip is at best Q30 accurate
  # (replicate samples have lower than this level of concordance). At low
  # incoming confidence we can effectively ignore this term, but when the
  # incoming Q is near or above Q30 this approximation clearly breaks down.
  n <- nrow(d)
  emp <- numeric(n)  # preallocate instead of growing with c() (was O(n^2))
  for (i in seq_len(n)) {  # seq_len: safe for a 0-row table (1:0 is not)
    row <- d[i, ]
    # Pick the count matching the GL category; unknown categories now yield
    # NA instead of silently reusing the previous row's value (old bug).
    v <- switch(as.character(row$pGGivenDType),
                QofAAGivenD = row$HOM_REF,
                QofABGivenD = row$HET,
                QofBBGivenD = row$HOM_VAR,
                NA_real_)
    emp[i] <- v / row$Sum
  }
  d$EmpiricalPofG <- emp
  d$EmpiricalPofGQ <- round(-10 * log10(1 - emp))
  d
}
|
||||||
|
|
||||||
|
genotypeCounts <- function(x) {
  # Tabulate the comparison-track genotype calls (x$comp) for one ddply
  # subgroup, with an overall "Sum" margin appended by addmargins().
  # NOTE: the original also computed unique(x$variable)[1] into an unused
  # local; that dead code has been removed.
  addmargins(table(x$comp))
}
|
||||||
|
|
||||||
|
|
||||||
|
digestTable <- function(inputDataFile) {
  # Digest the molten GL table written by CalibrateGenotypeLikelihoods into a
  # compact per-read-group empirical accuracy summary.
  #
  # inputDataFile: path to the molten table (tab/space separated, headered).
  # Returns a list with:
  #   d       -- the raw table, minus the "ALL" aggregate read group
  #   eByComp -- empirical P(G) grouped by rg/technology/pGGivenDType/pGGivenD
  d <- subset(read.table(inputDataFile, header = TRUE), rg != "ALL")

  # Classify each read group into a sequencing technology by its rg-name
  # prefix. NOTE(review): factor(1, levels = <character levels>) yields NA,
  # so rows matching none of the patterns end up NA -- confirm intended.
  d$technology <- factor(1, levels = c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
  d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
  d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
  d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
  print(summary(d$technology))

  # ddply comes from plyr; genotypeCounts/addEmpiricalPofG are defined above.
  eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
  list(d = d, eByComp = eByComp)
}
|
||||||
|
|
||||||
|
writeMyTable <- function(t, name, prefix = inputDataFile) {
  # Persist a digest table next to the input data file.
  #
  # t:      the table/data frame to write.
  # name:   short tag appended to the file name.
  # prefix: output path prefix; defaults to the script-global inputDataFile
  #         (backward compatible) but can now be passed explicitly instead of
  #         relying on a global.
  # Output file is "<prefix>.<name>.tsv".
  write.table(t, file = paste(prefix, ".", name, ".tsv", sep = ""))
}
|
||||||
|
|
||||||
|
# Script entry point: runs only when an input file was supplied on the
# command line; digests it and writes the per-read-group summary next to the
# input file as "<input>.eByComp.tsv".
if ( onCmdLine ) {
  r <- digestTable(inputDataFile)
  writeMyTable(r$eByComp, "eByComp")
}
|
||||||
|
|
||||||
|
|
@ -97,6 +97,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
public static class Datum implements Comparable<Datum> {
|
public static class Datum implements Comparable<Datum> {
|
||||||
String rgID, sample;
|
String rgID, sample;
|
||||||
GenotypeLikelihoods pl;
|
GenotypeLikelihoods pl;
|
||||||
|
String ref, alt;
|
||||||
VariantContext.Type siteType;
|
VariantContext.Type siteType;
|
||||||
Genotype.Type genotypeType;
|
Genotype.Type genotypeType;
|
||||||
|
|
||||||
|
|
@ -107,7 +108,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
return bySample != 0 ? bySample : byRG;
|
return bySample != 0 ? bySample : byRG;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Datum(String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
|
public Datum(String ref, String alt, String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
|
||||||
|
this.ref = ref;
|
||||||
|
this.alt = alt;
|
||||||
this.sample = sample;
|
this.sample = sample;
|
||||||
this.rgID = rgID;
|
this.rgID = rgID;
|
||||||
this.pl = pl;
|
this.pl = pl;
|
||||||
|
|
@ -196,7 +199,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
Genotype rgGT = call.getGenotype(sample);
|
Genotype rgGT = call.getGenotype(sample);
|
||||||
|
|
||||||
if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
|
if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
|
||||||
Datum d = new Datum(sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
|
Datum d = new Datum(vcComp.getReference().getBaseString(), vcComp.getAlternateAllele(0).getBaseString(),
|
||||||
|
sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
|
||||||
data.values.add(d);
|
data.values.add(d);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -242,7 +246,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
public void onTraversalDone(Data data) {
|
public void onTraversalDone(Data data) {
|
||||||
// print the header
|
// print the header
|
||||||
List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
|
List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
|
||||||
List<String> fields = Arrays.asList("sample", "rg", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
|
List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
|
||||||
out.println(Utils.join("\t", fields));
|
out.println(Utils.join("\t", fields));
|
||||||
|
|
||||||
double[] counts = new double[]{1, 1, 1};
|
double[] counts = new double[]{1, 1, 1};
|
||||||
|
|
@ -260,8 +264,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
|
||||||
for ( int i = 0; i < pGNames.size(); i++ ) {
|
for ( int i = 0; i < pGNames.size(); i++ ) {
|
||||||
int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
|
int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
|
||||||
if ( q > 1 ) { // tons of 1s, and not interesting
|
if ( q > 1 ) { // tons of 1s, and not interesting
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
|
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
|
||||||
d.sample, d.rgID, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
|
d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
|
||||||
pGNames.get(i), q);
|
pGNames.get(i), q);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue