From d77f4ebe31a8f9e48165bd7ccfd3cd39f2ee25e1 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sat, 21 May 2011 14:02:30 +0000
Subject: [PATCH] CalibrateGenotypeLikelihoods now emits a molten data set with
 REF and ALT alleles, so that GL calibration can be evaluated as a function of
 the REF/ALT bases.  DigestTable is a stand-alone Rscript that digests the
 multi-GB molten data table into a tiny table that shows reported vs.
 empirical GLs, as a function of a variety of features of the data, like
 REF/ALT, comp GT, eval GT, and GL itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5833 348d0f76-0448-11de-a6fe-93d51630548a
---
 analysis/depristo/genotypeAccuracy/commands.R | 64 +++++--------------
 .../depristo/genotypeAccuracy/digestTable.R   | 62 ++++++++++++++++++
 .../walkers/CalibrateGenotypeLikelihoods.java | 14 ++--
 3 files changed, 86 insertions(+), 54 deletions(-)
 create mode 100644 analysis/depristo/genotypeAccuracy/digestTable.R

diff --git a/analysis/depristo/genotypeAccuracy/commands.R b/analysis/depristo/genotypeAccuracy/commands.R
index 4d25f6fc0..73cd2c372 100644
--- a/analysis/depristo/genotypeAccuracy/commands.R
+++ b/analysis/depristo/genotypeAccuracy/commands.R
@@ -2,60 +2,27 @@ require("lattice")
 require("ggplot2")
 require("splines")
 
-READ_DATA = F
-
-if ( READ_DATA ) {
-  d = subset(read.table("~/Dropbox/Analysis/genotypeAccuracy/cgl.hiseq.table", header=T), rg != "ALL")
-  d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
-  d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
-  d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
-  d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
-  print(summary(d$technology))
-#d = read.table("~/Desktop/broadLocal/GATK/trunk/foo", header=T)
+ymax = xmax = 30
+HAVE_RAW_DATA = F
+if ( HAVE_RAW_DATA ) {
+  inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
+  #inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
+  r <- digestTable(inputDataFile)
+  d = r$d
+  eByComp = r$eByComp
+  countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
+  print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
+   + geom_abline(slope=1, linetype=2))
+   #   + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
+} else {
+  eByComp = read.table("~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
 }
 
-#moltenCD = melt(d, id.vars=c("comp", "rg"), measure.vars=c("QofAAGivenD", "QofABGivenD", "QofBBGivenD"))
-#moltenCD$log10value = round(-10*log10(1-10^moltenCD$value))
-  
-genotypeCounts <- function(x) {
-  #print(table(x$comp))
-  type = unique(x$variable)[1]
-  #print(type)
-  t = addmargins(table(x$comp))
-  return(t)
-}
-
-addEmpiricalPofG <- function(d) {
-  r = c()
-  #
-  # TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
-  # track is perfect.  In reality the chip is at best Q30 accurate (replicate samples have
-  # level than this level of concordance).  At low incoming confidence, we can effectively
-  # ignore this term but when the incoming Q is near or above Q30 this approximation clearly
-  # breaks down.
-  #
-  for ( i in 1:dim(d)[1] ) {
-    row = d[i,]
-    if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
-    if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
-    if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
-    r = c(r, v / row$Sum)
-  }
-
-  #print(length(r))
-  d$EmpiricalPofG = r
-  d$EmpiricalPofGQ = round(-10*log10(1-r))
-  return(d)
-}
-
-eByComp <- addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
-countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
-print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
+#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
 #print(subset(eByComp, EmpiricalPofGQ < Inf))
 
 goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
 
-ymax = xmax = 30
 print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
 
 print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax)) 
@@ -71,4 +38,3 @@ print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~
 + geom_abline(slope=1, linetype=2)
 + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
 
-
diff --git a/analysis/depristo/genotypeAccuracy/digestTable.R b/analysis/depristo/genotypeAccuracy/digestTable.R
new file mode 100644
index 000000000..1278abc47
--- /dev/null
+++ b/analysis/depristo/genotypeAccuracy/digestTable.R
@@ -0,0 +1,62 @@
+#!/bin/env Rscript
+
+require("ggplot2")
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+inputDataFile = args[1]
+onCmdLine = ! is.na(inputDataFile)
+
+addEmpiricalPofG <- function(d) {
+  r = c()
+  #
+  # TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
+  # track is perfect.  In reality the chip is at best Q30 accurate (replicate samples have
+  # level than this level of concordance).  At low incoming confidence, we can effectively
+  # ignore this term but when the incoming Q is near or above Q30 this approximation clearly
+  # breaks down.
+  #
+  for ( i in 1:dim(d)[1] ) {
+    row = d[i,]
+    if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
+    if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
+    if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
+    r = c(r, v / row$Sum)
+  }
+
+  #print(length(r))
+  d$EmpiricalPofG = r
+  d$EmpiricalPofGQ = round(-10*log10(1-r))
+  return(d)
+}
+
+genotypeCounts <- function(x) {
+  type = unique(x$variable)[1]
+  t = addmargins(table(x$comp))
+  return(t)
+}
+
+
+digestTable <- function(inputDataFile) {
+  d = subset(read.table(inputDataFile, header=T), rg != "ALL")
+  d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
+  d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
+  d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
+  d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
+  print(summary(d$technology))
+  
+  eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
+  return(list(d=d, eByComp = eByComp))
+  #countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
+}
+  
+writeMyTable <- function(t, name) {
+  write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep=""))
+}
+
+if ( onCmdLine ) {
+  r <- digestTable(inputDataFile)
+  writeMyTable(r$eByComp, "eByComp")
+}
+
diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java
index eacc7498e..db2d7fc31 100644
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CalibrateGenotypeLikelihoods.java
@@ -97,6 +97,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
     public static class Datum implements Comparable<Datum> {
         String rgID, sample;
         GenotypeLikelihoods pl;
+        String ref, alt;
         VariantContext.Type siteType;
         Genotype.Type genotypeType;
 
@@ -107,7 +108,9 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
             return bySample != 0 ? bySample : byRG;
         }
 
-        public Datum(String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
+        public Datum(String ref, String alt, String sample, String rgID, GenotypeLikelihoods pl, VariantContext.Type siteType, Genotype.Type genotypeType) {
+            this.ref = ref;
+            this.alt = alt;
             this.sample = sample;
             this.rgID = rgID;
             this.pl = pl;
@@ -196,7 +199,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
                 Genotype rgGT = call.getGenotype(sample);
 
                 if ( rgGT != null && ! rgGT.isNoCall() && rgGT.getLikelihoods().getAsVector() != null ) {
-                    Datum d = new Datum(sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
+                    Datum d = new Datum(vcComp.getReference().getBaseString(), vcComp.getAlternateAllele(0).getBaseString(),
+                            sample, rgAC.getKey().getReadGroupId(), rgGT.getLikelihoods(), vcComp.getType(), compGT.getType());
                     data.values.add(d);
                 }
             }
@@ -242,7 +246,7 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
     public void onTraversalDone(Data data) {
         // print the header
         List<String> pGNames = Arrays.asList("QofAAGivenD", "QofABGivenD", "QofBBGivenD");
-        List<String> fields = Arrays.asList("sample", "rg", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
+        List<String> fields = Arrays.asList("sample", "rg", "ref", "alt", "siteType", "pls", "comp", "pGGivenDType", "pGGivenD");
         out.println(Utils.join("\t", fields));
 
         double[] counts = new double[]{1, 1, 1};
@@ -260,8 +264,8 @@ public class CalibrateGenotypeLikelihoods extends RodWalker<CalibrateGenotypeLik
             for ( int i = 0; i < pGNames.size(); i++ ) {
                 int q = QualityUtils.probToQual(pOfGGivenD[i], Math.pow(10.0, -9.9));
                 if ( q > 1 ) { // tons of 1s, and not interesting
-                    out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
-                            d.sample, d.rgID, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
+                    out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d%n",
+                            d.sample, d.rgID, d.ref, d.alt, d.siteType, d.pl.getAsString(), d.genotypeType.toString(),
                             pGNames.get(i), q);
                 }
             }