From 87be63c7e46a4ab14832d0ad2314d4f942f77dac Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Fri, 13 Apr 2012 16:59:15 -0400
Subject: [PATCH] Improve variantCallQC.R

-- Refactor plotting utilities into master utility in gsalib.  Everyone can use it now
-- Better plots for standard variantCallQC
---
 .../utils/R/gsalib/R/gsa.variantqc.utils.R    | 236 ++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R

diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
new file mode 100644
index 000000000..88fc48e2a
--- /dev/null
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
@@ -0,0 +1,236 @@
+library(gplots)
+library(ggplot2)
+
+# -------------------------------------------------------
+# Utilities for displaying multiple plots per page
+# -------------------------------------------------------
+
+distributeGraphRows <- function(graphs, heights = c()) {
+  # Viewport layout 2 graphs top to bottom with given relative heights
+  #
+  #
+  if (length(heights) == 0) {
+    heights <- rep.int(1, length(graphs))
+  }
+  heights <- heights[!is.na(graphs)]
+  graphs <- graphs[!is.na(graphs)]
+  numGraphs <- length(graphs)
+  Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights)
+  grid.newpage()
+  pushViewport(viewport(layout = Layout))
+  subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
+  for (i in 1:numGraphs) {
+    print(graphs[[i]], vp = subplot(i))
+  }
+}
+
+distributeLogGraph <- function(graph, xName) {
+  continuousGraph <- graph + scale_x_continuous(xName)
+  logGraph <- graph + scale_x_log10(xName) + opts(title="")
+  distributeGraphRows(list(continuousGraph, logGraph))
+}
+
+distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) {
+  distributeGraphRows(list(perSampleGraph, distGraph), ratio)
+}
+
+removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) {
+  # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame
+  #
+  # Only keeps the column marked with "all" for each removed column
+  #
+  for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) {
+    if (toRemove %in% colnames(variantEvalDataFrame)) {
+      variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",]
+    }
+  }
+  variantEvalDataFrame    
+}
+
+openPDF <- function(outputPDF) {
+  # Open the outputPDF file with standard dimensions, if outputPDF is not NA
+  if ( ! is.na(outputPDF) ) {
+    pdf(outputPDF, height=8.5, width=11)
+  }
+}
+
+closePDF <- function(outputPDF) {
+  # close the outputPDF file if not NA, and try to compact the PDF if possible
+  if ( ! is.na(outputPDF) ) {
+    dev.off()
+    if (exists("compactPDF")) {
+      compactPDF(outputPDF)
+    }
+  }
+}
+
+makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) {
+  if ( is.null(widths) ) widths <- rep(1, length(ACs))
+  
+  value = NULL
+  titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom)
+}
+
+.reduceACs <- function(binWidthForAC, ACs) {
+  # computes data structures necessary to reduce the full range of ACs
+  #
+  # binWidthForAC returns the number of upcoming bins that should be merged into 
+  # that AC bin.  ACs is a vector of all AC values from 0 to 2N that should be 
+  # merged together
+  #
+  # Returns a list containing the reduced ACs starts, their corresponding widths,
+  # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
+  maxAC <- max(ACs)
+  newACs <- c()
+  widths <- c()
+  newACMap <- c()
+  ac <- 0
+  while ( ac < maxAC ) {
+    newACs <- c(newACs, ac)
+    width <- binWidthForAC(ac)
+    widths <- c(widths, width)
+    newACMap <- c(newACMap, rep(ac, width))
+    ac <- ac + width
+  }
+  list(ACs = newACs, widths=widths, newACMap = newACMap)
+}
+
+# geometricACs <- function(k, ACs) {
+#   nBins <- round(k * log10(max(ACs)))
+#   
+#   binWidthForAC <- function(ac) {
+#     max(ceiling(ac / nBins), 1)
+#   }
+#   
+#   return(reduceACs(binWidthForAC, ACs))
+# }
+
+reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) {
+  # map the full range of AC values onto a log linear scale
+  #
+  # Reduce the full AC range onto one where the width of each new AC increases at a rate of
+  # 10^scaleFactor in size with growing AC values.  This is primarily useful for accurately
+  # computing ratios or other quantities by AC that aren't well determined when the AC 
+  # values are very large
+  #
+  # Returns a list containing the reduced ACs starts, their corresponding widths,
+  # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
+  maxAC <- max(ACs)
+  afs <- ACs / maxAC
+  breaks <- 10^(seq(-4, -1, scaleFactor))
+  widths <- c()
+  lastBreak <- 1
+  for ( i in length(breaks):1 ) {
+    b <- breaks[i]
+    width <- sum(afs < lastBreak & afs >= b)
+    widths <- c(widths, width)
+    lastBreak <- b
+  }
+  widths <- rev(widths)
+  
+  binWidthForAC <- function(ac) {
+    af <- ac / maxAC
+    value = 1
+    for ( i in length(breaks):1 )
+      if ( af >= breaks[i] ) {
+        value = widths[i]
+        break
+      }
+    
+    return(value)
+  }
+  
+  return(.reduceACs(binWidthForAC, ACs))
+}
+
+.remapACs <- function(remapper, k, df) {
+  newACs <- remapper(k, df$AC)
+  
+  n = length(newACs$ACs)
+  num = rep(0, n)
+  denom = rep(0, n)
+  for ( i in 1:dim(df)[1] ) {
+    rowI = df$AC == i
+    row = df[rowI,]
+    newAC = newACs$newACMap[row$AC]
+    newRowI = newACs$ACs == newAC
+    num[newRowI] = num[newRowI] + df$num[rowI]
+    denom[newRowI] = denom[newRowI] + df$denom[rowI]
+  }
+  
+  newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths )
+  newdf
+}
+
+compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) {
+  df = makeRatioDataFrame(ACs, num, denom, 1)
+  return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df))
+}
+
+plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", 
+                          fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels", 
+                          onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, moreTitle="") {
+  metrics$strat = metrics[[requestedStrat]]
+  
+  otherFacet = "."
+  id.vars = c("strat", "nobs")
+  metrics$nobs <- metrics[[nObsField]]
+  
+  # keep track of the other strat and it's implied facet value
+  if (! is.null(anotherStrat)) { 
+    id.vars = c(id.vars, anotherStrat)
+    otherFacet = anotherStrat
+  }
+  
+  molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures))
+  perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable))
+  title <- opts(title=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle))
+  
+  determineFacet <- function(onX) {
+    if ( onX ) { 
+      paste(otherFacet, "~ variable")
+    } else {
+      paste("variable ~", otherFacet)
+    }
+  }
+  
+  sampleFacet = determineFacet(facetVariableOnXPerSample)
+  distFacet   = determineFacet(facetVariableOnXForDist)
+  
+  if ( requestedStrat == "Sample" ) {
+    perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
+    perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
+  } else {
+    perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
+    perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
+  }    
+  perSampleGraph <- perSampleGraph + ylab("Variable value") + title
+  perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free")
+  
+  nValues = length(unique(molten$value))
+  if (nValues > 2) {
+    if ( requestedStrat == "Sample" ) {
+      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable))
+    } else {
+      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs))
+    }
+    distGraph <- distGraph + geom_histogram(aes(y=..ndensity..))
+    distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..))
+    distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter"))
+    scale = "free"
+    if ( fixHistogramX ) scale = "fixed"
+    distGraph <- distGraph + facet_grid(distFacet, scales=scale)
+    distGraph <- distGraph + ylab("Relative frequency")
+    distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
+    distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none")
+  } else {
+    distGraph <- NA
+  }
+  
+  if ( onSamePage ) {
+    suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph))
+  } else {
+    suppressMessages(print(perSampleGraph))
+    suppressMessages(print(distGraph + title))
+  }
+}