Improve variantCallQC.R

-- Refactor plotting utilities into master utility in gsalib.  Everyone can use it now
-- Better plots for standard variantCallQC
This commit is contained in:
Mark DePristo 2012-04-13 16:59:15 -04:00
parent 3f6b2423d8
commit 87be63c7e4
1 changed files with 236 additions and 0 deletions

View File

@ -0,0 +1,236 @@
library(gplots)
library(ggplot2)
# -------------------------------------------------------
# Utilities for displaying multiple plots per page
# -------------------------------------------------------
distributeGraphRows <- function(graphs, heights = c()) {
# Viewport layout 2 graphs top to bottom with given relative heights
#
#
if (length(heights) == 0) {
heights <- rep.int(1, length(graphs))
}
heights <- heights[!is.na(graphs)]
graphs <- graphs[!is.na(graphs)]
numGraphs <- length(graphs)
Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights)
grid.newpage()
pushViewport(viewport(layout = Layout))
subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
for (i in 1:numGraphs) {
print(graphs[[i]], vp = subplot(i))
}
}
distributeLogGraph <- function(graph, xName) {
continuousGraph <- graph + scale_x_continuous(xName)
logGraph <- graph + scale_x_log10(xName) + opts(title="")
distributeGraphRows(list(continuousGraph, logGraph))
}
distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) {
distributeGraphRows(list(perSampleGraph, distGraph), ratio)
}
removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) {
# Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame
#
# Only keeps the column marked with "all" for each removed column
#
for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) {
if (toRemove %in% colnames(variantEvalDataFrame)) {
variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",]
}
}
variantEvalDataFrame
}
openPDF <- function(outputPDF) {
# Open the outputPDF file with standard dimensions, if outputPDF is not NA
if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
}
closePDF <- function(outputPDF) {
# close the outputPDF file if not NA, and try to compact the PDF if possible
if ( ! is.na(outputPDF) ) {
dev.off()
if (exists("compactPDF")) {
compactPDF(outputPDF)
}
}
}
makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) {
if ( is.null(widths) ) widths <- rep(1, length(ACs))
value = NULL
titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom)
}
.reduceACs <- function(binWidthForAC, ACs) {
# computes data structures necessary to reduce the full range of ACs
#
# binWidthForAC returns the number of upcoming bins that should be merged into
# that AC bin. ACs is a vector of all AC values from 0 to 2N that should be
# merged together
#
# Returns a list containing the reduced ACs starts, their corresponding widths,
# and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
maxAC <- max(ACs)
newACs <- c()
widths <- c()
newACMap <- c()
ac <- 0
while ( ac < maxAC ) {
newACs <- c(newACs, ac)
width <- binWidthForAC(ac)
widths <- c(widths, width)
newACMap <- c(newACMap, rep(ac, width))
ac <- ac + width
}
list(ACs = newACs, widths=widths, newACMap = newACMap)
}
# geometricACs <- function(k, ACs) {
# nBins <- round(k * log10(max(ACs)))
#
# binWidthForAC <- function(ac) {
# max(ceiling(ac / nBins), 1)
# }
#
# return(reduceACs(binWidthForAC, ACs))
# }
reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) {
# map the full range of AC values onto a log linear scale
#
# Reduce the full AC range onto one where the width of each new AC increases at a rate of
# 10^scaleFactor in size with growing AC values. This is primarily useful for accurately
# computing ratios or other quantities by AC that aren't well determined when the AC
# values are very large
#
# Returns a list containing the reduced ACs starts, their corresponding widths,
# and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
maxAC <- max(ACs)
afs <- ACs / maxAC
breaks <- 10^(seq(-4, -1, scaleFactor))
widths <- c()
lastBreak <- 1
for ( i in length(breaks):1 ) {
b <- breaks[i]
width <- sum(afs < lastBreak & afs >= b)
widths <- c(widths, width)
lastBreak <- b
}
widths <- rev(widths)
binWidthForAC <- function(ac) {
af <- ac / maxAC
value = 1
for ( i in length(breaks):1 )
if ( af >= breaks[i] ) {
value = widths[i]
break
}
return(value)
}
return(.reduceACs(binWidthForAC, ACs))
}
.remapACs <- function(remapper, k, df) {
newACs <- remapper(k, df$AC)
n = length(newACs$ACs)
num = rep(0, n)
denom = rep(0, n)
for ( i in 1:dim(df)[1] ) {
rowI = df$AC == i
row = df[rowI,]
newAC = newACs$newACMap[row$AC]
newRowI = newACs$ACs == newAC
num[newRowI] = num[newRowI] + df$num[rowI]
denom[newRowI] = denom[newRowI] + df$denom[rowI]
}
newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths )
newdf
}
compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) {
df = makeRatioDataFrame(ACs, num, denom, 1)
return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df))
}
plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels",
onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, moreTitle="") {
metrics$strat = metrics[[requestedStrat]]
otherFacet = "."
id.vars = c("strat", "nobs")
metrics$nobs <- metrics[[nObsField]]
# keep track of the other strat and it's implied facet value
if (! is.null(anotherStrat)) {
id.vars = c(id.vars, anotherStrat)
otherFacet = anotherStrat
}
molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures))
perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable))
title <- opts(title=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle))
determineFacet <- function(onX) {
if ( onX ) {
paste(otherFacet, "~ variable")
} else {
paste("variable ~", otherFacet)
}
}
sampleFacet = determineFacet(facetVariableOnXPerSample)
distFacet = determineFacet(facetVariableOnXForDist)
if ( requestedStrat == "Sample" ) {
perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
} else {
perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
}
perSampleGraph <- perSampleGraph + ylab("Variable value") + title
perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free")
nValues = length(unique(molten$value))
if (nValues > 2) {
if ( requestedStrat == "Sample" ) {
distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable))
} else {
distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs))
}
distGraph <- distGraph + geom_histogram(aes(y=..ndensity..))
distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..))
distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter"))
scale = "free"
if ( fixHistogramX ) scale = "fixed"
distGraph <- distGraph + facet_grid(distFacet, scales=scale)
distGraph <- distGraph + ylab("Relative frequency")
distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none")
} else {
distGraph <- NA
}
if ( onSamePage ) {
suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph))
} else {
suppressMessages(print(perSampleGraph))
suppressMessages(print(distGraph + title))
}
}