From 62f53838596a6c3235e9fcb08d26ff4564383c1b Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 4 Oct 2010 00:27:59 +0000 Subject: [PATCH] * Added an R package, "gsalib", providing a place to store common, useful, documented R methods. To use this module, you must follow three steps: 1) Build the module with the following command: $ ant gsalib 2) Add the module path to your ~/.Rprofile file: .libPaths("/path/to/Sting/trunk/R/") 3) At the top of each R script that will use the library, include the line: library(gsalib) You can now use the package like any other R package. To get high-level documentation, supply the following command to R: help(gsalib) The methods contained herein are: getargs : A method to easily provide arguments to interactive and non-interactive scripts. Prints out a help message specifying how the script should be run if no arguments or "-h" is provided. Very helpful when you're writing an R-script piecemeal in interactive mode, then want to make it a command-line program. plot.venn : Plots a two-way or three-way proportional Venn diagram. read.eval : Reads VariantEval output that's formatted in R style. read.gatkreport : Reads GATKReport output. gsa.message : Emits a message with the prefix "[gsalib]" to stdout. gsa.warn : Emits a warning message with the prefix "[gsalib] Warning:" to stdout. gsa.error : Emits an error message with the prefix "[gsalib] Error: to stdout, calls traceback() and halts execution. Documentation on each of these methods can be obtained by typing "help(method_name)" at the R prompt. * Retired GATKReport.R, as that functionality has now been moved to gsalib. * Retired gsacommons, as that functionality has been split between gsalib and VariantReport.R. * Modified VariantReport.R to make use of gsalib. The script now uses the getargs() method to provide the user with some information as to the proper way to run the script. Documentation on how to prepare output is given at http://www.broadinstitute.org/gsa/wiki/index.php/VariantEval . * Added 'gsalib' target to build.xml file. Running "ant gsalib" will compile this module and place the R-ready package in R/gsalib . git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4416 348d0f76-0448-11de-a6fe-93d51630548a --- R/VariantReport/VariantReport.R | 379 ++++++++++++- R/gsacommons.R | 496 ------------------ R/src/gsalib/DESCRIPTION | 10 + R/src/gsalib/R/getargs.R | 116 ++++ R/src/gsalib/R/gsa.error.R | 12 + R/src/gsalib/R/gsa.message.R | 3 + R/src/gsalib/R/gsa.warn.R | 3 + R/src/gsalib/R/plot.venn.R | 50 ++ R/src/gsalib/R/read.eval.R | 74 +++ .../gsalib/R/read.gatkreport.R} | 4 +- R/src/gsalib/Read-and-delete-me | 9 + R/src/gsalib/man/getargs.Rd | 57 ++ R/src/gsalib/man/gsa.error.Rd | 49 ++ R/src/gsalib/man/gsa.message.Rd | 44 ++ R/src/gsalib/man/gsa.warn.Rd | 46 ++ R/src/gsalib/man/gsalib-package.Rd | 55 ++ R/src/gsalib/man/plot.venn.Rd | 75 +++ R/src/gsalib/man/read.eval.Rd | 111 ++++ R/src/gsalib/man/read.gatkreport.Rd | 55 ++ build.xml | 7 + 20 files changed, 1140 insertions(+), 515 deletions(-) delete mode 100644 R/gsacommons.R create mode 100644 R/src/gsalib/DESCRIPTION create mode 100644 R/src/gsalib/R/getargs.R create mode 100644 R/src/gsalib/R/gsa.error.R create mode 100644 R/src/gsalib/R/gsa.message.R create mode 100644 R/src/gsalib/R/gsa.warn.R create mode 100644 R/src/gsalib/R/plot.venn.R create mode 100644 R/src/gsalib/R/read.eval.R rename R/{GATKReport.R => src/gsalib/R/read.gatkreport.R} (95%) create mode 100644 R/src/gsalib/Read-and-delete-me create mode 100644 R/src/gsalib/man/getargs.Rd create mode 100644 R/src/gsalib/man/gsa.error.Rd create mode 100644 R/src/gsalib/man/gsa.message.Rd create mode 100644 R/src/gsalib/man/gsa.warn.Rd create mode 100644 R/src/gsalib/man/gsalib-package.Rd create mode 100644 R/src/gsalib/man/plot.venn.Rd create mode 100644 R/src/gsalib/man/read.eval.Rd create mode 100644 R/src/gsalib/man/read.gatkreport.Rd diff --git a/R/VariantReport/VariantReport.R b/R/VariantReport/VariantReport.R index 0a37b6076..002d46794 100644 --- a/R/VariantReport/VariantReport.R +++ b/R/VariantReport/VariantReport.R @@ -1,29 +1,372 @@ -source(paste(Sys.getenv("STING_DIR"), "/R/gsacommons.R", sep="")); +suppressPackageStartupMessages(library(gsalib)); +suppressPackageStartupMessages(library(gplots)); -if (!interactive()) { - options(warn=-1); +eval.getMetrics <- function(eval, jexl_expression) { + callset.counts = eval$CountVariants[which(eval$CountVariants$evaluation_name == "eval" & eval$CountVariants$comparison_name == "dbsnp" & eval$CountVariants$jexl_expression == jexl_expression),]; + callset.counts.titv = eval$TiTv[which(eval$TiTv$evaluation_name == "eval" & eval$TiTv$comparison_name == "dbsnp" & eval$TiTv$jexl_expression == jexl_expression),]; - library(getopt); + callset.calledCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "all"),]$nVariantLoci; + callset.calledCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; - spec = c( - 'evalRoot', 'e', 1, 'character', "root of the VariantEval files", - 'plotRoot', 'p', 1, 'character', "output root of the PDF file", - 'title', 't', 1, 'character', "title of the report", - 'author', 'a', 1, 'character', "author that should be listed on the report" - ); + callset.knownCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "known"),]$nVariantLoci; + callset.knownCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; - opt = getopt(matrix(spec, byrow=T, nrow=length(spec)/5)); -} else { - opt = list( - evalRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval", - plotRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval/plot", - title = "Test" + callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci; + callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; + + callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci; + callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; + + callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci; + callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; + + callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci; + callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; + + metrics = list( + all = callset.calledCounts, + all.titv = callset.calledCounts.titv, + + known = callset.knownCounts, + known.titv = callset.knownCounts.titv, + + novel = callset.novelCounts, + novel.titv = callset.novelCounts.titv, + + filtered.all = callset.allFilteredCounts, + filtered.all.titv = callset.allFilteredCounts.titv, + + filtered.known = callset.knownFilteredCounts, + filtered.known.titv = callset.knownFilteredCounts.titv, + + filtered.novel = callset.novelFilteredCounts, + filtered.novel.titv = callset.novelFilteredCounts.titv ); } +.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) { + if (is.na(filtered.metrics)) { + text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", + name, 100*metrics$all/union$all.withfiltered, + metrics$all, metrics$all.titv, + metrics$known, metrics$known.titv, + metrics$novel, metrics$novel.titv + ); + } else { + text = sprintf("%s (%0.01f%% of union)\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f\n\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", + name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered, + + name, othername, + filtered.metrics$all, filtered.metrics$all.titv, + filtered.metrics$known, filtered.metrics$known.titv, + filtered.metrics$novel, filtered.metrics$novel.titv, + + name, othername, + metrics$all, metrics$all.titv, + metrics$known, metrics$known.titv, + metrics$novel, metrics$novel.titv + ); + } +} + +plot.titlePage <- function(title, author) { + textplot(sprintf("Automated Variant Report\n\n%s\n%s\n%s\n", title, author, Sys.Date())); +} + +.plot.variantTable.getRowText <- function(eval, jexl_expression) { + allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci; + knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci; + novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci; + + allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio; + knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio; + novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio; + + cbind(allVariants, knownVariants, sprintf("%0.2f", knownTiTv), novelVariants, sprintf("%0.2f", novelTiTv)); +} + +plot.variantTable <- function(eval, title) { + aonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[1]); + aonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[1]); + intersection.row = .plot.variantTable.getRowText(eval, "Intersection"); + bonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[2]); + bonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[2]); + + variantsummary = as.data.frame(rbind(bonly.row, bonly.filtered.row, intersection.row, aonly.filtered.row, aonly.row)); + + rownames(variantsummary) = c( + sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + "Intersection", + sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), + sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) + ); + colnames(variantsummary) = c("counts (all)", "counts (known)", "ti/tv (known)", "counts (novel)", "ti/tv (novel)"); + + textplot(variantsummary); +} + +plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) { + aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]); + aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]); + intersection = eval.getMetrics(eval, "Intersection"); + bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]); + bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]); + + union = list( + all = intersection$all + aonly$all + bonly$all, + all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all + ); + + plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col); + + text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, union)); + text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union)); + text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union)); +} + +plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); + bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); + + title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep=""); + + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + aonly.filtered$n = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + bonly.filtered$n = 0; + } + + #par.def = par(no.readonly = TRUE); + #par(mar=c(5, 5, 3, 5)); + + if (normalize == TRUE) { + norm = aonly$n + aonly.filtered$n + intersection$n + bonly$n + bonly.filtered$n; + matnorm = rbind(aonly$n/norm, aonly.filtered$n/norm, intersection$n/norm, bonly.filtered$n/norm, bonly$n/norm); + + barplot(matnorm, col=col, xlab="Allele count", ylab="", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), ylim=c(0, 1.3), border=NA, yaxt="n", cex=1.3, cex.axis=1.3, cex.lab=1.3); + axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, cex.axis=1.3); + mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3); + } else { + mat = rbind(aonly$n, aonly.filtered$n, intersection$n, bonly.filtered$n, bonly$n); + + barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, max(intersection$AC)), ylim=c(0, 1), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3); + } + + legend( + "topright", + c( + sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + "Intersection", + sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), + sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) + ), + fill=rev(col), + cex=1.3 + ); + + #par(par.def); +} + +plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + intersection.all = eval.getMetrics(eval, "Intersection"); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); + bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); + + title = paste("Allele count spectrum (", novelty_name, " variants)", sep=""); + + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + aonly.filtered$n = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + bonly.filtered$n = 0; + } + + suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, max(aonly$n + aonly.filtered$n + intersection$n, bonly$n + bonly.filtered$n + intersection$n)), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3)); + suppressWarnings(points(intersection$AC, aonly$n + aonly.filtered$n + intersection$n, type="l", lwd=2, col=col[1])); + suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[1])); + suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[3])); + suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[4])); + suppressWarnings(points(intersection$AC, bonly$n + bonly.filtered$n + intersection$n, type="l", lwd=2, col=col[5])); + + loci = (unique(eval$CountVariants$nProcessedLoci))[1]; + + points(c(1:max(intersection$AC)), 0.9*(1/1000)*loci*(1/c(1:max(intersection$AC))), type="l", lwd=2, lty=2, col="black"); + + legend( + "bottomleft", + c( + sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + "Intersection", + sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), + sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), + sprintf("Neutral expectation ( 0.9*(1/1000)*%d*(1/c(1:max(%d))) )", loci, max(intersection$AC)) + ), + lwd=c(2, 2, 3, 2, 2, 2), + lty=c(1, 2, 1, 2, 1, 2), + col=c(rev(col), "black"), + cex=1.3 + ); +} + +eval.getMetricsByAc <- function(eval, jexl_expression, novelty_name="all") { + piece = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(jexl_expression) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; +} + +plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); + bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); + + title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep=""); + + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + aonly$nTi = 0; + aonly$nTv = 0; + aonly.filtered$n = 0; + aonly.filtered$nTi = 0; + aonly.filtered$nTv = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + bonly$nTi = 0; + bonly$nTv = 0; + bonly.filtered$n = 0; + bonly.filtered$nTi = 0; + bonly.filtered$nTv = 0; + } + + titv.aonly.withfiltered = (aonly$nTi + aonly.filtered$nTi + intersection$nTi)/(aonly$nTv + aonly.filtered$nTv + intersection$nTv); + titv.aonly.withfiltered.finite = titv.aonly.withfiltered[which(is.finite(titv.aonly.withfiltered))]; + + titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv); + titv.aonly.finite = titv.aonly[which(is.finite(titv.aonly))]; + + titv.intersection.finite = intersection$Ti.Tv[which(is.finite(intersection$Ti.Tv))]; + + titv.bonly = (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv); + titv.bonly.finite = titv.bonly[which(is.finite(titv.bonly))]; + + titv.bonly.withfiltered = (bonly$nTi + bonly.filtered$nTi + intersection$nTi)/(bonly$nTv + bonly.filtered$nTv + intersection$nTv); + titv.bonly.withfiltered.finite = titv.bonly.withfiltered[which(is.finite(titv.bonly.withfiltered))]; + + titv.min = min(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); + titv.max = max(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); + + plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(0, 4), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3); + points(intersection$AC, (aonly.filtered$nTi + intersection$nTi)/(aonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[1]); + points(intersection$AC, (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[2]); + points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[3]); + points(intersection$AC, (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[4]); + points(intersection$AC, (bonly.filtered$nTi + intersection$nTi)/(bonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[5]); + + abline(h=2.3, lty=2); + mtext("2.3", side=4, at=2.3, cex=0.9); + + abline(h=3.3, lty=2); + mtext("3.3", side=4, at=3.3, cex=0.9); + + #legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); + + legend( + "topleft", + c( + sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), + "Intersection", + sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), + sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) + ), + lwd=c(2, 2, 3, 2, 2), + lty=c(1, 2, 1, 2, 1), + col=rev(col), + cex=1.3 + ); +} + +plot.variantsPerSample2 <- function(eval) { + if (!is.na(eval$MetricsBySample)) { + metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; + metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; + metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),]; + + title = "Calls per sample"; + indices = order(metrics.all$nVariants, decreasing=TRUE); + + plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics.all$sample)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); + points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black"); + points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue"); + points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red"); + + legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red")); + + axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4); + } +} + +plot.variantsPerSample <- function(eval, novelty_name="all") { + if (!is.na(eval$SimpleMetricsBySample)) { + metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),]; + + title = paste("Calls per sample (", novelty_name, ")", sep=""); + indices = order(metrics$CountVariants, decreasing=TRUE); + + par.def = par(no.readonly = TRUE); + par(mar=c(5, 4, 4, 4)); + + plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); + points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black"); + + axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4); + + par(new=TRUE); + plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n"); + points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black"); + + titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio)); + axis(4, at=titvaxis, labels=titvaxis, las=2); + + par(par.def); + } +} + +argspec = list( + evalRoot = list(value = NA, doc = "Path to the VariantEval R-output (omit the '.Analysis_Type.csv' part of the filename)"), + plotOut = list(value = NA, doc = "Path to the output PDF file"), + title = list(value = NA, doc = "The title of the report"), + author = list(value = NA, doc = "The author of the report") +); + +opt = getargs(argspec, doc="Take VariantEval R-output and generate a series of plots summarizing the contents"); + eval = read.eval(opt$evalRoot); -plot.begin(opt$plotRoot, "variantReport"); +pdf(opt$plotOut, width=10, height=10); plot.titlePage(opt$title, opt$author); @@ -56,4 +399,4 @@ if (length(eval$CallsetNames) > 0) { #plot.variantsPerSample(eval, novelty_name="novel"); } -plot.end(opt$plotRoot); +dev.off(); diff --git a/R/gsacommons.R b/R/gsacommons.R deleted file mode 100644 index 59c1861e0..000000000 --- a/R/gsacommons.R +++ /dev/null @@ -1,496 +0,0 @@ -suppressPackageStartupMessages(library(gplots)); - -plot.begin <- function(plotRoot, name, width=10, height=10) { - if (!is.na(plotRoot)) { - filename = paste(plotRoot, ".", name, ".pdf", sep=""); - print(sprintf("Plotting '%s' to '%s'", name, filename)); - - pdf(filename, width=width, height=height); - } else { - print(sprintf("Plotting '%s' to X11", name)); - - x11(width=width, height=height); - } -} - -plot.end <- function(plotRoot) { - if (!is.na(plotRoot)) { - dev.off(); - } -} - -plot.venn <- function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, - col=c("#FF6342", "#63C6DE", "#ADDE63"), - pos=c(0.20, 0.20, 0.80, 0.82), - debug=0 - ) { - library(png); - library(graphics); - - # Set up properties - for (i in 1:length(col)) { - rgbcol = col2rgb(col[i]); - col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]); - } - - chco = paste(col[1], col[2], col[3], sep=","); - chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=","); - - props = c( - 'cht=v', - 'chs=525x525', - 'chds=0,10000000000', - paste('chco=', chco, sep=""), - paste('chd=t:', chd, sep="") - ); - proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&'); - - # Get the venn diagram (as a temporary file) - filename = tempfile("venn"); - cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep=""); - - if (debug == 1) { - print(cmd); - } - system(cmd); - - # Render the temp png file into a plotting frame - a = readPNG(filename); - - plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab=""); - if (a >= b) { - rasterImage(a, pos[1], pos[2], pos[3], pos[4]); - } else { - rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180); - } - - # Clean up! - unlink(filename); -} - -.attemptToLoadFile <- function(filename) { - file = NA; - - if (file.exists(filename) & file.info(filename)$size > 500) { - file = read.csv(filename, header=TRUE, comment.char="#"); - } - - file; -} - -read.eval <- function(evalRoot) { - fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); - fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); - fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); - fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); - fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); - fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); - fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); - fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); - fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); - fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); - fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); - fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); - fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); - fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); - - eval = list( - AlleleCountStats = NA, - CompOverlap = NA, - CountVariants = NA, - GenotypeConcordance = NA, - MetricsByAc = NA, - MetricsBySample = NA, - Quality_Metrics_by_allele_count = NA, - QualityScoreHistogram = NA, - SampleStatistics = NA, - SampleSummaryStatistics = NA, - SimpleMetricsBySample = NA, - TiTv = NA, - TiTvStats = NA, - Variant_Quality_Score = NA, - - CallsetNames = c(), - CallsetOnlyNames = c(), - CallsetFilteredNames = c() - ); - - eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); - eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); - eval$CountVariants = .attemptToLoadFile(fileCountVariants); - eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); - eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); - eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample); - eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); - eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); - eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); - eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); - eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); - eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); - eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); - eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); - - uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); - eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); - eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); - eval$CallsetFilteredNames = as.vector(c( - paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""), - paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep="")) - ); - - eval; -} - -eval.getMetrics <- function(eval, jexl_expression) { - callset.counts = eval$CountVariants[which(eval$CountVariants$evaluation_name == "eval" & eval$CountVariants$comparison_name == "dbsnp" & eval$CountVariants$jexl_expression == jexl_expression),]; - callset.counts.titv = eval$TiTv[which(eval$TiTv$evaluation_name == "eval" & eval$TiTv$comparison_name == "dbsnp" & eval$TiTv$jexl_expression == jexl_expression),]; - - callset.calledCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "all"),]$nVariantLoci; - callset.calledCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; - - callset.knownCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "known"),]$nVariantLoci; - callset.knownCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; - - callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci; - callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; - - callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci; - callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; - - callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci; - callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; - - callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci; - callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; - - metrics = list( - all = callset.calledCounts, - all.titv = callset.calledCounts.titv, - - known = callset.knownCounts, - known.titv = callset.knownCounts.titv, - - novel = callset.novelCounts, - novel.titv = callset.novelCounts.titv, - - filtered.all = callset.allFilteredCounts, - filtered.all.titv = callset.allFilteredCounts.titv, - - filtered.known = callset.knownFilteredCounts, - filtered.known.titv = callset.knownFilteredCounts.titv, - - filtered.novel = callset.novelFilteredCounts, - filtered.novel.titv = callset.novelFilteredCounts.titv - ); -} - -.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) { - if (is.na(filtered.metrics)) { - text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", - name, 100*metrics$all/union$all.withfiltered, - metrics$all, metrics$all.titv, - metrics$known, metrics$known.titv, - metrics$novel, metrics$novel.titv - ); - } else { - text = sprintf("%s (%0.01f%% of union)\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f\n\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f", - name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered, - - name, othername, - filtered.metrics$all, filtered.metrics$all.titv, - filtered.metrics$known, filtered.metrics$known.titv, - filtered.metrics$novel, filtered.metrics$novel.titv, - - name, othername, - metrics$all, metrics$all.titv, - metrics$known, metrics$known.titv, - metrics$novel, metrics$novel.titv - ); - } -} - -plot.titlePage <- function(title, author) { - textplot(sprintf("Automated Data Processing Report\n\n%s\n%s\n%s\n", title, author, Sys.Date())); -} - -.plot.variantTable.getRowText <- function(eval, jexl_expression) { - allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci; - knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci; - novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci; - - allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio; - knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio; - novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio; - - cbind(allVariants, knownVariants, sprintf("%0.2f", knownTiTv), novelVariants, sprintf("%0.2f", novelTiTv)); -} - -plot.variantTable <- function(eval, title) { - aonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[1]); - aonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[1]); - intersection.row = .plot.variantTable.getRowText(eval, "Intersection"); - bonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[2]); - bonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[2]); - - variantsummary = as.data.frame(rbind(bonly.row, bonly.filtered.row, intersection.row, aonly.filtered.row, aonly.row)); - - rownames(variantsummary) = c( - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ); - colnames(variantsummary) = c("counts (all)", "counts (known)", "ti/tv (known)", "counts (novel)", "ti/tv (novel)"); - - textplot(variantsummary); -} - -plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) { - aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]); - aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetrics(eval, "Intersection"); - bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]); - bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]); - - union = list( - all = intersection$all + aonly$all + bonly$all, - all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all - ); - - plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col); - - text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, union)); - text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union)); - text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union)); -} - -plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - aonly.filtered$n = 0; - } - - if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - bonly.filtered$n = 0; - } - - #par.def = par(no.readonly = TRUE); - #par(mar=c(5, 5, 3, 5)); - - if (normalize == TRUE) { - norm = aonly$n + aonly.filtered$n + intersection$n + bonly$n + bonly.filtered$n; - matnorm = rbind(aonly$n/norm, aonly.filtered$n/norm, intersection$n/norm, bonly.filtered$n/norm, bonly$n/norm); - - barplot(matnorm, col=col, xlab="Allele count", ylab="", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), ylim=c(0, 1.3), border=NA, yaxt="n", cex=1.3, cex.axis=1.3, cex.lab=1.3); - axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, cex.axis=1.3); - mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3); - } else { - mat = rbind(aonly$n, aonly.filtered$n, intersection$n, bonly.filtered$n, bonly$n); - - barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, max(intersection$AC)), ylim=c(0, 1), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3); - } - - legend( - "topright", - c( - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ), - fill=rev(col), - cex=1.3 - ); - - #par(par.def); -} - -plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - intersection.all = eval.getMetrics(eval, "Intersection"); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Allele count spectrum (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - aonly.filtered$n = 0; - } - - if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - bonly.filtered$n = 0; - } - - suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, max(aonly$n + aonly.filtered$n + intersection$n, bonly$n + bonly.filtered$n + intersection$n)), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3)); - suppressWarnings(points(intersection$AC, aonly$n + aonly.filtered$n + intersection$n, type="l", lwd=2, col=col[1])); - suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[1])); - suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[3])); - suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[4])); - suppressWarnings(points(intersection$AC, bonly$n + bonly.filtered$n + intersection$n, type="l", lwd=2, col=col[5])); - - loci = (unique(eval$CountVariants$nProcessedLoci))[1]; - - points(c(1:max(intersection$AC)), 0.9*(1/1000)*loci*(1/c(1:max(intersection$AC))), type="l", lwd=2, lty=2, col="black"); - - legend( - "bottomleft", - c( - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Neutral expectation ( 0.9*(1/1000)*%d*(1/c(1:max(%d))) )", loci, max(intersection$AC)) - ), - lwd=c(2, 2, 3, 2, 2, 2), - lty=c(1, 2, 1, 2, 1, 2), - col=c(rev(col), "black"), - cex=1.3 - ); -} - -eval.getMetricsByAc <- function(eval, jexl_expression, novelty_name="all") { - piece = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(jexl_expression) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; -} - -plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) { - aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); - aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]); - intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); - bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); - bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]); - - title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep=""); - - if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { - aonly = intersection; - aonly$n = 0; - aonly$nTi = 0; - aonly$nTv = 0; - aonly.filtered$n = 0; - aonly.filtered$nTi = 0; - aonly.filtered$nTv = 0; - } - - if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { - bonly = intersection; - bonly$n = 0; - bonly$nTi = 0; - bonly$nTv = 0; - bonly.filtered$n = 0; - bonly.filtered$nTi = 0; - bonly.filtered$nTv = 0; - } - - titv.aonly.withfiltered = (aonly$nTi + aonly.filtered$nTi + intersection$nTi)/(aonly$nTv + aonly.filtered$nTv + intersection$nTv); - titv.aonly.withfiltered.finite = titv.aonly.withfiltered[which(is.finite(titv.aonly.withfiltered))]; - - titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv); - titv.aonly.finite = titv.aonly[which(is.finite(titv.aonly))]; - - titv.intersection.finite = intersection$Ti.Tv[which(is.finite(intersection$Ti.Tv))]; - - titv.bonly = (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv); - titv.bonly.finite = titv.bonly[which(is.finite(titv.bonly))]; - - titv.bonly.withfiltered = (bonly$nTi + bonly.filtered$nTi + intersection$nTi)/(bonly$nTv + bonly.filtered$nTv + intersection$nTv); - titv.bonly.withfiltered.finite = titv.bonly.withfiltered[which(is.finite(titv.bonly.withfiltered))]; - - titv.min = min(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); - titv.max = max(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite); - - plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(0, 4), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3); - points(intersection$AC, (aonly.filtered$nTi + intersection$nTi)/(aonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[1]); - points(intersection$AC, (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[2]); - points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[3]); - points(intersection$AC, (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv), type="l", lwd=2, lty=2, col=col[4]); - points(intersection$AC, (bonly.filtered$nTi + intersection$nTi)/(bonly.filtered$nTv + intersection$nTv), type="l", lwd=2, col=col[5]); - - abline(h=2.3, lty=2); - mtext("2.3", side=4, at=2.3, cex=0.9); - - abline(h=3.3, lty=2); - mtext("3.3", side=4, at=3.3, cex=0.9); - - #legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); - - legend( - "topleft", - c( - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]), - "Intersection", - sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]), - sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]) - ), - lwd=c(2, 2, 3, 2, 2), - lty=c(1, 2, 1, 2, 1), - col=rev(col), - cex=1.3 - ); -} - -plot.variantsPerSample2 <- function(eval) { - if (!is.na(eval$MetricsBySample)) { - metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; - metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; - metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),]; - - title = "Calls per sample"; - indices = order(metrics.all$nVariants, decreasing=TRUE); - - plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics.all$sample)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); - points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black"); - points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue"); - points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red"); - - legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red")); - - axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4); - } -} - -plot.variantsPerSample <- function(eval, novelty_name="all") { - if (!is.na(eval$SimpleMetricsBySample)) { - metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),]; - - title = paste("Calls per sample (", novelty_name, ")", sep=""); - indices = order(metrics$CountVariants, decreasing=TRUE); - - par.def = par(no.readonly = TRUE); - par(mar=c(5, 4, 4, 4)); - - plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); - points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black"); - - axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4); - - par(new=TRUE); - plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n"); - points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black"); - - titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio)); - axis(4, at=titvaxis, labels=titvaxis, las=2); - - par(par.def); - } -} diff --git a/R/src/gsalib/DESCRIPTION b/R/src/gsalib/DESCRIPTION new file mode 100644 index 000000000..6116e8c66 --- /dev/null +++ b/R/src/gsalib/DESCRIPTION @@ -0,0 +1,10 @@ +Package: gsalib +Type: Package +Title: Utility functions +Version: 1.0 +Date: 2010-10-02 +Author: Kiran Garimella +Maintainer: Kiran Garimella +Description: Utility functions for GATK NGS analyses +License: BSD +LazyLoad: yes diff --git a/R/src/gsalib/R/getargs.R b/R/src/gsalib/R/getargs.R new file mode 100644 index 000000000..f8f30ddd0 --- /dev/null +++ b/R/src/gsalib/R/getargs.R @@ -0,0 +1,116 @@ +.getargs.usage <- function(argspec, doc) { + cargs = commandArgs(); + + usage = "Usage:"; + + fileIndex = grep("--file=", cargs); + if (length(fileIndex) > 0) { + progname = gsub("--file=", "", cargs[fileIndex[1]]); + + usage = sprintf("Usage: Rscript %s [arguments]", progname); + + if (!is.na(doc)) { + message(sprintf("%s: %s\n", progname, doc)); + } + } + + message(usage); + + for (argname in names(argspec)) { + key = argname; + defaultValue = 0; + doc = ""; + + if (is.list(argspec[[argname]])) { + defaultValue = argspec[[argname]]$value; + doc = argspec[[argname]]$doc; + } + + message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc)); + } + + message(""); + + stop(call. = FALSE); +} + +getargs <- function(argspec, doc = NA) { + argsenv = new.env(); + + for (argname in names(argspec)) { + value = 0; + if (is.list(argspec[[argname]])) { + value = argspec[[argname]]$value; + } else { + value = argspec[[argname]]; + } + + assign(argname, value, envir=argsenv); + } + + if (interactive()) { + for (argname in names(argspec)) { + value = get(argname, envir=argsenv); + + if (is.na(value) | is.null(value)) { + if (exists("cmdargs")) { + assign(argname, cmdargs[[argname]], envir=argsenv); + } else { + assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv); + } + } else { + assign(argname, value, envir=argsenv); + } + } + } else { + cargs = commandArgs(TRUE); + + if (length(cargs) == 0) { + .getargs.usage(argspec, doc); + } + + for (i in 1:length(cargs)) { + if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) { + key = gsub("-", "", cargs[i]); + value = cargs[i+1]; + + if (key == "h" | key == "help") { + .getargs.usage(argspec, doc); + } + + if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) { + value = as.numeric(value); + } + + assign(key, value, envir=argsenv); + } + } + } + + args = as.list(argsenv); + + isMissingArgs = 0; + missingArgs = c(); + + for (arg in names(argspec)) { + if (is.na(args[[arg]]) | is.null(args[[arg]])) { + gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg)); + + isMissingArgs = 1; + missingArgs = c(missingArgs, arg); + } + } + + if (isMissingArgs) { + gsa.error( + paste( + "Missing required arguments: -", + paste(missingArgs, collapse=" -"), + ". Specify -h or -help to this script for a list of available arguments.", + sep="" + ) + ); + } + + args; +} diff --git a/R/src/gsalib/R/gsa.error.R b/R/src/gsalib/R/gsa.error.R new file mode 100644 index 000000000..1c6a56046 --- /dev/null +++ b/R/src/gsalib/R/gsa.error.R @@ -0,0 +1,12 @@ +gsa.error <- function(message) { + message(""); + gsa.message("Error: **********"); + gsa.message(sprintf("Error: %s", message)); + gsa.message("Error: **********"); + message(""); + + traceback(); + + message(""); + stop(message, call. = FALSE); +} diff --git a/R/src/gsalib/R/gsa.message.R b/R/src/gsalib/R/gsa.message.R new file mode 100644 index 000000000..a2b909d3d --- /dev/null +++ b/R/src/gsalib/R/gsa.message.R @@ -0,0 +1,3 @@ +gsa.message <- function(message) { + message(sprintf("[gsalib] %s", message)); +} diff --git a/R/src/gsalib/R/gsa.warn.R b/R/src/gsalib/R/gsa.warn.R new file mode 100644 index 000000000..7ee08ce65 --- /dev/null +++ b/R/src/gsalib/R/gsa.warn.R @@ -0,0 +1,3 @@ +gsa.warn <- function(message) { + gsa.message(sprintf("Warning: %s", message)); +} diff --git a/R/src/gsalib/R/plot.venn.R b/R/src/gsalib/R/plot.venn.R new file mode 100644 index 000000000..477fadd75 --- /dev/null +++ b/R/src/gsalib/R/plot.venn.R @@ -0,0 +1,50 @@ +plot.venn <- +function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, + col=c("#FF6342", "#63C6DE", "#ADDE63"), + pos=c(0.20, 0.20, 0.80, 0.82), + debug=0 + ) { + library(png); + library(graphics); + + # Set up properties + for (i in 1:length(col)) { + rgbcol = col2rgb(col[i]); + col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]); + } + + chco = paste(col[1], col[2], col[3], sep=","); + chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=","); + + props = c( + 'cht=v', + 'chs=525x525', + 'chds=0,10000000000', + paste('chco=', chco, sep=""), + paste('chd=t:', chd, sep="") + ); + proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&'); + + # Get the venn diagram (as a temporary file) + filename = tempfile("venn"); + cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep=""); + + if (debug == 1) { + print(cmd); + } + system(cmd); + + # Render the temp png file into a plotting frame + a = readPNG(filename); + + plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab=""); + if (c == 0 || a >= b) { + rasterImage(a, pos[1], pos[2], pos[3], pos[4]); + } else { + rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180); + } + + # Clean up! + unlink(filename); +} + diff --git a/R/src/gsalib/R/read.eval.R b/R/src/gsalib/R/read.eval.R new file mode 100644 index 000000000..2a9ba9e6a --- /dev/null +++ b/R/src/gsalib/R/read.eval.R @@ -0,0 +1,74 @@ +.attemptToLoadFile <- function(filename) { + file = NA; + + if (file.exists(filename) & file.info(filename)$size > 500) { + file = read.csv(filename, header=TRUE, comment.char="#"); + } + + file; +} + +read.eval <- +function(evalRoot) { + fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); + fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); + fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); + fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); + fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); + fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); + fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); + fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); + fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); + fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); + fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); + fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); + fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); + fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); + + eval = list( + AlleleCountStats = NA, + CompOverlap = NA, + CountVariants = NA, + GenotypeConcordance = NA, + MetricsByAc = NA, + MetricsBySample = NA, + Quality_Metrics_by_allele_count = NA, + QualityScoreHistogram = NA, + SampleStatistics = NA, + SampleSummaryStatistics = NA, + SimpleMetricsBySample = NA, + TiTv = NA, + TiTvStats = NA, + Variant_Quality_Score = NA, + + CallsetNames = c(), + CallsetOnlyNames = c(), + CallsetFilteredNames = c() + ); + + eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); + eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); + eval$CountVariants = .attemptToLoadFile(fileCountVariants); + eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); + eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); + eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample); + eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); + eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); + eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); + eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); + eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); + eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); + eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); + eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); + + uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); + eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); + eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); + eval$CallsetFilteredNames = as.vector(c( + paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""), + paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep="")) + ); + + eval; +} + diff --git a/R/GATKReport.R b/R/src/gsalib/R/read.gatkreport.R similarity index 95% rename from R/GATKReport.R rename to R/src/gsalib/R/read.gatkreport.R index 7590a8020..e503e05d9 100644 --- a/R/GATKReport.R +++ b/R/src/gsalib/R/read.gatkreport.R @@ -6,7 +6,9 @@ for (i in 1:ncol(d)) { v = suppressWarnings(as.numeric(d[,i])); - d[,i] = v; + if (length(na.omit(as.numeric(v))) == length(d[,i])) { + d[,i] = v; + } } usedNames = ls(envir=tableEnv, pattern=tableName); diff --git a/R/src/gsalib/Read-and-delete-me b/R/src/gsalib/Read-and-delete-me new file mode 100644 index 000000000..d04323a6e --- /dev/null +++ b/R/src/gsalib/Read-and-delete-me @@ -0,0 +1,9 @@ +* Edit the help file skeletons in 'man', possibly combining help files + for multiple functions. +* Put any C/C++/Fortran code in 'src'. +* If you have compiled code, add a .First.lib() function in 'R' to load + the shared library. +* Run R CMD build to build the package tarball. +* Run R CMD check to check the package tarball. + +Read "Writing R Extensions" for more information. diff --git a/R/src/gsalib/man/getargs.Rd b/R/src/gsalib/man/getargs.Rd new file mode 100644 index 000000000..01f3a2312 --- /dev/null +++ b/R/src/gsalib/man/getargs.Rd @@ -0,0 +1,57 @@ +\name{getargs} +\alias{getargs} +\title{ +Get script arguments +} +\description{ +Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit. +} +\usage{ +getargs(argspec, doc = NA) +} +\arguments{ + \item{argspec}{ +A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example: +\preformatted{argspec = list( + arg1 = list(value=10, doc="Info for optional arg1"), + arg2 = list(value=NA, doc="Info for required arg2") +); +} + +If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for. +} + + \item{doc}{ +An optional string succinctly documenting the purpose of the script. +} +} +\details{ +Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes. + +To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided. +} +\value{ +Returns a list with keys matching the argspec and values representing the specified arguments. + +\item{arg1 }{Value for argument 1} +\item{arg2 }{Value for argument 2} +...etc. +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\examples{ +argspec = list( + file = list(value="/my/test.vcf", doc="VCF file"), + verbose = list(value=0, doc="If 1, set verbose mode"), + test2 = list(value=2.3e9, doc="Another argument that does stuff") +); + +cmdargs = getargs(argspec, doc="My test program"); + +print(cmdargs$file); # will print '[1] "/my/test.vcf"' +} +\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.error.Rd b/R/src/gsalib/man/gsa.error.Rd new file mode 100644 index 000000000..df7c0cbde --- /dev/null +++ b/R/src/gsalib/man/gsa.error.Rd @@ -0,0 +1,49 @@ +\name{gsa.error} +\alias{gsa.error} +\title{ +GSA error +} +\description{ +Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit. +} +\usage{ +gsa.error(message) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{message}{ +The error message to write. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +gsa.error("This is a message"); +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsa.message.Rd b/R/src/gsalib/man/gsa.message.Rd new file mode 100644 index 000000000..9752de8a9 --- /dev/null +++ b/R/src/gsalib/man/gsa.message.Rd @@ -0,0 +1,44 @@ +\name{gsa.message} +\alias{gsa.message} +\title{ +GSA message +} +\description{ +Write a message to standard out with the prefix '[gsalib]'. +} +\usage{ +gsa.message(message) +} +\arguments{ + \item{message}{ +The message to write. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Write message to stdout +gsa.message("This is a message"); +} +\keyword{ ~kwd1 } diff --git a/R/src/gsalib/man/gsa.warn.Rd b/R/src/gsalib/man/gsa.warn.Rd new file mode 100644 index 000000000..0b9770b5c --- /dev/null +++ b/R/src/gsalib/man/gsa.warn.Rd @@ -0,0 +1,46 @@ +\name{gsa.warn} +\alias{gsa.warn} +\title{ +GSA warn +} +\description{ +Write a warning message to standard out with the prefix '[gsalib] Warning:'. +} +\usage{ +gsa.warn(message) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{message}{ +The warning message to write. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Write message to stdout +gsa.warn("This is a warning message"); +} +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/gsalib-package.Rd b/R/src/gsalib/man/gsalib-package.Rd new file mode 100644 index 000000000..b395e090c --- /dev/null +++ b/R/src/gsalib/man/gsalib-package.Rd @@ -0,0 +1,55 @@ +\name{gsalib-package} +\alias{gsalib-package} +\alias{gsalib} +\docType{package} +\title{ +GATK utility analysis functions +} +\description{ +Utility functions for analyzing GATK-processed NGS data +} +\details{ +This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more. +} +\author{ +Genome Sequencing and Analysis Group + +Medical and Population Genetics Program + +Maintainer: Kiran Garimella +} +\references{ +GSA wiki page: http://www.broadinstitute.org/gsa/wiki + +GATK help forum: http://www.getsatisfaction.com/gsa +} +\examples{ +## get script arguments in interactive and non-interactive mode +cmdargs = getargs( list( + requiredArg1 = list( + value = NA, + doc = "Documentation for requiredArg1" + ), + + optionalArg1 = list( + value = 3e9, + doc = "Documentation for optionalArg1" + ) +) ); + +## plot a proportional Venn diagram +plot.venn(500, 250, 0, 100); + +## read a GATKReport file +report = gatk.report("/path/to/my/output.gatkreport"); + +## emit a message +gsa.message("This is a message"); + +## emit a warning message +gsa.message("This is a warning message"); + +## emit an error message +gsa.message("This is an error message"); +} +\keyword{ package } diff --git a/R/src/gsalib/man/plot.venn.Rd b/R/src/gsalib/man/plot.venn.Rd new file mode 100644 index 000000000..70955d9d7 --- /dev/null +++ b/R/src/gsalib/man/plot.venn.Rd @@ -0,0 +1,75 @@ +\name{plot.venn} +\alias{plot.venn} +\title{ +Plot a proportional venn diagram +} +\description{ +Plot a proportional venn diagram (two or three-way venns allowed) +} +\usage{ +plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0) +} +\arguments{ + \item{a}{ +size of 'a' circle +} + \item{b}{ +size of 'b' circle +} + \item{c}{ +size of 'c' circle +} + \item{a_and_b}{ +size of a and b overlap +} + \item{a_and_c}{ +size of a and c overlap +} + \item{b_and_c}{ +size of b and c overlap +} + \item{col}{ +vector of colors for each venn piece +} + \item{pos}{ +vector of positional elements +} + \item{debug}{ +if 1, set debug mode and print useful information +} +} +\details{ +Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways. +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Plot a two-way Venn diagram +plot.venn(1000, 750, 0, 400); + +## Plot a three-way Venn diagram +plot.venn(1000, 750, 900, 400, 650, 500); +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/read.eval.Rd b/R/src/gsalib/man/read.eval.Rd new file mode 100644 index 000000000..cfe32df2e --- /dev/null +++ b/R/src/gsalib/man/read.eval.Rd @@ -0,0 +1,111 @@ +\name{read.eval} +\alias{read.eval} +\title{ +Read a VariantEval file +} +\description{ +Read a VariantEval file that's output in R format. +} +\usage{ +read.eval(evalRoot) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{evalRoot}{ +%% ~~Describe \code{evalRoot} here~~ +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +%% ~~who you are~~ +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function(evalRoot) { + fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); + fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); + fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); + fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); + fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); + fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); + fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); + fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); + fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); + fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); + fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); + fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); + fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); + fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); + + eval = list( + AlleleCountStats = NA, + CompOverlap = NA, + CountVariants = NA, + GenotypeConcordance = NA, + MetricsByAc = NA, + MetricsBySample = NA, + Quality_Metrics_by_allele_count = NA, + QualityScoreHistogram = NA, + SampleStatistics = NA, + SampleSummaryStatistics = NA, + SimpleMetricsBySample = NA, + TiTv = NA, + TiTvStats = NA, + Variant_Quality_Score = NA, + + CallsetNames = c(), + CallsetOnlyNames = c(), + CallsetFilteredNames = c() + ); + + eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); + eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); + eval$CountVariants = .attemptToLoadFile(fileCountVariants); + eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); + eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); + eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample); + eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); + eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); + eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); + eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); + eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); + eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); + eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); + eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); + + uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); + eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); + eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); + eval$CallsetFilteredNames = as.vector(c()); + eval; + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/R/src/gsalib/man/read.gatkreport.Rd b/R/src/gsalib/man/read.gatkreport.Rd new file mode 100644 index 000000000..8d51b6e09 --- /dev/null +++ b/R/src/gsalib/man/read.gatkreport.Rd @@ -0,0 +1,55 @@ +\name{read.gatkreport} +\alias{read.gatkreport} +\title{ +read.gatkreport +} +\description{ +Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list. +} +\usage{ +read.gatkreport(filename) +} +\arguments{ + \item{filename}{ +The path to the GATKReport file. +} +} +\details{ +The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accomodates multiple tables and is still R-loadable - through this function. + +The file format looks like this: +\preformatted{##:GATKReport.v0.1 TableName : The description of the table +col1 col2 col3 +0 0.007451835696110506 25.474613284804366 +1 0.002362777171937477 29.844949954504095 +2 9.087604507451836E-4 32.87590975254731 +3 5.452562704471102E-4 34.498999090081895 +4 9.087604507451836E-4 35.14831665150137 +} + +} +\value{ +Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first will be given names of "TableName.v1", "TableName.v2", ..., "TableName.vN". +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +report = read.gatkreport("/path/to/my/output.gatkreport"); +} +\keyword{ ~kwd1 } diff --git a/build.xml b/build.xml index d70ae0cef..19103054e 100644 --- a/build.xml +++ b/build.xml @@ -676,4 +676,11 @@ + + + + + + +