Merge pull request #767 from broadinstitute/gg_fix_gsalib_doc

Updated gsalib package to match version we put in CRAN
2014-11-04 10:46:27 -05:00 · 2014-11-04 10:46:27 -05:00 · f6af776590
parent 65d834fece e351ca869d
commit f6af776590
23 changed files with 38 additions and 1147 deletions
--- a/public/gsalib/src/R/DESCRIPTION
+++ b/public/gsalib/src/R/DESCRIPTION
@ -1,13 +1,15 @@
 Package: gsalib
 Type: Package
-Title: Utility functions
+Title: Utility functions for GATK
-Version: 1.0
+Version: 2.0
-Date: 2010-10-02
+Date: 2013-07-02
 Imports: gplots, ggplot2, png
 Author: Kiran Garimella
-Maintainer: Mauricio Carneiro <carneiro@broadinstitute.org>
+Maintainer: Geraldine Van der Auwera <vdauwera@broadinstitute.org>
-BugReports: http://gatkforums.broadinstitute.org
+Description: Utility functions for GATK analyses of genome sequence
-Description: Utility functions for GATK NGS analyses
+        data
 License: MIT + file LICENSE
 LazyLoad: yes
 Packaged: 2013-07-02 07:56:13 UTC; gege
 NeedsCompilation: no
 Repository: CRAN
 Date/Publication: 2013-07-03 17:30:49
--- a/public/gsalib/src/R/NAMESPACE
+++ b/public/gsalib/src/R/NAMESPACE
@ -1 +1 @@
-exportPattern("^[^\\.]")
+export(gsa.read.gatkreport)
--- a/public/gsalib/src/R/R/gsa.error.R
+++ b/public/gsalib/src/R/R/gsa.error.R
@ -1,12 +0,0 @@
 gsa.error <- function(message) {
    message("");
    gsa.message("Error: **********");
    gsa.message(sprintf("Error: %s", message));
    gsa.message("Error: **********");
    message("");
    traceback();
    message("");
    stop(message, call. = FALSE);
 }
--- a/public/gsalib/src/R/R/gsa.getargs.R
+++ b/public/gsalib/src/R/R/gsa.getargs.R
@ -1,116 +0,0 @@
 .gsa.getargs.usage <- function(argspec, doc) {
    cargs = commandArgs();
    usage = "Usage:";
    fileIndex = grep("--file=", cargs);
    if (length(fileIndex) > 0) {
        progname = gsub("--file=", "", cargs[fileIndex[1]]);
        usage = sprintf("Usage: Rscript %s [arguments]", progname);
        if (!is.na(doc)) {
            message(sprintf("%s: %s\n", progname, doc));
        }
    }
    message(usage);
    for (argname in names(argspec)) {
        key = argname;
        defaultValue = 0;
        doc = "";
        if (is.list(argspec[[argname]])) {
            defaultValue = argspec[[argname]]$value;
            doc = argspec[[argname]]$doc;
        }
        message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc));
    }
    message("");
    stop(call. = FALSE);
 }
 gsa.getargs <- function(argspec, doc = NA) {
    argsenv = new.env();
    for (argname in names(argspec)) {
        value = 0;
        if (is.list(argspec[[argname]])) {
            value = argspec[[argname]]$value;
        } else {
            value = argspec[[argname]];
        }
        assign(argname, value, envir=argsenv);
    }
    if (interactive()) {
        for (argname in names(argspec)) {
            value = get(argname, envir=argsenv);
            if (is.na(value) | is.null(value)) {
                if (exists("cmdargs")) {
                    assign(argname, cmdargs[[argname]], envir=argsenv);
                } else {
                    assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv);
                }
            } else {
                assign(argname, value, envir=argsenv);
            }
        }
    } else {
        cargs = commandArgs(TRUE);
        if (length(cargs) == 0) {
            .gsa.getargs.usage(argspec, doc);
        }
        for (i in 1:length(cargs)) {
            if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) {
                key = gsub("-", "", cargs[i]);
                value = cargs[i+1];
                if (key == "h" | key == "help") {
                    .gsa.getargs.usage(argspec, doc);
                }
                if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) {
                    value = as.numeric(value);
                }
                assign(key, value, envir=argsenv);
            }
        }
    }
    args = as.list(argsenv);
    isMissingArgs = 0;
    missingArgs = c();
    for (arg in names(argspec)) {
        if (is.na(args[[arg]]) | is.null(args[[arg]])) {
            gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg));
            isMissingArgs = 1;
            missingArgs = c(missingArgs, arg);
        }
    }
    if (isMissingArgs) {
        gsa.error(
            paste(
                "Missing required arguments: -",
                paste(missingArgs, collapse=" -"),
                ".  Specify -h or -help to this script for a list of available arguments.",
                sep=""
            )
        );
    }
    args;
 }
--- a/public/gsalib/src/R/R/gsa.message.R
+++ b/public/gsalib/src/R/R/gsa.message.R
@ -1,3 +0,0 @@
 gsa.message <- function(message) {
    message(sprintf("[gsalib] %s", message));
 }
--- a/public/gsalib/src/R/R/gsa.plot.venn.R
+++ b/public/gsalib/src/R/R/gsa.plot.venn.R
@ -1,50 +0,0 @@
 gsa.plot.venn <-
 function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
                     col=c("#FF6342", "#63C6DE", "#ADDE63"),
                     pos=c(0.20, 0.20, 0.80, 0.82),
                     debug=0
                    ) {
    library(png);
    library(graphics);
    # Set up properties
    for (i in 1:length(col)) {
        rgbcol = col2rgb(col[i]);
        col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]);
    }
    chco = paste(col[1], col[2], col[3], sep=",");
    chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=",");
    props = c(
        'cht=v',
        'chs=525x525',
        'chds=0,10000000000',
        paste('chco=', chco, sep=""),
        paste('chd=t:', chd, sep="")
    );
    proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&');
    # Get the venn diagram (as a temporary file)
    filename = tempfile("venn");
    cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep="");
    if (debug == 1) {
        print(cmd);
    }
    system(cmd);
    # Render the temp png file into a plotting frame
    a = readPNG(filename);
    plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
    if (c == 0 || a >= b) {
        rasterImage(a, pos[1], pos[2], pos[3], pos[4]);
    } else {
        rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
    }
    # Clean up!
    unlink(filename);
 }
--- a/public/gsalib/src/R/R/gsa.read.eval.R
+++ b/public/gsalib/src/R/R/gsa.read.eval.R
@ -1,83 +0,0 @@
 .gsa.attemptToLoadFile <- function(filename) {
    file = NA;
    if (file.exists(filename) & file.info(filename)$size > 500) {
        file = read.csv(filename, header=TRUE, comment.char="#");
    }
    file;
 }
 gsa.read.eval <-
 function(evalRoot) {
    fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
    fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
    fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
    fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
    fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
    fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
    fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
    fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
    fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
    fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
    fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
    fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
    fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
    fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
    eval = list(
        AlleleCountStats = NA,
        CompOverlap = NA,
        CountVariants = NA,
        GenotypeConcordance = NA,
        MetricsByAc = NA,
        MetricsBySample = NA,
        Quality_Metrics_by_allele_count = NA,
        QualityScoreHistogram = NA,
        SampleStatistics = NA,
        SampleSummaryStatistics = NA,
        SimpleMetricsBySample = NA,
        TiTv = NA,
        TiTvStats = NA,
        Variant_Quality_Score = NA,
        CallsetNames = c(),
        CallsetOnlyNames = c(),
        CallsetFilteredNames = c()
    );
    eval$AlleleCountStats                = .gsa.attemptToLoadFile(fileAlleleCountStats);
    eval$CompOverlap                     = .gsa.attemptToLoadFile(fileCompOverlap);
    eval$CountVariants                   = .gsa.attemptToLoadFile(fileCountVariants);
    eval$GenotypeConcordance             = .gsa.attemptToLoadFile(fileGenotypeConcordance);
    eval$MetricsByAc                     = .gsa.attemptToLoadFile(fileMetricsByAc);
    eval$MetricsBySample                 = .gsa.attemptToLoadFile(fileMetricsBySample);
    eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count);
    eval$QualityScoreHistogram           = .gsa.attemptToLoadFile(fileQualityScoreHistogram);
    eval$SampleStatistics                = .gsa.attemptToLoadFile(fileSampleStatistics);
    eval$SampleSummaryStatistics         = .gsa.attemptToLoadFile(fileSampleSummaryStatistics);
    eval$SimpleMetricsBySample           = .gsa.attemptToLoadFile(fileSimpleMetricsBySample);
    eval$TiTv                            = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
    eval$TiTvStats                       = .gsa.attemptToLoadFile(fileTiTvStats);
    eval$Variant_Quality_Score           = .gsa.attemptToLoadFile(fileVariant_Quality_Score);
    uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
    eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
    eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
    eval$CallsetFilteredNames = as.vector(c(
        paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""),
        paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep=""))
    );
    if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) {
        eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep="");
    }
    if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) {
        eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep="");
        #eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep="");
    }
    eval;
 }
--- a/public/gsalib/src/R/R/gsa.read.gatkreport.R
+++ b/public/gsalib/src/R/R/gsa.read.gatkreport.R
@ -112,7 +112,7 @@ gsa.read.gatkreportv1 <- function(lines) {
  finishTable <- function() {
    if ( rowCount == 1 )
-      # good I hate R.  Work around to avoid collapsing into an unstructured vector when 
+      # Workaround to avoid collapsing into an unstructured vector when 
      # there's only 1 row
      sub <- t(as.matrix(tableRows[1:rowCount,]))
    else
--- a/public/gsalib/src/R/R/gsa.read.squidmetrics.R
+++ b/public/gsalib/src/R/R/gsa.read.squidmetrics.R
@ -1,28 +0,0 @@
 gsa.read.squidmetrics = function(project, bylane = FALSE) {
    suppressMessages(library(ROracle));
    drv = dbDriver("Oracle");
    con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
    if (bylane) {
        statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS WHERE \"Project\" = '", project, "'", sep="");
        print(statement);
        rs  = dbSendQuery(con, statement = statement);
        d = fetch(rs, n=-1);
        dbHasCompleted(rs);
        dbClearResult(rs);
    } else {
        statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG WHERE \"Project\" = '", project, "'", sep="");
        print(statement);
        rs = dbSendQuery(con, statement = statement);
        d = fetch(rs, n=-1);
        dbHasCompleted(rs);
        dbClearResult(rs);
    }
    oraCloseDriver(drv);
    subset(d, Project == project);
 }
--- a/public/gsalib/src/R/R/gsa.read.vcf.R
+++ b/public/gsalib/src/R/R/gsa.read.vcf.R
@ -1,23 +0,0 @@
 gsa.read.vcf <- function(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE) {
    headers = readLines(vcffile, n=100);
    headerline = headers[grep("#CHROM", headers)];
    header = unlist(strsplit(gsub("#", "", headerline), "\t"))
    d = read.table(vcffile, header=FALSE, skip=skip, nrows=nrows, stringsAsFactors=FALSE);
    colnames(d) = header;
    if (expandGenotypeFields) {
        columns = ncol(d);
        offset = columns + 1;
        for (sampleIndex in 10:columns) {
            gt = unlist(lapply(strsplit(d[,sampleIndex], ":"), function(x) x[1]));
            d[,offset] = gt;
            colnames(d)[offset] = sprintf("%s.GT", colnames(d)[sampleIndex]);
            offset = offset + 1;
        }
    }
    return(d);
 }
--- a/public/gsalib/src/R/R/gsa.variantqc.utils.R
+++ b/public/gsalib/src/R/R/gsa.variantqc.utils.R
@ -1,246 +0,0 @@
 library(gplots)
 library(ggplot2)
 library(tools)
 # -------------------------------------------------------
 # Utilities for displaying multiple plots per page
 # -------------------------------------------------------
 distributeGraphRows <- function(graphs, heights = c()) {
  # Viewport layout 2 graphs top to bottom with given relative heights
  #
  #
  if (length(heights) == 0) {
    heights <- rep.int(1, length(graphs))
  }
  heights <- heights[!is.na(graphs)]
  graphs <- graphs[!is.na(graphs)]
  numGraphs <- length(graphs)
  Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights)
  grid.newpage()
  pushViewport(viewport(layout = Layout))
  subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
  for (i in 1:numGraphs) {
    print(graphs[[i]], vp = subplot(i))
  }
 }
 distributeLogGraph <- function(graph, xName) {
  continuousGraph <- graph + scale_x_continuous(xName)
  logGraph <- graph + scale_x_log10(xName) + ggtitle("")
  distributeGraphRows(list(continuousGraph, logGraph))
 }
 distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) {
  distributeGraphRows(list(perSampleGraph, distGraph), ratio)
 }
 removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) {
  # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame
  #
  # Only keeps the column marked with "all" for each removed column
  #
  for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) {
    if (toRemove %in% colnames(variantEvalDataFrame)) {
      variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",]
    }
  }
  variantEvalDataFrame    
 }
 openPDF <- function(outputPDF) {
  # Open the outputPDF file with standard dimensions, if outputPDF is not NA
  if ( ! is.na(outputPDF) ) {
    pdf(outputPDF, height=8.5, width=11)
  }
 }
 closePDF <- function(outputPDF) {
  # close the outputPDF file if not NA, and try to compact the PDF if possible
  if ( ! is.na(outputPDF) ) {
    dev.off()
    if (exists("compactPDF")) {
      print("compacting PDF")
      compactPDF(outputPDF)
    }
  }
 }
 makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) {
  if ( is.null(widths) ) widths <- rep(1, length(ACs))
  value = NULL
  titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom)
 }
 .reduceACs <- function(binWidthForAC, ACs) {
  # computes data structures necessary to reduce the full range of ACs
  #
  # binWidthForAC returns the number of upcoming bins that should be merged into 
  # that AC bin.  ACs is a vector of all AC values from 0 to 2N that should be 
  # merged together
  #
  # Returns a list containing the reduced ACs starts, their corresponding widths,
  # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
  maxAC <- max(ACs)
  newACs <- c()
  widths <- c()
  newACMap <- c()
  ac <- 0
  while ( ac < maxAC ) {
    newACs <- c(newACs, ac)
    width <- binWidthForAC(ac)
    widths <- c(widths, width)
    newACMap <- c(newACMap, rep(ac, width))
    ac <- ac + width
  }
  list(ACs = newACs, widths=widths, newACMap = newACMap)
 }
 # geometricACs <- function(k, ACs) {
 #   nBins <- round(k * log10(max(ACs)))
 #   
 #   binWidthForAC <- function(ac) {
 #     max(ceiling(ac / nBins), 1)
 #   }
 #   
 #   return(reduceACs(binWidthForAC, ACs))
 # }
 reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) {
  # map the full range of AC values onto a log linear scale
  #
  # Reduce the full AC range onto one where the width of each new AC increases at a rate of
  # 10^scaleFactor in size with growing AC values.  This is primarily useful for accurately
  # computing ratios or other quantities by AC that aren't well determined when the AC 
  # values are very large
  #
  # Returns a list containing the reduced ACs starts, their corresponding widths,
  # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
  maxAC <- max(ACs)
  afs <- ACs / maxAC
  breaks <- 10^(seq(-4, -1, scaleFactor))
  widths <- c()
  lastBreak <- 1
  for ( i in length(breaks):1 ) {
    b <- breaks[i]
    width <- sum(afs < lastBreak & afs >= b)
    widths <- c(widths, width)
    lastBreak <- b
  }
  widths <- rev(widths)
  binWidthForAC <- function(ac) {
    af <- ac / maxAC
    value = 1
    for ( i in length(breaks):1 )
      if ( af >= breaks[i] ) {
        value = widths[i]
        break
      }
    return(value)
  }
  return(.reduceACs(binWidthForAC, ACs))
 }
 .remapACs <- function(remapper, k, df) {
  newACs <- remapper(k, df$AC)
  n = length(newACs$ACs)
  num = rep(0, n)
  denom = rep(0, n)
  for ( i in 1:dim(df)[1] ) {
    rowI = df$AC == i
    row = df[rowI,]
    newAC = newACs$newACMap[row$AC]
    newRowI = newACs$ACs == newAC
    num[newRowI] = num[newRowI] + df$num[rowI]
    denom[newRowI] = denom[newRowI] + df$denom[rowI]
  }
  newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths )
  newdf
 }
 compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) {
  df = makeRatioDataFrame(ACs, num, denom, 1)
  return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df))
 }
 plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", 
                          fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels", 
                          onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, 
                          moreTitle="", note = NULL) {
  metrics$strat = metrics[[requestedStrat]]
  otherFacet = "."
  id.vars = c("strat", "nobs")
  metrics$nobs <- metrics[[nObsField]]
  # keep track of the other strat and it's implied facet value
  if (! is.null(anotherStrat)) { 
    id.vars = c(id.vars, anotherStrat)
    otherFacet = anotherStrat
  }
  molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures))
  perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable))
  # create the title
  titleText=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle)
  if ( !is.null(note) ) {
    titleText=paste(titleText, note, sep="\n")
  }
  paste(titleText)
  title <- ggtitle(titleText)
  determineFacet <- function(onX) {
    if ( onX ) { 
      paste(otherFacet, "~ variable")
    } else {
      paste("variable ~", otherFacet)
    }
  }
  sampleFacet = determineFacet(facetVariableOnXPerSample)
  distFacet   = determineFacet(facetVariableOnXForDist)
  if ( requestedStrat == "Sample" ) {
    perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
    perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)")
  } else { # by AlleleCount
    perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
    perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
  }    
  perSampleGraph <- perSampleGraph + ylab("Variable value") + title
  perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free")
  nValues = length(unique(molten$value))
  if (nValues > 2) {
    if ( requestedStrat == "Sample" ) {
      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable))
    } else {
      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs))
    }
    distGraph <- distGraph + geom_histogram(aes(y=..ndensity..))
    distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..))
    distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter"))
    scale = "free"
    if ( fixHistogramX ) scale = "fixed"
    distGraph <- distGraph + facet_grid(distFacet, scales=scale)
    distGraph <- distGraph + ylab("Relative frequency")
    distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
    distGraph <- distGraph + theme(axis.text.x=element_text(angle=-45)) # , legend.position="none")
  } else {
    distGraph <- NA
  }
  if ( onSamePage ) {
    suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph))
  } else {
    suppressMessages(print(perSampleGraph))
    suppressMessages(print(distGraph + title))
  }
 }
--- a/public/gsalib/src/R/R/gsa.warn.R
+++ b/public/gsalib/src/R/R/gsa.warn.R
@ -1,3 +0,0 @@
 gsa.warn <- function(message) {
    gsa.message(sprintf("Warning: %s", message));
 }
--- a/public/gsalib/src/R/Read-and-delete-me
+++ b/public/gsalib/src/R/Read-and-delete-me
@ -1,9 +0,0 @@
 * Edit the help file skeletons in 'man', possibly combining help files
  for multiple functions.
 * Put any C/C++/Fortran code in 'src'.
 * If you have compiled code, add a .First.lib() function in 'R' to load
  the shared library.
 * Run R CMD build to build the package tarball.
 * Run R CMD check to check the package tarball.
 Read "Writing R Extensions" for more information.
--- a/public/gsalib/src/R/man/gsa.error.Rd
+++ b/public/gsalib/src/R/man/gsa.error.Rd
@ -1,49 +0,0 @@
 \name{gsa.error}
 \alias{gsa.error}
 \title{
 GSA error
 }
 \description{
 Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit.
 }
 \usage{
 gsa.error(message)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
  \item{message}{
 The error message to write.
 }
 }
 \details{
 %%  ~~ If necessary, more details than the description above ~~
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \note{
 %%  ~~further notes~~
 }
 %% ~Make other sections like Warning with \section{Warning }{....} ~
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 gsa.error("This is a message");
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the
 % R documentation directory.
 \keyword{ ~kwd1 }
 \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
--- a/public/gsalib/src/R/man/gsa.getargs.Rd
+++ b/public/gsalib/src/R/man/gsa.getargs.Rd
@ -1,57 +0,0 @@
 \name{gsa.getargs}
 \alias{gsa.getargs}
 \title{
 Get script arguments
 }
 \description{
 Get script arguments given a list object specifying arguments and documentation.  Can be used in command-line or interactive mode.  This is helpful when developing scripts in interactive mode that will eventually become command-line programs.  If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit.
 }
 \usage{
 gsa.getargs(argspec, doc = NA)
 }
 \arguments{
  \item{argspec}{
 A list object.  Each key is an argument name.  The value is another list object with a 'value' and 'doc' keys.  For example:
 \preformatted{argspec = list(
    arg1 = list(value=10, doc="Info for optional arg1"),
    arg2 = list(value=NA, doc="Info for required arg2")
 );
 }
 If the value provided is NA, the argument is considered required and must be specified when the script is invoked.  For command-line mode, this means the argument must be specified on the command-line.  In interactive mode, there are two ways of specifying these arguments.  First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object.  Otherwise, the argument is prompted for.
 }
  \item{doc}{
 An optional string succinctly documenting the purpose of the script.
 }
 }
 \details{
 Interactive scripts typically make use of hardcoded filepaths and parameter settings.  This makes testing easy, but generalization to non-interactive mode more difficult.  This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes.
 To use this method, specify a list with key-value pairs representing the arguments as specified above.  In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided.
 }
 \value{
 Returns a list with keys matching the argspec and values representing the specified arguments.
 \item{arg1 }{Value for argument 1}
 \item{arg2 }{Value for argument 2}
 ...etc.
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \examples{
 argspec = list(
    file    = list(value="/my/test.vcf", doc="VCF file"),
    verbose = list(value=0,              doc="If 1, set verbose mode"),
    test2   = list(value=2.3e9,          doc="Another argument that does stuff")
 );
 cmdargs = gsa.getargs(argspec, doc="My test program");
 print(cmdargs$file);  # will print '[1] "/my/test.vcf"'
 }
 \keyword{ ~kwd1 }
--- a/public/gsalib/src/R/man/gsa.message.Rd
+++ b/public/gsalib/src/R/man/gsa.message.Rd
@ -1,44 +0,0 @@
 \name{gsa.message}
 \alias{gsa.message}
 \title{
 GSA message
 }
 \description{
 Write a message to standard out with the prefix '[gsalib]'.
 }
 \usage{
 gsa.message(message)
 }
 \arguments{
  \item{message}{
 The message to write.
 }
 }
 \details{
 %%  ~~ If necessary, more details than the description above ~~
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \note{
 %%  ~~further notes~~
 }
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 ## Write message to stdout
 gsa.message("This is a message");
 }
 \keyword{ ~kwd1 }
--- a/public/gsalib/src/R/man/gsa.plot.venn.Rd
+++ b/public/gsalib/src/R/man/gsa.plot.venn.Rd
@ -1,75 +0,0 @@
 \name{gsa.plot.venn}
 \alias{gsa.plot.venn}
 \title{
 Plot a proportional venn diagram
 }
 \description{
 Plot a proportional venn diagram (two or three-way venns allowed)
 }
 \usage{
 gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0)
 }
 \arguments{
  \item{a}{
 size of 'a' circle
 }
  \item{b}{
 size of 'b' circle
 }
  \item{c}{
 size of 'c' circle
 }
  \item{a_and_b}{
 size of a and b overlap
 }
  \item{a_and_c}{
 size of a and c overlap
 }
  \item{b_and_c}{
 size of b and c overlap
 }
  \item{col}{
 vector of colors for each venn piece
 }
  \item{pos}{
 vector of positional elements
 }
  \item{debug}{
 if 1, set debug mode and print useful information
 }
 }
 \details{
 Plots a two-way or three-way proportional Venn diagram.  Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways.
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 }
 \author{
 Kiran Garimella
 }
 \note{
 %%  ~~further notes~~
 }
 %% ~Make other sections like Warning with \section{Warning }{....} ~
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 ## Plot a two-way Venn diagram
 gsa.plot.venn(1000, 750, 0, 400);
 ## Plot a three-way Venn diagram
 gsa.plot.venn(1000, 750, 900, 400, 650, 500);
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the
 % R documentation directory.
 \keyword{ ~kwd1 }
 \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
--- a/public/gsalib/src/R/man/gsa.read.eval.Rd
+++ b/public/gsalib/src/R/man/gsa.read.eval.Rd
@ -1,111 +0,0 @@
 \name{gsa.read.eval}
 \alias{gsa.read.eval}
 \title{
 Read a VariantEval file
 }
 \description{
 Read a VariantEval file that's output in R format.
 }
 \usage{
 gsa.read.eval(evalRoot)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
  \item{evalRoot}{
 %%     ~~Describe \code{evalRoot} here~~
 }
 }
 \details{
 %%  ~~ If necessary, more details than the description above ~~
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 %%  ~~who you are~~
 }
 \note{
 %%  ~~further notes~~
 }
 %% ~Make other sections like Warning with \section{Warning }{....} ~
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 ##---- Should be DIRECTLY executable !! ----
 ##-- ==>  Define data, use random,
 ##--	or do  help(data=index)  for the standard data sets.
 ## The function is currently defined as
 function(evalRoot) {
    fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
    fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
    fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
    fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
    fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
    fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
    fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
    fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
    fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
    fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
    fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
    fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
    fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
    fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
    eval = list(
        AlleleCountStats = NA,
        CompOverlap = NA,
        CountVariants = NA,
        GenotypeConcordance = NA,
        MetricsByAc = NA,
        MetricsBySample = NA,
        Quality_Metrics_by_allele_count = NA,
        QualityScoreHistogram = NA,
        SampleStatistics = NA,
        SampleSummaryStatistics = NA,
        SimpleMetricsBySample = NA,
        TiTv = NA,
        TiTvStats = NA,
        Variant_Quality_Score = NA,
        CallsetNames = c(),
        CallsetOnlyNames = c(),
        CallsetFilteredNames = c()
    );
    eval$AlleleCountStats                = .attemptToLoadFile(fileAlleleCountStats);
    eval$CompOverlap                     = .attemptToLoadFile(fileCompOverlap);
    eval$CountVariants                   = .attemptToLoadFile(fileCountVariants);
    eval$GenotypeConcordance             = .attemptToLoadFile(fileGenotypeConcordance);
    eval$MetricsByAc                     = .attemptToLoadFile(fileMetricsByAc);
    eval$MetricsBySample                 = .attemptToLoadFile(fileMetricsBySample);
    eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count);
    eval$QualityScoreHistogram           = .attemptToLoadFile(fileQualityScoreHistogram);
    eval$SampleStatistics                = .attemptToLoadFile(fileSampleStatistics);
    eval$SampleSummaryStatistics         = .attemptToLoadFile(fileSampleSummaryStatistics);
    eval$SimpleMetricsBySample           = .attemptToLoadFile(fileSimpleMetricsBySample);
    eval$TiTv                            = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
    eval$TiTvStats                       = .attemptToLoadFile(fileTiTvStats);
    eval$Variant_Quality_Score           = .attemptToLoadFile(fileVariant_Quality_Score);
    uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
    eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
    eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
    eval$CallsetFilteredNames = as.vector(c()); 
    eval;
  }
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the
 % R documentation directory.
 \keyword{ ~kwd1 }
 \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
--- a/public/gsalib/src/R/man/gsa.read.gatkreport.Rd
+++ b/public/gsalib/src/R/man/gsa.read.gatkreport.Rd
@ -1,10 +1,10 @@
 \name{gsa.read.gatkreport}
 \alias{gsa.read.gatkreport}
 \title{
-gsa.read.gatkreport
+Function to read in a GATKReport
 }
 \description{
-Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list.
+This function reads in data from a GATKReport. A GATKReport is a document containing multiple tables produced by the GATK. Each table is loaded as a separate data.frame object in a list.
 }
 \usage{
 gsa.read.gatkreport(filename)
@ -15,41 +15,22 @@ The path to the GATKReport file.
 }
 }
 \details{
-The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format.  This format accomodates multiple tables and is still R-loadable - through this function.
+The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format.  This format accommodates multiple tables and is still R-loadable through this function. 
 The file format looks like this:
 \preformatted{##:GATKReport.v0.1 TableName : The description of the table
 col1   col2                      col3
 0      0.007451835696110506      25.474613284804366
 1      0.002362777171937477      29.844949954504095
 2      9.087604507451836E-4      32.87590975254731
 3      5.452562704471102E-4      34.498999090081895
 4      9.087604507451836E-4      35.14831665150137
 }
 }
 \value{
-Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table.  If multiple tables with the same name exist, each one after the first will be given names of "TableName.v1", "TableName.v2", ..., "TableName.vN".
+Returns a LIST object, where each key is the TableName and the value is the data.frame object with the contents of the table.  If multiple tables with the same name exist, each one after the first will be given names of TableName.v1, TableName.v2, ..., TableName.vN.
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
-%% ~put references to the literature/web site here ~
+http://www.broadinstitute.org/gatk/guide/article?id=1244
 }
 \author{
 Kiran Garimella
 }
 \note{
-%%  ~~further notes~~
+This function accepts different versions of the GATKReport format by making internal calls to gsa.read.gatkreportv0() or gsa.read.gatkreportv1() as appropriate.
 }
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
-report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
+test_file = system.file("inst", "extdata", "test_gatkreport.table", package = "gsalib");
 report = gsa.read.gatkreport(test_file);
 }
-\keyword{ ~kwd1 }
+\keyword{ manip }
--- a/public/gsalib/src/R/man/gsa.read.squidmetrics.Rd
+++ b/public/gsalib/src/R/man/gsa.read.squidmetrics.Rd
@ -1,48 +0,0 @@
 \name{gsa.read.squidmetrics}
 \alias{gsa.read.squidmetrics}
 \title{
 gsa.read.squidmetrics
 }
 \description{
 Reads metrics for a specified SQUID project into a dataframe.
 }
 \usage{
 gsa.read.squidmetrics("C315")
 }
 \arguments{
  \item{project}{
 The project for which metrics should be obtained.
 }
  \item{bylane}{
 If TRUE, obtains per-lane metrics rather than the default per-sample metrics.
 }
 }
 \details{
 %%  ~~ If necessary, more details than the description above ~~
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 Returns a data frame with samples (or lanes) as the row and the metric as the column.
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \note{
 This method will only work within the Broad Institute internal network.
 }
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 ## Obtain metrics for project C315.
 d = gsa.read.squidmetrics("C315");
 }
 \keyword{ ~kwd1 }
--- a/public/gsalib/src/R/man/gsa.read.vcf.Rd
+++ b/public/gsalib/src/R/man/gsa.read.vcf.Rd
@ -1,53 +0,0 @@
 \name{gsa.read.vcf}
 \alias{gsa.read.vcf}
 \title{
 gsa.read.vcf
 }
 \description{
 Reads a VCF file into a table.  Optionally expands genotype columns into separate columns containing the genotype, separate from the other fields specified in the FORMAT field.
 }
 \usage{
 gsa.read.vcf(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE)
 }
 \arguments{
  \item{vcffile}{
 The path to the vcf file.
 }
  \item{skip}{
 The number of lines of the data file to skip before beginning to read data.
 }
  \item{nrows}{
 The maximum number of rows to read in.  Negative and other invalid values are ignored.
 }
  \item{expandGenotypeFields}{
 If TRUE, adds an additional column per sample containing just the genotype.
 }
 }
 \details{
 The VCF format is the standard variant call file format used in the GATK.  This function reads that data in as a table for easy analysis.
 }
 \value{
 Returns a data.frame object, where each column corresponds to the columns in the VCF file.
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \note{
 %%  ~~further notes~~
 }
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 vcf = gsa.read.vcf("/path/to/my/output.vcf");
 }
 \keyword{ ~kwd1 }
--- a/public/gsalib/src/R/man/gsa.warn.Rd
+++ b/public/gsalib/src/R/man/gsa.warn.Rd
@ -1,46 +0,0 @@
 \name{gsa.warn}
 \alias{gsa.warn}
 \title{
 GSA warn
 }
 \description{
 Write a warning message to standard out with the prefix '[gsalib] Warning:'.
 }
 \usage{
 gsa.warn(message)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
  \item{message}{
 The warning message to write.
 }
 }
 \details{
 %%  ~~ If necessary, more details than the description above ~~
 }
 \value{
 %%  ~Describe the value returned
 %%  If it is a LIST, use
 %%  \item{comp1 }{Description of 'comp1'}
 %%  \item{comp2 }{Description of 'comp2'}
 %% ...
 }
 \references{
 %% ~put references to the literature/web site here ~
 }
 \author{
 Kiran Garimella
 }
 \note{
 %%  ~~further notes~~
 }
 \seealso{
 %% ~~objects to See Also as \code{\link{help}}, ~~~
 }
 \examples{
 ## Write message to stdout
 gsa.warn("This is a warning message");
 }
 \keyword{ ~kwd1 }
 \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
--- a/public/gsalib/src/R/man/gsalib-package.Rd
+++ b/public/gsalib/src/R/man/gsalib-package.Rd
@ -3,68 +3,32 @@
 \alias{gsalib}
 \docType{package}
 \title{
-GATK utility analysis functions
+Utility functions for GATK
 }
 \description{
-Utility functions for analyzing GATK-processed NGS data
+Utility functions for analysis of genome sequence data with the GATK
 }
 \details{
-This package contains functions for working with GATK-processed NGS data.  These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more.
+\tabular{ll}{
 Package: \tab gsalib\cr
 Type: \tab Package\cr
 Version: \tab 1.0\cr
 Date: \tab 2013-07-01\cr
 License: \tab What license is it under?\cr
 LazyLoad: \tab yes\cr
 }
 This package is primarily meant to be used programmatically by GATK tools. However the gsa.read.gatkreport() function can be used to easily read in data from a GATKReport. A GATKReport is a multi-table document generated by GATK tools. 
 }
 \author{
-Genome Sequencing and Analysis Group
+Kiran Garimella
-Medical and Population Genetics Program
+Maintainer: Geraldine Van der Auwera <vdauwera@broadinstitute.org>
 Maintainer: Kiran Garimella
 }
 \references{
-GATK website: http://www.broadinstitute.org/gatk
+http://www.broadinstitute.org/gatk/guide/article?id=1244
 GATK documentation guide: http://www.broadinstitute.org/gatk/guide
 GATK help forum: http://gatkforums.broadinstitute.org
 }
 \examples{
 ## get script arguments in interactive and non-interactive mode
 cmdargs = gsa.getargs( list(
    requiredArg1 = list(
        value = NA,
        doc   = "Documentation for requiredArg1"
    ),
    optionalArg1 = list(
        value = 3e9,
        doc   = "Documentation for optionalArg1"
    )
 ) );
 ## plot a proportional Venn diagram
 gsa.plot.venn(500, 250, 0, 100);
 ## read a GATKReport file
 report = gsa.gatk.report("/path/to/my/output.gatkreport");
 ## emit a message
 gsa.message("This is a message");
 ## emit a warning message
 gsa.message("This is a warning message");
 ## emit an error message
 gsa.message("This is an error message");
 ## read the SQUID metrics for a given sequencing project (internal to the Broad only)
 s = gsa.read.squidmetrics("C427");
 ## read command-line arguments
 cmdargs = gsa.getargs(
    list(
        file    = list(value="/my/test.vcf", doc="VCF file"),
        verbose = list(value=0,              doc="If 1, set verbose mode"),
        test2   = list(value=2.3e9,          doc="Another argument that does stuff")
    ),
    doc="My test program"
 );
 }
 \keyword{ package }
 \examples{
 test_file = system.file("inst", "extdata", "test_gatkreport.table", package = "gsalib");
 report = gsa.read.gatkreport(test_file);
 }
		`@ -1 +1 @@`
			`exportPattern("^[^\\.]")`				`export(gsa.read.gatkreport)`