Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2011-07-27 16:15:23 -04:00
commit ff31fa7990
38 changed files with 1200 additions and 77 deletions

View File

@ -520,7 +520,7 @@
<fileset dir="${java.classes}">
<include name="**/utils/codecs/**/*.class"/>
<include name="**/utils/variantcontext/**/*.class"/>
<include name="**/UserException*.class"/>
<include name="org/broadinstitute/sting/utils/exceptions/**"/>
<include name="org/broadinstitute/sting/utils/help/DocumentedGATKFeature.class"/>
</fileset>
</jar>
@ -1089,7 +1089,7 @@
<!-- Build gsalib R module -->
<target name="gsalib">
<exec executable="R" failonerror="true">
<arg line="R CMD INSTALL -l private/R/ private/R/src/gsalib/" />
<arg line="R CMD INSTALL -l public/R/ public/R/src/gsalib/" />
</exec>
</target>
</project>

View File

@ -0,0 +1,10 @@
Package: gsalib
Type: Package
Title: Utility functions
Version: 1.0
Date: 2010-10-02
Author: Kiran Garimella
Maintainer: Kiran Garimella <kiran@broadinstitute.org>
Description: Utility functions for GATK NGS analyses
License: BSD
LazyLoad: yes

View File

@ -0,0 +1,12 @@
# Report a fatal error: print a framed "[gsalib] Error:" banner, show a
# traceback for debugging, then halt execution without echoing the call.
#
# Args:
#   message: the error text to display and pass to stop().
gsa.error <- function(message) {
    message("");
    banner <- c("Error: **********",
                sprintf("Error: %s", message),
                "Error: **********");
    for (bannerLine in banner) {
        gsa.message(bannerLine);
    }
    message("");
    traceback();
    message("");
    stop(message, call. = FALSE);
}

View File

@ -0,0 +1,116 @@
# Print a usage banner for a script driven by gsa.getargs(), then stop.
#
# Args:
#   argspec: named list of argument specs; each element is either a bare
#            default value or a list with 'value' and 'doc' entries.
#   doc:     one-line script description, or NA to omit it.
#
# Always terminates via stop(call. = FALSE) after printing.
.gsa.getargs.usage <- function(argspec, doc) {
    cmdline <- commandArgs();
    usage <- "Usage:";

    # When run via Rscript, recover the script name from the --file= argument
    # so the usage line and description can mention it.
    scriptArg <- grep("--file=", cmdline);
    if (length(scriptArg) > 0) {
        scriptName <- gsub("--file=", "", cmdline[scriptArg[1]]);
        usage <- sprintf("Usage: Rscript %s [arguments]", scriptName);
        if (!is.na(doc)) {
            message(sprintf("%s: %s\n", scriptName, doc));
        }
    }
    message(usage);

    # One line per argument: name, default, and documentation (if provided).
    for (argName in names(argspec)) {
        spec <- argspec[[argName]];
        defaultValue <- 0;
        docText <- "";
        if (is.list(spec)) {
            defaultValue <- spec$value;
            docText <- spec$doc;
        }
        message(sprintf(" -%-10s\t[default: %s]\t%s", argName, defaultValue, docText));
    }

    message("");
    stop(call. = FALSE);
}
# Resolve script arguments from an argspec list, in either interactive or
# command-line mode.
#
# Args:
#   argspec: named list; each element is either a bare default value or a
#            list with 'value' (NA marks the argument as required) and 'doc'.
#   doc:     optional one-line script description used in the usage banner.
#
# Interactive mode: required values are taken from a visible 'cmdargs' list
# (e.g. the result of a previous gsa.getargs() call) if one exists, otherwise
# prompted for with readline().  Command-line mode: values are parsed from
# "-name value" pairs; "-h"/"-help" (or no arguments at all) prints usage and
# exits.  Values that look numeric are converted with as.numeric().
#
# Returns a named list of resolved argument values.  Missing required
# arguments are reported through gsa.warn()/gsa.error().
gsa.getargs <- function(argspec, doc = NA) {
    argsenv = new.env();

    # Seed the environment with the declared defaults.
    for (argname in names(argspec)) {
        value = 0;
        if (is.list(argspec[[argname]])) {
            value = argspec[[argname]]$value;
        } else {
            value = argspec[[argname]];
        }
        assign(argname, value, envir=argsenv);
    }

    if (interactive()) {
        for (argname in names(argspec)) {
            value = get(argname, envir=argsenv);
            # BUG FIX: check is.null() first and short-circuit with ||.
            # is.na(NULL) returns logical(0), so the original
            # 'if (is.na(value) | is.null(value))' raised a zero-length
            # condition error whenever a default was NULL.
            if (is.null(value) || is.na(value)) {
                if (exists("cmdargs")) {
                    # Reuse values from a previous gsa.getargs() invocation.
                    assign(argname, cmdargs[[argname]], envir=argsenv);
                } else {
                    assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv);
                }
            } else {
                assign(argname, value, envir=argsenv);
            }
        }
    } else {
        cargs = commandArgs(TRUE);
        if (length(cargs) == 0) {
            .gsa.getargs.usage(argspec, doc);
        }
        for (i in 1:length(cargs)) {
            if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) {
                # BUG FIX: strip only the leading dash(es).  The original
                # gsub("-", "", ...) also deleted hyphens inside argument
                # names, so "-my-arg" was stored under "myarg".
                key = sub("^-+", "", cargs[i]);
                value = cargs[i+1];

                if (key == "h" || key == "help") {
                    .gsa.getargs.usage(argspec, doc);
                }

                # Convert values made up only of digits / '.' / 'e' / '+' / '-'.
                if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) {
                    value = as.numeric(value);
                }

                assign(key, value, envir=argsenv);
            }
        }
    }

    args = as.list(argsenv);

    # Verify that every required argument ended up with a usable value.
    isMissingArgs = 0;
    missingArgs = c();
    for (arg in names(argspec)) {
        # Same NULL-before-NA ordering fix as above.
        if (is.null(args[[arg]]) || is.na(args[[arg]])) {
            gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg));
            isMissingArgs = 1;
            missingArgs = c(missingArgs, arg);
        }
    }

    if (isMissingArgs) {
        gsa.error(
            paste(
                "Missing required arguments: -",
                paste(missingArgs, collapse=" -"),
                ". Specify -h or -help to this script for a list of available arguments.",
                sep=""
            )
        );
    }

    args;
}

View File

@ -0,0 +1,3 @@
# Emit a status line tagged with the "[gsalib]" prefix so package output can
# be told apart from other console traffic.
#
# Args:
#   message: the text to emit (written to stderr via message()).
gsa.message <- function(message) {
    message(paste("[gsalib]", message));
}

View File

@ -0,0 +1,50 @@
# Plot a proportional two- or three-way Venn diagram.
#
# The diagram is rendered remotely by the Google Chart API (requires network
# access and the external 'wget' program), fetched to a temporary PNG, and
# drawn into the current plot window with rasterImage() so it can be
# annotated afterwards.
#
# Args:
#   a, b, c:                   sizes of the three circles (c == 0 => two-way).
#   a_and_b, a_and_c, b_and_c: sizes of the pairwise overlaps.
#   col:   colors for the three circles (any form col2rgb() accepts).
#   pos:   c(x0, y0, x1, y1) placement of the image within the unit square.
#   debug: if 1, print the wget command line before running it.
gsa.plot.venn <-
function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
         col=c("#FF6342", "#63C6DE", "#ADDE63"),
         pos=c(0.20, 0.20, 0.80, 0.82),
         debug=0
) {
    library(png);
    library(graphics);

    # Convert the colors to the plain RRGGBB hex form the chart API expects.
    for (i in seq_along(col)) {
        rgbcol = col2rgb(col[i]);
        col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]);
    }
    chco = paste(col[1], col[2], col[3], sep=",");
    chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=",");
    props = c(
        'cht=v',
        'chs=525x525',
        'chds=0,10000000000',
        paste('chco=', chco, sep=""),
        paste('chd=t:', chd, sep="")
    );
    proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&');

    # Fetch the rendered diagram into a temporary PNG file.
    # NOTE(review): the Google Chart image API has been retired; this URL will
    # need to point at a live replacement service to keep working.
    filename = tempfile("venn");
    cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep="");
    if (debug == 1) {
        print(cmd);
    }
    system(cmd);

    # Render the PNG into a fresh, unadorned plotting frame.
    # BUG FIX: the original assigned the image to 'a', clobbering the circle
    # size that the orientation test below still needs.
    img = readPNG(filename);
    plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
    if (c == 0 || a >= b) {
        rasterImage(img, pos[1], pos[2], pos[3], pos[4]);
    } else {
        # Flip the image 180 degrees when circle b dominates a three-way
        # diagram -- presumably to keep labels aligned with the chart API's
        # draw order; TODO confirm intent.
        rasterImage(img, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
    }

    # Remove the temporary download.
    unlink(filename);
}

View File

@ -0,0 +1,83 @@
# Load a CSV report file if it exists and looks non-trivial.
#
# Files of 500 bytes or less are treated as empty/header-only output and
# skipped (threshold inherited from the original pipeline -- TODO confirm).
#
# Args:
#   filename: path to the CSV file; '#' lines are treated as comments.
#
# Returns a data.frame on success, NA if the file is absent or too small.
.gsa.attemptToLoadFile <- function(filename) {
    result = NA;
    # && short-circuits, so file.info() is only consulted for existing files.
    if (file.exists(filename) && file.info(filename)$size > 500) {
        result = read.csv(filename, header=TRUE, comment.char="#");
    }
    result;
}
# Read the per-module CSV tables written by a GATK VariantEval run and bundle
# them into a single list.
#
# evalRoot is the common filename prefix: each module's table is expected at
# "<evalRoot>.<ModuleName>.csv".  Tables whose file is missing or near-empty
# are left as NA in the result (see .gsa.attemptToLoadFile).
gsa.read.eval <-
function(evalRoot) {
# Expected path for each evaluation module's CSV output.
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
# Result skeleton: one slot per module (NA until successfully loaded) plus
# callset-name vectors that are derived below.
eval = list(
AlleleCountStats = NA,
CompOverlap = NA,
CountVariants = NA,
GenotypeConcordance = NA,
MetricsByAc = NA,
MetricsBySample = NA,
Quality_Metrics_by_allele_count = NA,
QualityScoreHistogram = NA,
SampleStatistics = NA,
SampleSummaryStatistics = NA,
SimpleMetricsBySample = NA,
TiTv = NA,
TiTvStats = NA,
Variant_Quality_Score = NA,
CallsetNames = c(),
CallsetOnlyNames = c(),
CallsetFilteredNames = c()
);
# Load whichever module outputs actually exist on disk.
eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats);
eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap);
eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants);
eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance);
eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc);
eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample);
eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count);
eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram);
eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics);
eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics);
eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample);
eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats);
eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score);
# Derive callset names from the TiTv table's jexl_expression column:
# expressions other than FilteredIn*/Intersection/none are the "<name>-only"
# callsets, and stripping "-only" yields the callset names themselves.
# NOTE(review): this assumes eval$TiTv loaded successfully and that exactly
# two callsets were compared -- confirm for other eval configurations.
uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
# Build the "InX-FilteredInY" pair labels, upper-casing the first letter of
# each callset name (perl \U in the replacement).
eval$CallsetFilteredNames = as.vector(c(
paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""),
paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep=""))
);
# Fall back to the plain "In<name>" spelling when the upper-cased label does
# not appear among the observed jexl expressions.
if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) {
eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep="");
}
if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) {
eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep="");
#eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep="");
}
eval;
}

View File

@ -0,0 +1,64 @@
# Load a table into the specified environment. Make sure that each new table gets a unique name (this allows one to cat a bunch of tables with the same name together and load them into R without each table overwriting the last.
# Store a parsed GATKReport table into tableEnv as a data.frame.
#
# Character columns whose values are all numeric are converted to numeric.
# If a table with the same name was already stored (several reports catted
# together), the new one is suffixed ".1", ".2", ... instead of overwriting.
#
# Args:
#   tableName:   name of the table (key in the environment).
#   tableHeader: character vector of column names.
#   tableRows:   character matrix of row data (one row per report line).
#   tableEnv:    environment receiving the data.frame.
.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
    d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE);
    colnames(d) = tableHeader;

    # Convert fully-numeric character columns in place.
    for (i in seq_len(ncol(d))) {
        v = suppressWarnings(as.numeric(d[,i]));
        if (length(na.omit(v)) == length(d[,i])) {
            d[,i] = v;
        }
    }

    # Count earlier tables with exactly this name (or this name plus a
    # suffix).  BUG FIX: the original used ls(pattern=tableName), which
    # treats the name as an unanchored regular expression and so miscounts
    # when one table name is a prefix of another or contains metacharacters.
    existing = ls(envir=tableEnv);
    usedNames = existing[existing == tableName | startsWith(existing, paste(tableName, ".", sep=""))];
    if (length(usedNames) > 0) {
        tableName = paste(tableName, ".", length(usedNames), sep="");
    }

    assign(tableName, d, envir=tableEnv);
}
# Load all GATKReport tables from a file
# Parse a GATKReport file -- a multi-table text document -- and return a
# named list with one data.frame per table (duplicate table names get
# numeric suffixes; see .gsa.assignGATKTableToEnvironment).
#
# Args:
#   filename: path to the GATKReport file.
gsa.read.gatkreport <- function(filename) {
    # Slurp the whole report into memory.
    fileCon <- file(filename, "r", blocking = TRUE);
    reportLines <- readLines(fileCon);
    close(fileCon);

    tables <- new.env();
    currentName <- NA;
    currentHeader <- c();
    currentRows <- c();

    # Persist the table accumulated so far (if any) into the environment.
    flushTable <- function() {
        if (!is.na(currentName)) {
            .gsa.assignGATKTableToEnvironment(currentName, currentHeader, currentRows, tables);
        }
    }

    for (reportLine in reportLines) {
        if (grepl("^##:GATKReport.v0.1[[:space:]]+", reportLine, ignore.case = TRUE)) {
            # A new table header: flush the previous table and start afresh.
            flushTable();
            headerFields <- unlist(strsplit(reportLine, "[[:space:]]+"));
            currentName <- headerFields[2];
            currentHeader <- c();
            currentRows <- c();
        } else if (grepl("^[[:space:]]*$", reportLine) | grepl("^[[:space:]]*#", reportLine)) {
            # Blank lines and comments carry no table data.
        } else if (!is.na(currentName)) {
            # First data line is the column header; the rest are rows.
            fields <- unlist(strsplit(reportLine, "[[:space:]]+"));
            if (length(currentHeader) == 0) {
                currentHeader <- fields;
            } else {
                currentRows <- rbind(currentRows, fields);
            }
        }
    }
    flushTable();

    invisible(as.list(tables));
}

View File

@ -0,0 +1,28 @@
# Read Picard/SQUID sequencing metrics for a project from the Broad
# reporting database (works only on the Broad internal network).
#
# Args:
#   project: SQUID project identifier (e.g. "C315").
#   bylane:  if TRUE, return per-lane metrics (ILLUMINA_PICARD_METRICS);
#            otherwise per-sample aggregates (ILLUMINA_SAMPLE_STATUS_AGG).
#
# Returns a data.frame with one row per sample (or lane).
gsa.read.squidmetrics = function(project, bylane = FALSE) {
    suppressMessages(library(ROracle));

    drv = dbDriver("Oracle");
    con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");

    # The two modes differ only in the table queried, so build one statement.
    # NOTE(review): 'project' is interpolated directly into the SQL string;
    # callers must not pass untrusted input (ROracle of this vintage lacks
    # bind-parameter support in dbSendQuery -- confirm before hardening).
    table = if (bylane) "ILLUMINA_PICARD_METRICS" else "ILLUMINA_SAMPLE_STATUS_AGG";
    statement = paste("SELECT * FROM ", table, " WHERE \"Project\" = '", project, "'", sep="");
    print(statement);

    rs = dbSendQuery(con, statement = statement);
    d = fetch(rs, n=-1);
    dbHasCompleted(rs);
    dbClearResult(rs);

    # BUG FIX: release the connection; the original only closed the driver
    # and leaked the database session.
    dbDisconnect(con);
    oraCloseDriver(drv);  # NOTE(review): DBI convention is dbUnloadDriver(drv) -- confirm against the installed ROracle version.

    # Defensive re-filter; the SQL WHERE clause should already restrict rows.
    subset(d, Project == project);
}

View File

@ -0,0 +1,23 @@
# Read a VCF file into a data.frame.
#
# Args:
#   vcffile: path to the VCF file.
#   skip:    number of file lines to skip before reading data (passed to
#            read.table).
#   nrows:   maximum number of data rows to read; -1 reads everything.
#   expandGenotypeFields: if TRUE, append one "<sample>.GT" column per sample
#       holding just the genotype (the first ":"-separated FORMAT field).
#
# Returns a data.frame whose columns match the VCF's #CHROM header line.
gsa.read.vcf <- function(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE) {
    # Scan the meta-data header for the "#CHROM" column line.
    # BUG FIX: the original only inspected the first 100 lines, which fails
    # on files with long "##" meta-data sections, and silently produced a
    # broken data.frame when no header was found.
    con = file(vcffile, "r");
    headerline = NULL;
    repeat {
        line = readLines(con, n=1);
        if (length(line) == 0 || !startsWith(line, "#")) {
            break;  # EOF or first data line: the header section is over
        }
        if (startsWith(line, "#CHROM")) {
            headerline = line;
            break;
        }
    }
    close(con);
    if (is.null(headerline)) {
        stop("No #CHROM header line found in ", vcffile, call. = FALSE);
    }
    header = unlist(strsplit(sub("^#", "", headerline), "\t"));

    # read.table's default comment.char "#" skips the header lines for us.
    d = read.table(vcffile, header=FALSE, skip=skip, nrows=nrows, stringsAsFactors=FALSE);
    colnames(d) = header;

    if (expandGenotypeFields) {
        columns = ncol(d);
        offset = columns + 1;
        # Per the VCF spec, sample columns start at column 10 (after FORMAT);
        # guard so sites-only VCFs don't iterate a descending range.
        if (columns >= 10) {
            for (sampleIndex in 10:columns) {
                gt = unlist(lapply(strsplit(d[,sampleIndex], ":"), function(x) x[1]));
                d[,offset] = gt;
                colnames(d)[offset] = sprintf("%s.GT", colnames(d)[sampleIndex]);
                offset = offset + 1;
            }
        }
    }
    return(d);
}

View File

@ -0,0 +1,3 @@
# Emit a warning-level note, delegating to gsa.message() so the "[gsalib]"
# prefix is applied uniformly.
#
# Args:
#   message: the warning text to display.
gsa.warn <- function(message) {
    gsa.message(paste("Warning:", message));
}

View File

@ -0,0 +1,9 @@
* Edit the help file skeletons in 'man', possibly combining help files
for multiple functions.
* Put any C/C++/Fortran code in 'src'.
* If you have compiled code, add a .First.lib() function in 'R' to load
the shared library.
* Run R CMD build to build the package tarball.
* Run R CMD check to check the package tarball.
Read "Writing R Extensions" for more information.

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

View File

@ -0,0 +1,49 @@
\name{gsa.error}
\alias{gsa.error}
\title{
GSA error
}
\description{
Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit.
}
\usage{
gsa.error(message)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{message}{
The error message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
gsa.error("This is a message");
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -0,0 +1,57 @@
\name{gsa.getargs}
\alias{gsa.getargs}
\title{
Get script arguments
}
\description{
Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit.
}
\usage{
gsa.getargs(argspec, doc = NA)
}
\arguments{
\item{argspec}{
A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example:
\preformatted{argspec = list(
arg1 = list(value=10, doc="Info for optional arg1"),
arg2 = list(value=NA, doc="Info for required arg2")
);
}
If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for.
}
\item{doc}{
An optional string succinctly documenting the purpose of the script.
}
}
\details{
Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes.
To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided.
}
\value{
Returns a list with keys matching the argspec and values representing the specified arguments.
\item{arg1 }{Value for argument 1}
\item{arg2 }{Value for argument 2}
...etc.
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\examples{
argspec = list(
file = list(value="/my/test.vcf", doc="VCF file"),
verbose = list(value=0, doc="If 1, set verbose mode"),
test2 = list(value=2.3e9, doc="Another argument that does stuff")
);
cmdargs = gsa.getargs(argspec, doc="My test program");
print(cmdargs$file); # will print '[1] "/my/test.vcf"'
}
\keyword{ ~kwd1 }

View File

@ -0,0 +1,44 @@
\name{gsa.message}
\alias{gsa.message}
\title{
GSA message
}
\description{
Write a message to standard out with the prefix '[gsalib]'.
}
\usage{
gsa.message(message)
}
\arguments{
\item{message}{
The message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Write message to stdout
gsa.message("This is a message");
}
\keyword{ ~kwd1 }

View File

@ -0,0 +1,75 @@
\name{gsa.plot.venn}
\alias{gsa.plot.venn}
\title{
Plot a proportional venn diagram
}
\description{
Plot a proportional venn diagram (two or three-way venns allowed)
}
\usage{
gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0)
}
\arguments{
\item{a}{
size of 'a' circle
}
\item{b}{
size of 'b' circle
}
\item{c}{
size of 'c' circle
}
\item{a_and_b}{
size of a and b overlap
}
\item{a_and_c}{
size of a and c overlap
}
\item{b_and_c}{
size of b and c overlap
}
\item{col}{
vector of colors for each venn piece
}
\item{pos}{
vector of positional elements
}
\item{debug}{
if 1, set debug mode and print useful information
}
}
\details{
Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways.
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Plot a two-way Venn diagram
gsa.plot.venn(1000, 750, 0, 400);
## Plot a three-way Venn diagram
gsa.plot.venn(1000, 750, 900, 400, 650, 500);
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -0,0 +1,111 @@
\name{gsa.read.eval}
\alias{gsa.read.eval}
\title{
Read a VariantEval file
}
\description{
Read a VariantEval file that's output in R format.
}
\usage{
gsa.read.eval(evalRoot)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{evalRoot}{
%% ~~Describe \code{evalRoot} here~~
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
%% ~~who you are~~
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
function(evalRoot) {
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
eval = list(
AlleleCountStats = NA,
CompOverlap = NA,
CountVariants = NA,
GenotypeConcordance = NA,
MetricsByAc = NA,
MetricsBySample = NA,
Quality_Metrics_by_allele_count = NA,
QualityScoreHistogram = NA,
SampleStatistics = NA,
SampleSummaryStatistics = NA,
SimpleMetricsBySample = NA,
TiTv = NA,
TiTvStats = NA,
Variant_Quality_Score = NA,
CallsetNames = c(),
CallsetOnlyNames = c(),
CallsetFilteredNames = c()
);
eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats);
eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap);
eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants);
eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance);
eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc);
eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample);
eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count);
eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram);
eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics);
eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics);
eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample);
eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats);
eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score);
uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
eval$CallsetFilteredNames = as.vector(c());
eval;
}
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -0,0 +1,55 @@
\name{gsa.read.gatkreport}
\alias{gsa.read.gatkreport}
\title{
gsa.read.gatkreport
}
\description{
Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list.
}
\usage{
gsa.read.gatkreport(filename)
}
\arguments{
\item{filename}{
The path to the GATKReport file.
}
}
\details{
The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accommodates multiple tables and is still R-loadable - through this function.
The file format looks like this:
\preformatted{##:GATKReport.v0.1 TableName : The description of the table
col1 col2 col3
0 0.007451835696110506 25.474613284804366
1 0.002362777171937477 29.844949954504095
2 9.087604507451836E-4 32.87590975254731
3 5.452562704471102E-4 34.498999090081895
4 9.087604507451836E-4 35.14831665150137
}
}
\value{
Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first will be given names of "TableName.v1", "TableName.v2", ..., "TableName.vN".
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
}
\keyword{ ~kwd1 }

View File

@ -0,0 +1,48 @@
\name{gsa.read.squidmetrics}
\alias{gsa.read.squidmetrics}
\title{
gsa.read.squidmetrics
}
\description{
Reads metrics for a specified SQUID project into a dataframe.
}
\usage{
gsa.read.squidmetrics(project, bylane = FALSE)
}
\arguments{
\item{project}{
The project for which metrics should be obtained.
}
\item{bylane}{
If TRUE, obtains per-lane metrics rather than the default per-sample metrics.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
Returns a data frame with samples (or lanes) as the row and the metric as the column.
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
This method will only work within the Broad Institute internal network.
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Obtain metrics for project C315.
d = gsa.read.squidmetrics("C315");
}
\keyword{ ~kwd1 }

View File

@ -0,0 +1,53 @@
\name{gsa.read.vcf}
\alias{gsa.read.vcf}
\title{
gsa.read.vcf
}
\description{
Reads a VCF file into a table. Optionally expands genotype columns into separate columns containing the genotype, separate from the other fields specified in the FORMAT field.
}
\usage{
gsa.read.vcf(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE)
}
\arguments{
\item{vcffile}{
The path to the vcf file.
}
\item{skip}{
The number of lines of the data file to skip before beginning to read data.
}
\item{nrows}{
The maximum number of rows to read in. Negative and other invalid values are ignored.
}
\item{expandGenotypeFields}{
If TRUE, adds an additional column per sample containing just the genotype.
}
}
\details{
The VCF format is the standard variant call file format used in the GATK. This function reads that data in as a table for easy analysis.
}
\value{
Returns a data.frame object, where each column corresponds to the columns in the VCF file.
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
vcf = gsa.read.vcf("/path/to/my/output.vcf");
}
\keyword{ ~kwd1 }

View File

@ -0,0 +1,46 @@
\name{gsa.warn}
\alias{gsa.warn}
\title{
GSA warn
}
\description{
Write a warning message to standard out with the prefix '[gsalib] Warning:'.
}
\usage{
gsa.warn(message)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{message}{
The warning message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Write message to stdout
gsa.warn("This is a warning message");
}
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -0,0 +1,68 @@
\name{gsalib-package}
\alias{gsalib-package}
\alias{gsalib}
\docType{package}
\title{
GATK utility analysis functions
}
\description{
Utility functions for analyzing GATK-processed NGS data
}
\details{
This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more.
}
\author{
Genome Sequencing and Analysis Group
Medical and Population Genetics Program
Maintainer: Kiran Garimella
}
\references{
GSA wiki page: http://www.broadinstitute.org/gsa/wiki
GATK help forum: http://www.getsatisfaction.com/gsa
}
\examples{
## get script arguments in interactive and non-interactive mode
cmdargs = gsa.getargs( list(
requiredArg1 = list(
value = NA,
doc = "Documentation for requiredArg1"
),
optionalArg1 = list(
value = 3e9,
doc = "Documentation for optionalArg1"
)
) );
## plot a proportional Venn diagram
gsa.plot.venn(500, 250, 0, 100);
## read a GATKReport file
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
## emit a message
gsa.message("This is a message");
## emit a warning message
gsa.warn("This is a warning message");
## emit an error message (commented out: gsa.error() stops execution)
# gsa.error("This is an error message");
## read the SQUID metrics for a given sequencing project (internal to the Broad only)
s = gsa.read.squidmetrics("C427");
## read command-line arguments
cmdargs = gsa.getargs(
list(
file = list(value="/my/test.vcf", doc="VCF file"),
verbose = list(value=0, doc="If 1, set verbose mode"),
test2 = list(value=2.3e9, doc="Another argument that does stuff")
),
doc="My test program"
);
}
\keyword{ package }

View File

@ -174,7 +174,8 @@ public class ArgumentDefinitions implements Iterable<ArgumentDefinition> {
static DefinitionMatcher VerifiableDefinitionMatcher = new DefinitionMatcher() {
public boolean matches( ArgumentDefinition definition, Object key ) {
return definition.validation != null;
// We can perform some sort of validation for anything that isn't a flag.
return !definition.isFlag;
}
};
}

View File

@ -44,7 +44,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
public final String label;
/**
* Maps indicies of command line arguments to values paired with that argument.
* Maps indices of command line arguments to values paired with that argument.
*/
public final SortedMap<Integer,List<String>> indices = new TreeMap<Integer,List<String>>();

View File

@ -41,6 +41,11 @@ import java.util.*;
* A parser for Sting command-line arguments.
*/
public class ParsingEngine {
/**
* The loaded argument sources along with their back definitions.
*/
private Map<ArgumentDefinition,ArgumentSource> argumentSourcesByDefinition = new HashMap<ArgumentDefinition,ArgumentSource>();
/**
* A list of defined arguments against which command lines are matched.
* Package protected for testing access.
@ -107,8 +112,13 @@ public class ParsingEngine {
*/
public void addArgumentSource( String sourceName, Class sourceClass ) {
List<ArgumentDefinition> argumentsFromSource = new ArrayList<ArgumentDefinition>();
for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) )
argumentsFromSource.addAll( argumentSource.createArgumentDefinitions() );
for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) {
List<ArgumentDefinition> argumentDefinitions = argumentSource.createArgumentDefinitions();
for(ArgumentDefinition argumentDefinition: argumentDefinitions) {
argumentSourcesByDefinition.put(argumentDefinition,argumentSource);
argumentsFromSource.add( argumentDefinition );
}
}
argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) );
}
@ -199,16 +209,25 @@ public class ParsingEngine {
throw new InvalidArgumentException( invalidArguments );
}
// Find invalid argument values (arguments that fail the regexp test.
// Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression.
if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) {
Collection<ArgumentDefinition> verifiableArguments =
argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher );
Collection<Pair<ArgumentDefinition,String>> invalidValues = new ArrayList<Pair<ArgumentDefinition,String>>();
for( ArgumentDefinition verifiableArgument: verifiableArguments ) {
ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument );
// Check to see whether an argument value was specified. Argument values must be provided
// when the argument name is specified and the argument is not a flag type.
for(ArgumentMatch verifiableMatch: verifiableMatches) {
ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument);
if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault())
invalidValues.add(new Pair<ArgumentDefinition,String>(verifiableArgument,null));
}
// Ensure that the field contents meet the validation criteria specified by the regular expression.
for( ArgumentMatch verifiableMatch: verifiableMatches ) {
for( String value: verifiableMatch.values() ) {
if( !value.matches(verifiableArgument.validation) )
if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) )
invalidValues.add( new Pair<ArgumentDefinition,String>(verifiableArgument, value) );
}
}
@ -515,10 +534,14 @@ class InvalidArgumentValueException extends ArgumentException {
private static String formatArguments( Collection<Pair<ArgumentDefinition,String>> invalidArgumentValues ) {
StringBuilder sb = new StringBuilder();
for( Pair<ArgumentDefinition,String> invalidValue: invalidArgumentValues ) {
sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)",
invalidValue.first.fullName,
invalidValue.second,
invalidValue.first.validation) );
if(invalidValue.getSecond() == null)
sb.append( String.format("%nArgument '--%s' requires a value but none was provided",
invalidValue.first.fullName) );
else
sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)",
invalidValue.first.fullName,
invalidValue.second,
invalidValue.first.validation) );
}
return sb.toString();
}

View File

@ -893,6 +893,7 @@ public class SAMDataSource {
* Custom representation of interval bounds.
* Makes it simpler to track current position.
*/
private int[] intervalContigIndices;
private int[] intervalStarts;
private int[] intervalEnds;
@ -917,12 +918,14 @@ public class SAMDataSource {
if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalContigIndices = new int[intervals.size()];
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalStarts[i] = (int)interval.getStart();
intervalEnds[i] = (int)interval.getStop();
intervalContigIndices[i] = interval.getContigIndex();
intervalStarts[i] = interval.getStart();
intervalEnds[i] = interval.getStop();
i++;
}
}
@ -961,11 +964,10 @@ public class SAMDataSource {
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(!keepOnlyUnmappedReads) {
// Mapped read filter; check against GenomeLoc-derived bounds.
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
if(readEndsOnOrAfterStartingBound(candidateRead)) {
// This read ends after the current interval begins.
// Promising, but this read must be checked against the ending bound.
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
@ -993,6 +995,37 @@ public class SAMDataSource {
candidateRead = iterator.next();
}
}
/**
* Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
* end will be distorted, so rely only on the alignment start.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
return
// Read ends on a later contig, or...
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
// Read ends of this contig...
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
// either after this location, or...
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
// read is unmapped but positioned and alignment start is on or after this start point.
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
}
/**
* Check whether the read lies before the end of the current bound.
* @param read The read to position-check.
* @return True if the read starts after the current bounds. False otherwise.
*/
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
return
// Read starts on a prior contig, or...
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
// Read starts on this contig and the alignment start is registered before this end point.
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
}
}
/**

View File

@ -122,14 +122,16 @@ public class VariantContextAdaptors {
Map<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
if ( DbSNPHelper.isDeletion(dbsnp) ) {
boolean vcIsDeletion = DbSNPHelper.isDeletion(dbsnp) || DbSNPHelper.isComplexIndel(dbsnp);
if ( vcIsDeletion ) {
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
if ( index < 0 )
return null; // we weren't given enough reference context to create the VariantContext
attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(ref.getBases()[index]));
}
Collection<Genotype> genotypes = null;
VariantContext vc = new VariantContext(name, dbsnp.getChr(),dbsnp.getStart() - (DbSNPHelper.isDeletion(dbsnp) ? 1 : 0),dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes);
VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (vcIsDeletion ? 1 : 0),dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes);
return vc;
} else
return null; // can't handle anything else

View File

@ -117,7 +117,11 @@ public class DbSNPHelper {
}
public static boolean isIndel(DbSNPFeature feature) {
return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del");
return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature);
}
public static boolean isComplexIndel(DbSNPFeature feature) {
return feature.getVariantType().contains("in-del");
}
public static boolean isHapmap(DbSNPFeature feature) {

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.diffengine;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -35,34 +34,45 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import java.io.File;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* A generic engine for comparing tree-structured objects
*
* <p>
* Compares two record-oriented files, itemizing specific difference between equivalent
* records in the two files. Reports both itemized and summarized differences.
* <p>
* <b>What are the summarized differences and the DiffObjectsWalker</b>
* Compares two record-oriented files, itemizing specific difference between equivalent
* records in the two files. Reports both itemized and summarized differences.
* </p>
*
* <h3>What are the summarized differences and the DiffObjectsWalker?</h3>
*
* <p>
* The GATK contains a summarizing difference engine that compares hierarchical data structures to emit:
* <ul>
* <li>A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G.
* <li>A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed.
* </ul>
* <ul>
* <li>A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G.
* <li>A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed.
* </ul>
* </p>
*
* <p>
* The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you.
* The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you.
* </p>
*
* <h3>Why?</h3>
*
* <p>
* <b>Why?</b>
* <p>
* The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.
* The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.
* </p>
*
* <p>Understanding the output
* <p>The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named
* nodes. Suppose I have two trees:
* <h2>Input</h2>
* <p>
* The DiffObjectsWalker works with BAM or VCF files.
* </p>
*
* <h2>Output</h2>
* <p>
* The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named
* nodes. Suppose I have two trees:
* <pre>
* Tree1=(A=1 B=(C=2 D=3))
* Tree2=(A=1 B=(C=3 D=3 E=4))
@ -70,33 +80,37 @@ import java.util.List;
* </pre>
* <p>
* where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine
* traverses these data structures by name, identifies equivalent nodes by fully qualified names
* (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are).
* These itemized differences are listed as:
* traverses these data structures by name, identifies equivalent nodes by fully qualified names
* (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are).
* These itemized differences are listed as:
* <pre>
* Tree1.B.C=2 != Tree2.B.C=3
* Tree1.B.C=2 != Tree3.B.C=4
* Tree2.B.C=3 != Tree3.B.C=4
* Tree1.B.E=MISSING != Tree2.B.E=4
* </pre>
*
* <p>
* This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though
* is that it computes similarity among the itemized differences and displays the count of differences names
* in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs
* only once. So the summary is:
* This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though
* is that it computes similarity among the itemized differences and displays the count of differences names
* in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs
* only once. So the summary is:
*
* <pre>
* *.B.C : 3
* *.B.E : 1
* </pre>
* <p>where the * operator indicates that any named field matches. This output is sorted by counts, and provides an
* immediate picture of the commonly occurring differences among the files.
*
* <p>
* Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines,
* detected by the integrationtest integration (more below). You can see that in the although there are many specific
* instances of these differences between the two files, the summarized differences provide an immediate picture that
* the AC, AF, and AN fields are the major causes of the differences.
* where the * operator indicates that any named field matches. This output is sorted by counts, and provides an
* immediate picture of the commonly occurring differences among the files.
* <p>
* Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines,
* detected by the integrationtest integration (more below). You can see that in the although there are many specific
* instances of these differences between the two files, the summarized differences provide an immediate picture that
* the AC, AF, and AN fields are the major causes of the differences.
* <p>
*
* <pre>
[testng] path count
[testng] *.*.*.AC 6

View File

@ -234,7 +234,7 @@ public class PhaseByTransmission extends RodWalker<Integer, Integer> {
finalGenotypes.add(father);
finalGenotypes.add(child);
if (mother.isCalled() && father.isCalled() && child.isCalled() && !(mother.isHet() && father.isHet() && child.isHet())) {
if (mother.isCalled() && father.isCalled() && child.isCalled()) {
ArrayList<Genotype> possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother);
ArrayList<Genotype> possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father);
ArrayList<Genotype> possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child);
@ -265,12 +265,14 @@ public class PhaseByTransmission extends RodWalker<Integer, Integer> {
}
}
Map<String, Object> attributes = new HashMap<String, Object>();
attributes.putAll(bestChildGenotype.getAttributes());
attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) {
Map<String, Object> attributes = new HashMap<String, Object>();
attributes.putAll(bestChildGenotype.getAttributes());
attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
}
}
return finalGenotypes;

View File

@ -67,7 +67,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false)
protected String[] STRATIFICATIONS_TO_USE = {};
@Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)")
@Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)", required=false)
protected Boolean NO_STANDARD_STRATIFICATIONS = false;
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
@ -77,7 +77,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)")
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false)
protected Boolean NO_STANDARD_MODULES = false;
// Other arguments

View File

@ -92,7 +92,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
for(Tag tag: classdoc.firstSentenceTags())
summaryBuilder.append(tag.text());
root.put("summary", summaryBuilder.toString());
root.put("description", classdoc.commentText());
root.put("description", classdoc.commentText().substring(summaryBuilder.toString().length()));
root.put("timestamp", toProcess.buildTimestamp);
root.put("version", toProcess.absoluteVersion);

View File

@ -79,8 +79,8 @@ public class MD5DB {
* @return
*/
public static String getMD5FilePath(final String md5, final String valueIfNotFound) {
// we prefer the local db to the global DB, so match it first
for ( String dir : Arrays.asList(LOCAL_MD5_DB_DIR, GLOBAL_MD5_DB_DIR)) {
// we prefer the global db to the local DB, so match it first
for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) {
File f = getFileForMD5(md5, dir);
if ( f.exists() && f.canRead() )
return f.getPath();

View File

@ -20,7 +20,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("ff02b1583ee3a12ed66a9c0e08e346b2")
Arrays.asList("45fef0e23113e2fcd9570379e2fc1b75")
);
executeTest("testBasicFunctionality", spec);
}

View File

@ -6,10 +6,10 @@
</#macro>
<#macro headerInfo>
</#macro>
<#macro footerInfo>
<p class="see-also">See also <a href="index.html">Main index</a> | <a href="http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit">GATK wiki</a> | <a href="http://getsatisfaction.com/gsa">GATK support forum</a></p>
<p class="version">GATK version ${version} built at ${timestamp}.</p>
</#macro>
<#macro footerInfo>
</#macro>

View File

@ -53,19 +53,18 @@
<body>
<h1>${name}</h1>
<@headerInfo />
<h2>Brief summary</h2>
${summary}
<p class="summary">${summary}</p>
<#if author??>
<h2>Author</h2>
${author}
</#if>
<h2>Detailed description</h2>
<h2>Introduction</h2>
${description}
<#-- Create the argument summary -->
<#if arguments.all?size != 0>
<hr>
<h2>Feature specific arguments</h2>
<h2>${name} specific arguments</h2>
<table id="hor-minimalist-b">
<thead>
<tr>

View File

@ -14,29 +14,67 @@ p, ul, ol, dl, dt, dd, td
font-size: 12pt;
}
p.version, p.see-also
p
{
font-size: 8pt;
margin-left: 1em;
}
h1, h2, h3
p.summary
{
margin-left: 2em;
margin-top: -20pt;
font-style: italic;
}
p.see-also
{
font-size: 10pt;
margin-left: 0em;
margin-top: 3em;
text-align: center;
}
p.version
{
font-size: 8pt;
margin-left: 0em;
margin-top: -8pt;
text-align: center;
}
h1, h2, h3, h4
{
font-family: Corbel, Arial, Helvetica, Sans-Serif;
font-weight: bold;
text-align: left;
color: #669;
}
h1
{
font-size: 32pt;
letter-spacing: -2px;
color: #669;
}
h3
h2
{
font-size: 16pt;
font-weight: normal;
font-size: 16pt;
font-weight: bold;
margin-top: 2em;
color: #669;
}
h3
{
font-size: 12pt;
margin-left: 1em;
color: #000;
}
hr
{
margin-top: 4em;
}
/*