From b57a0a03102c1c880f9f6c8aef67301765be0ea2 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 15 Sep 2010 00:45:13 +0000 Subject: [PATCH] improvements to the report code git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4280 348d0f76-0448-11de-a6fe-93d51630548a --- R/GATKRunReport.R | 121 ++++++++++++++++++++++++------------ python/analyzeRunReports.py | 66 +++++++++++++++----- 2 files changed, 130 insertions(+), 57 deletions(-) diff --git a/R/GATKRunReport.R b/R/GATKRunReport.R index a809b8aa8..6be5aa9c0 100644 --- a/R/GATKRunReport.R +++ b/R/GATKRunReport.R @@ -6,7 +6,7 @@ if (! is.na(args[3]) ) { name = args[3] } else { name = "" } if ( onCMDLine ) { print(paste("Reading data from", args[1])) d = read.table(args[1], header=T, sep="\t") - d$start.time = as.Date(d$start.time) + #d$start.time = as.Date(d$start.time) d$end.time = as.Date(d$end.time) } # only read into d if its' available, otherwise assume the data is already loaded @@ -14,11 +14,26 @@ reportCountingPlot <- function(values, name, moreMargin = 0, ...) { par(las=2) # make label text perpendicular to axis oldMar <- par("mar") par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin. - barplot(sort(table(values)), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", ...) + barplot(sort(table(factor(values))), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", ...) par("mar" = oldMar) par("las" = 1) } +reportConditionalCountingPlot <- function(values, conditions, name, moreMargin = 0, ...) { + par(las=2) # make label text perpendicular to axis + oldMar <- par("mar") + par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin. + t = table(values, conditions) + t = t[, order(colSums(t))] + print(list(t = t)) + nconds = dim(t)[2] + cols = rainbow(nconds) + barplot(t, legend.text = T, horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", col=cols, cex=0.5, ...) + par("mar" = oldMar) + par("las" = 1) +} + + reportHist <- function(values, name, ...) { if ( ! all(is.na(values) ) ) hist(values, main=name, 20, xlab="", col="cornflowerblue", ...) @@ -37,18 +52,20 @@ myTable <- function(x, y, reqRowNonZero = F) { # todo -- must be robust to smaller sizes -plotTable <- function(table, name) { +plotTable <- function(table, name, ...) { ncols = dim(table)[2] nrows = dim(table)[1] - cols = rainbow(nrows) - tableMin = min(apply(table, 2, min)) - tableMax = max(apply(table, 2, max)) - plot( as.numeric(apply(table, 2, sum)), ylim=c(tableMin, tableMax), type="n", main = name, ylab="Frequency", xlab="Date", xaxt="n") - axis(1, 1:ncols, labels=colnames(table)) - for ( i in 1:nrows ) - points(table[i,], type="b", col=cols[i]) - legend("topright", row.names(table), fill=cols, cex=0.5) - #return(table) + if ( ! is.null(nrows) ) { + cols = rainbow(nrows) + tableMin = min(apply(table, 2, min)) + tableMax = max(apply(table, 2, max)) + plot( as.numeric(apply(table, 2, sum)), ylim=c(tableMin, tableMax), type="n", main = name, ylab="Frequency", xlab="Date", xaxt="n", ...) + axis(1, 1:ncols, labels=colnames(table)) + for ( i in 1:nrows ) + points(table[i,], type="b", col=cols[i]) + legend("topright", row.names(table), fill=cols, cex=0.5) + #return(table) + } } RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure @@ -61,38 +78,61 @@ successfulRuns <- function(d) { return(x) } +addSection <- function(name) { + par("mar", c(5, 4, 4, 2)) + frame() + title(name, cex=2) +} + generateOneReport <- function(d, header, includeByWeek = T) { head <- function(s) { - return(paste("Section:", header, ":", s)) + return(paste("Section:", header, "\n", s)) } excepted <- subset(d, exception.msg != "NA") - badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME) + UserExceptions <- subset(excepted, is.user.exception == "true") + StingExceptions <- subset(excepted, is.user.exception == "false" | is.user.exception == "NA" | is.na(is.user.exception)) + + addSection(paste("GATK run report", name, "for", Sys.Date(), "\nwith", dim(d)[1], "run repository records")) + + reportCountingPlot(d$walker.name, head("Walker invocations")) + reportConditionalCountingPlot(d$user.name, d$walker.name, head("Walker invocations by user")) + reportCountingPlot(d$svn.version, head("SVN version")) + reportConditionalCountingPlot(d$svn.version, d$user.name, head("SVN by user")) - par("mar", c(5, 4, 4, 2)) - frame() - title(paste("GATK run report", name, "for", Sys.Date(), "\nwith", dim(d)[1], "run repository records"), cex=2) # cuts by time if ( includeByWeek ) { - plotTable(table(rep("GATK Invocations", length(d$start.time)), cut(d$start.time, "weeks")), head("GATK Invocations by week")) - plotTable(myTable(successfulRuns(d), cut(d$start.time, "weeks")), head("Successful and failing GATK invocations per week")) + plotTable(table(rep("GATK Invocations", length(d$end.time)), cut(d$end.time, "weeks")), head("GATK Invocations by week")) + plotTable(myTable(successfulRuns(d), cut(d$end.time, "weeks")), head("Successful and failing GATK invocations per week")) - plotTable(myTable(d$svn.version, cut(d$start.time, "weeks")), head("SVN version by week")) - plotTable(myTable(excepted$walker.name, cut(excepted$start.time, "weeks"), reqRowNonZero = T), head("Walkers with exceptions by week")) + plotTable(myTable(d$svn.version, cut(d$end.time, "weeks")), head("SVN version by week")) } - plotTable(table(rep("GATK Invocations", length(d$start.time)), d$start.time), head("GATK Invocations by day")) - plotTable(myTable(d$svn.version, d$start.time), head("SVN version by day")) + plotTable(table(rep("GATK Invocations", length(d$end.time)), d$end.time), head("GATK Invocations by day")) + plotTable(myTable(d$svn.version, d$end.time), head("SVN version by day")) - reportCountingPlot(d$walker.name, head("Walker invocations")) - reportCountingPlot(d$svn.version, head("GATK SVN version")) + # + # Exception handling + # + addExceptionSection <- function(subd, subname, exceptionColor) { + addSection(paste(subname)) + #print(list(subd = length(subd$end.time), name=subname)) + reportCountingPlot(subd$walker.name, head(paste("Walkers with", subname)), col=exceptionColor) + reportCountingPlot(subd$exception.at, head(paste(subname, "locations")), 12, col=exceptionColor) + reportCountingPlot(subd$exception.msg, head(paste(subname, "messages")), 12, col=exceptionColor) + reportConditionalCountingPlot(subd$user.name, subd$exception.at, head("Walker invocations by user"), 12) + + if ( includeByWeek && length(subd$end.time) > 0 ) { + plotTable(myTable(subd$walker.name, cut(subd$end.time, "weeks"), reqRowNonZero = T), head(paste("Walkers with", subname,"by week")), col=exceptionColor) + } + } + + addExceptionSection(excepted, "Exceptions", "grey") + reportCountingPlot(excepted$user.name, head("Usernames generating exceptions"), col="grey") + + addExceptionSection(StingExceptions, "StingExceptions", "red") + addExceptionSection(UserExceptions, "UserExceptions", "blue") - # reportCountingPlot(d$java.tmp.directory, head("Java tmp directory")) - reportCountingPlot(d$working.directory, head("Working directory")) - reportCountingPlot(d$user.name, head("user")) - reportCountingPlot(d$host.name, head("host")) - reportCountingPlot(d$java, head("Java version")) - reportCountingPlot(d$machine, head("Machine")) Gb <- 1024^3 reportHist(d$total.memory / Gb, head("Used memory")) @@ -100,21 +140,20 @@ generateOneReport <- function(d, header, includeByWeek = T) { min <- 60 reportHist(log10(d$run.time / min), head("Run time (log10[min])")) - - exceptionColor = "red" - reportCountingPlot(excepted$walker.name, head("Walker exceptions"), col=exceptionColor) - reportCountingPlot(subset(excepted, run.time > RUNNING_GATK_RUNTIME)$walker.name, paste(head("Long-running walker exceptions (>"),RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor) - reportCountingPlot(subset(excepted, run.time < RUNNING_GATK_RUNTIME)$walker.name, paste(head("Start-up walker exceptions (<"),RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor) - reportCountingPlot(excepted$user.name, head("Usernames generating exceptions"), col=exceptionColor) - reportCountingPlot(excepted$exception.msg, head("Exception messages"), 12) - reportCountingPlot(excepted$exception.at, head("Exception locations"), 12) + + reportCountingPlot(d$user.name, head("user")) + reportCountingPlot(d$host.name, head("host")) + + reportCountingPlot(d$java, head("Java version")) + reportCountingPlot(d$machine, head("Machine")) + reportCountingPlot(d$working.directory, head("Working directory")) } RUNME = T if ( RUNME ) { - lastWeek = levels(cut(d$start.time, "weeks"))[-1] + lastWeek = levels(cut(d$end.time, "weeks"))[-1] generateOneReport(d, "Overall") - #generateOneReport(subset(d, start.time >= lastWeek), "Just last week to date", includeByWeek = F) + #generateOneReport(subset(d, end.time >= lastWeek), "Just last week to date", includeByWeek = F) } if ( onCMDLine ) dev.off() diff --git a/python/analyzeRunReports.py b/python/analyzeRunReports.py index 2f22912c1..fcba9578d 100755 --- a/python/analyzeRunReports.py +++ b/python/analyzeRunReports.py @@ -5,6 +5,7 @@ from itertools import * from xml.etree.ElementTree import * import gzip import datetime +import re MISSING_VALUE = "NA" RUN_REPORT_LIST = "GATK-run-reports" @@ -101,9 +102,20 @@ def eltIsException(elt): def parseException(elt): msgElt = elt.find("message") msgText = "MISSING" + userException = "NA" if msgElt != None: msgText = msgElt.text - return msgText, elt.find("stacktrace").find("string").text + stackTrace = elt.find("stacktrace").find("string").text + if elt.find("is-user-exception") != None: + #print elt.find("is-user-exception") + userException = elt.find("is-user-exception").text + return msgText, stackTrace, userException +def javaExceptionFile(javaException): + m = re.search("\((.*\.java:.*)\)", javaException) + if m != None: + return m.group(1) + else: + javaException class RecordDecoder: def __init__(self): @@ -118,6 +130,12 @@ class RecordDecoder: def formatExceptionAt(elt): return '%s' % parseException(elt)[1] + + def formatExceptionAtBrief(elt): + return '%s' % javaExceptionFile(parseException(elt)[1]) + + def formatExceptionUser(elt): + return '%s' % parseException(elt)[2] def add(names, func): for name in names: @@ -132,7 +150,7 @@ class RecordDecoder: add(["run-time", "java-tmp-directory", "working-directory", "user-name", "host-name"], id) add(["java", "machine"], toString) add(["max-memory", "total-memory", "iterations", "reads"], id) - addComplex("exception", ["exception-msg", "exception-at"], [formatExceptionMsg, formatExceptionAt]) + addComplex("exception", ["exception-msg", "exception-at", "exception-at-brief", "is-user-exception"], [formatExceptionMsg, formatExceptionAt, formatExceptionAtBrief, formatExceptionUser]) # add(["command-line"], toString) def decode(self, report): @@ -234,17 +252,18 @@ class ExceptionReport(StageHandler): commonExceptions = sorted(commonExceptions, None, lambda x: x.counts) for common in commonExceptions: - msg, at, svns, walkers, counts, ids, duration, users = common.toStrings() + msg, at, svns, walkers, counts, ids, duration, users, userError = common.toStrings() print >> self.out, ''.join(['*'] * 80) - print >> self.out, 'Exception :', msg - print >> self.out, ' at :', at - print >> self.out, ' walkers :', walkers - print >> self.out, ' svns :', svns - print >> self.out, ' duration :', duration - print >> self.out, ' occurrences :', counts - print >> self.out, ' users :', users - print >> self.out, ' ids :', ids + print >> self.out, 'Exception :', msg + print >> self.out, ' is-user-exception? :', userError + print >> self.out, ' at :', at + print >> self.out, ' walkers :', walkers + print >> self.out, ' svns :', svns + print >> self.out, ' duration :', duration + print >> self.out, ' occurrences :', counts + print >> self.out, ' users :', users + print >> self.out, ' ids :', ids class CommonException: MAX_SET_ITEMS_TO_SHOW = 5 @@ -254,6 +273,7 @@ class CommonException: self.at = ex['exception-at'] self.svns = set([ex['svn-version']]) self.users = set([ex['user-name']]) + self.userError = ex['is-user-exception'] self.counts = 1 self.times = set([decodeTime(ex['start-time'])]) self.walkers = set([ex['walker-name']]) @@ -285,11 +305,17 @@ class CommonException: return ','.join(s) def duration(self): - x = sorted(self.times) - return "-".join(map(lambda x: x.strftime("%m/%d/%y"), [x[0], x[-1]])) + x = sorted(filter(lambda x: x != "ND", self.times)) + if len(x) >= 2: + return "-".join(map(lambda x: x.strftime("%m/%d/%y"), [x[0], x[-1]])) + elif len(x) == 1: + return x[0] + else: + return "ND" + def toStrings(self): - return [self.bestExample(self.msgs), self.at, self.setString(self.svns), self.setString(self.walkers), self.counts, self.setString(self.ids), self.duration(), self.setString(self.users)] + return [self.bestExample(self.msgs), self.at, self.setString(self.svns), self.setString(self.walkers), self.counts, self.setString(self.ids), self.duration(), self.setString(self.users), self.userError] addHandler('exceptions', ExceptionReport) @@ -354,7 +380,10 @@ def resolveFiles(paths): return allFiles def decodeTime(time): - return datetime.datetime.strptime(time.split()[0], "%Y/%m/%d") + if time == "ND": + return "ND" + else: + return datetime.datetime.strptime(time.split()[0], "%Y/%m/%d") #return datetime.datetime.strptime(time, "%Y/%m/%d %H.%M.%S") def passesFilters(elt): @@ -377,7 +406,12 @@ def readReports(files): for file in files: if OPTIONS.verbose: print 'Reading file', file input = openFile(file) - tree = ElementTree(file=input) + try: + tree = ElementTree(file=input) + except: + print "EXCEPTING FILE", file + raise + elem = tree.getroot() if elem.tag == RUN_REPORT_LIST: for sub in elem: