diff --git a/R/GATKRunReport.R b/R/GATKRunReport.R index 6c918d6c5..418257a56 100644 --- a/R/GATKRunReport.R +++ b/R/GATKRunReport.R @@ -34,6 +34,8 @@ myTable <- function(x, y, reqRowNonZero = F) { return(table) } +# todo -- must be robust to smaller sizes + plotTable <- function(table, name) { ncols = dim(table)[2] nrows = dim(table)[1] @@ -47,25 +49,34 @@ plotTable <- function(table, name) { } RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure -excepted <- subset(d, exception.msg != "NA") -badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME) if ( onCMDLine ) pdf(args[2]) -generateOneReport <- function(d, header) { +generateOneReport <- function(d, header, includeByWeek = T) { head <- function(s) { return(paste("Section:", header, ":", s)) } + excepted <- subset(d, exception.msg != "NA") + badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME) + par("mar", c(5, 4, 4, 2)) frame() title(paste("Section:", header), cex=2) reportCountingPlot(d$walker.name, head("Walker invocations")) reportCountingPlot(d$svn.version, head("GATK SVN version")) - reportCountingPlot(d$java.tmp.directory, head("Java tmp directory")) + + # cuts by time + plotTable(myTable(d$svn.version, d$start.time), head("SVN version by day")) + if ( includeByWeek ) { + plotTable(myTable(d$svn.version, cut(d$start.time, "weeks")), head("SVN version by week")) + plotTable(myTable(excepted$walker.name, cut(excepted$start.time, "weeks"), reqRowNonZero = T), head("Walkers with exceptions by week")) + } + + # reportCountingPlot(d$java.tmp.directory, head("Java tmp directory")) reportCountingPlot(d$working.directory, head("Working directory")) - reportCountingPlot(d$user.name, head("User")) + reportCountingPlot(d$user.name, head("user")) reportCountingPlot(d$host.name, head("host")) reportCountingPlot(d$java, head("Java version")) reportCountingPlot(d$machine, head("Machine")) @@ -88,15 +99,9 @@ generateOneReport <- function(d, header) { RUNME = T if ( RUNME ) { -generateOneReport(d, "Overall") - -lastWeek = levels(cut(d$start.time, "weeks"))[-1] -generateOneReport(subset(d, start.time == lastWeek), "Just last week to date") - -# cuts by time -plotTable(myTable(d$svn.version, d$start.time), "SVN version by day") -plotTable(myTable(d$svn.version, cut(d$start.time, "weeks")), "SVN version by week") -plotTable(myTable(excepted$walker.name, cut(excepted$start.time, "weeks"), reqRowNonZero = T), "Walkers with exceptions by week") + lastWeek = levels(cut(d$start.time, "weeks"))[-1] + generateOneReport(d, "Overall") + #generateOneReport(subset(d, start.time >= lastWeek), "Just last week to date", includeByWeek = F) } if ( onCMDLine ) dev.off() diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py index 3a4e90a7f..07d5b50d9 100755 --- a/python/1kgStatsForCalls.py +++ b/python/1kgStatsForCalls.py @@ -8,6 +8,13 @@ import itertools import re import vcfReader import string +import gzip + +def openMaybeGZ(filename): + if ( filename.endswith(".gz") ): + return gzip.open(filename) + else: + return open(filename) def average(l): sum = reduce(operator.add, l, 0) @@ -35,7 +42,7 @@ class Sample: def flatFileIterator(file, fields = None, skip = 0): count = 0 - for line in open(file): + for line in openMaybeGZ(file): count += 1 if count > skip: s = map(string.strip, line.split('\t')) @@ -99,7 +106,7 @@ def findVariantEvalResults(key, file, type=str): else: return None - return [val for val in map(capture1, open(file)) if val != None] + return [val for val in map(capture1, openMaybeGZ(file)) if val != None] def getDBSNPRate(file): @@ -121,7 +128,7 @@ def countMappedBases(samples, alignmentIndex): if ( OPTIONS.coverageFile != None ): # read from summary file, looking for the line: # Total 340710 1187.14 N/A N/A N/A - for parts in map( string.split, open(OPTIONS.coverageFile) ): + for parts in map( string.split, openMaybeGZ(OPTIONS.coverageFile) ): if parts[0] == "Total": return -1, int(parts[1]) else: @@ -156,7 +163,7 @@ def countSNPs(samples, snpsVCF, useIndels = False): total = 0 novel = 0 - header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF)) + header, columnNames, remainingLines = vcfReader.readVCFHeader(openMaybeGZ(snpsVCF)) sampleIDs = columnNames[9:] print 'Counting SNPs...' @@ -204,7 +211,7 @@ def countIndels(samples, indelsVCF): def readSamples(vcf): print 'Reading samples for', OPTIONS.population - header, columnNames, remainingLines = vcfReader.readVCFHeader(open(vcf)) + header, columnNames, remainingLines = vcfReader.readVCFHeader(openMaybeGZ(vcf)) samples = map(Sample, columnNames[9:]) if ( OPTIONS.onlySample != None ): samples = filter( lambda x: x.getName() == OPTIONS.onlySample, samples ) diff --git a/python/analyzeRunReports.py b/python/analyzeRunReports.py index 9fa004d08..234fb7335 100755 --- a/python/analyzeRunReports.py +++ b/python/analyzeRunReports.py @@ -4,6 +4,7 @@ from optparse import OptionParser from itertools import * from xml.etree.ElementTree import * import gzip +import datetime MISSING_VALUE = "NA" RUN_REPORT_LIST = "GATK-run-reports" @@ -12,6 +13,7 @@ def main(): global OPTIONS usage = "usage: %prog [options] mode file1 ... fileN" parser = OptionParser(usage=usage) + parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="If provided, verbose progress will be enabled") @@ -19,6 +21,14 @@ def main(): parser.add_option("-o", "--o", dest="output", type='string', default=None, help="if provided, output will go here instead of stdout") + + parser.add_option("", "--no-dev", dest="noDev", + action='store_true', default=False, + help="if provided, only records not coming from a dev version of GATK will be included") + + parser.add_option("", "--max_days", dest="maxDays", + type='int', default=None, + help="if provided, only records generated within X days of today will be included") (OPTIONS, args) = parser.parse_args() if len(args) == 0: @@ -44,7 +54,7 @@ def main(): counter += 1 handler.finalize(files) - out.close() + if OPTIONS.output != None: out.close() print 'Processed records:', counter # @@ -74,24 +84,27 @@ def addHandler(name, handler): def getHandler(stage): return HANDLERS[stage] + +def eltIsException(elt): + return elt.tag == "exception" -# def -class RecordAsTable(StageHandler): - def __init__(self, name, out): - StageHandler.__init__(self, name, out) - - def initialize(self, args): +def parseException(elt): + return elt.find("message").text, elt.find("stacktrace").find("string").text + + +class RecordDecoder: + def __init__(self): self.fields = list() self.formatters = dict() def id(elt): return elt.text - def toString(elt): return '"%s"' % elt.text + def toString(elt): return '%s' % elt.text def formatExceptionMsg(elt): - return '"%s"' % elt.find("message").text + return '%s' % parseException(elt)[0] def formatExceptionAt(elt): - return '"%s"' % elt.find("stacktrace").find("string").text + return '%s' % parseException(elt)[1] def add(names, func): for name in names: @@ -107,31 +120,45 @@ class RecordAsTable(StageHandler): add(["java", "machine"], toString) add(["max-memory", "total-memory", "iterations", "reads"], id) addComplex("exception", ["exception-msg", "exception-at"], [formatExceptionMsg, formatExceptionAt]) - # add(["command-line"], toString) + # add(["command-line"], toString) + + def decode(self, report): + bindings = dict() + for elt in report: + if elt.tag in self.formatters: + fieldFormats = self.formatters[elt.tag] + # we actually care about this tag + for field, formatter in fieldFormats: + bindings[field] = formatter(elt) + + # add missing data + for field in self.fields: + if field not in bindings: + bindings[field] = MISSING_VALUE + + return bindings + +# def +class RecordAsTable(StageHandler): + def __init__(self, name, out): + StageHandler.__init__(self, name, out) - print >> self.out, "\t".join(self.fields) + def initialize(self, args): + self.decoder = RecordDecoder() + print >> self.out, "\t".join(self.decoder.fields) def processRecord(self, record): - parsed = parseReport(record, self.formatters) + parsed = self.decoder.decode(record) def oneField(field): val = MISSING_VALUE if field in parsed: val = parsed[field] + if val.find(" ") != -1: + val = "\"" + val + "\"" return val - print >> self.out, "\t".join([ oneField(field) for field in self.fields ]) - -def parseReport(report, allFormatters): - bindings = dict() - for elt in report: - if elt.tag in allFormatters: - fieldFormats = allFormatters[elt.tag] - # we actually care about this tag - for field, formatter in fieldFormats: - bindings[field] = formatter(elt) - return bindings - + print >> self.out, "\t".join([ oneField(field) for field in self.decoder.fields ]) addHandler('table', RecordAsTable) @@ -161,7 +188,125 @@ class Archive(RecordAsXML): addHandler('archive', Archive) +class ExceptionReport(StageHandler): + #FIELDS = ["Msg", "At", "SVN.versions", "Walkers", 'Occurrences', 'IDs'] + def __init__(self, name, out): + StageHandler.__init__(self, name, out) + self.exceptions = [] + def initialize(self, args): + self.decoder = RecordDecoder() + #print >> self.out, "\t".join(self.FIELDS) + + def processRecord(self, record): + for elt in record: + if eltIsException(elt): + self.exceptions.append(self.decoder.decode(record)) + break + + def finalize(self, args): + commonExceptions = list() + + def addToCommons(ex): + for common in commonExceptions: + if common.equals(ex): + common.update(ex) + return + commonExceptions.append(CommonException(ex)) + + for ex in self.exceptions: + addToCommons(ex) + commonExceptions = sorted(commonExceptions, None, lambda x: x.counts) + + for common in commonExceptions: + msg, at, svns, walkers, counts, ids, duration = common.toStrings() + + print >> self.out, ''.join(['*'] * 80) + print >> self.out, 'Exception :', msg + print >> self.out, ' at :', at + print >> self.out, ' walkers :', walkers + print >> self.out, ' svns :', svns + print >> self.out, ' duration :', duration + print >> self.out, ' occurrences :', counts + print >> self.out, ' ids :', ids + +class CommonException: + MAX_SET_ITEMS_TO_SHOW = 5 + + def __init__(self, ex): + self.msgs = set([ex['exception-msg']]) + self.at = ex['exception-at'] + self.svns = set([ex['svn-version']]) + self.counts = 1 + self.times = set([decodeTime(ex['start-time'])]) + self.walkers = set([ex['walker-name']]) + self.ids = set([ex['id']]) + + def equals(self, ex): + return self.at == ex['exception-at'] + + def update(self, ex): + self.msgs.add(ex['exception-msg']) + self.svns.add(ex['svn-version']) + self.counts += 1 + self.walkers.add(ex['walker-name']) + self.times.add(decodeTime(ex['start-time'])) + self.ids.add(ex['id']) + + def bestExample(self, examples): + def takeShorter(x, y): + if len(y) < len(x): + return y + else: + return x + return reduce(takeShorter, examples) + + def setString(self, s): + if len(s) > self.MAX_SET_ITEMS_TO_SHOW: + s = [x for x in s][0:self.MAX_SET_ITEMS_TO_SHOW] + ["..."] + return ','.join(s) + + def duration(self): + x = sorted(self.times) + return "-".join(map(lambda x: x.strftime("%m/%d/%y"), [x[0], x[-1]])) + + def toStrings(self): + return [self.bestExample(self.msgs), self.at, self.setString(self.svns), self.setString(self.walkers), self.counts, self.setString(self.ids), self.duration()] + +addHandler('exceptions', ExceptionReport) + + +# +# def long_substr(data): +# substr = '' +# if len(data) > 1 and len(data[0]) > 0: +# for i in range(len(data[0])): +# for j in range(len(data[0])-i+1): +# if j > len(substr) and is_substr(data[0][i:i+j], data): +# substr = data[0][i:i+j] +# return substr +# +# def is_substr(find, data): +# if len(data) < 1 and len(find) < 1: +# return False +# for i in range(len(data)): +# if find not in data[i]: +# return False +# return True +# +# def parameterizeStrings( strings ): +# example = strings[0] +# para = '' +# +# lcs = long_substr(strings) +# if lcs == '': +# # nothing common at all, we are done +# return para +# else: +# # we need to remove the LCS from all strings +# strings = map( lambda x: x.replace(lcs, ''), strings) +# + # # utilities # @@ -190,6 +335,25 @@ def resolveFiles(paths): map( resolve1, paths ) return allFiles +def decodeTime(time): + return datetime.datetime.strptime(time.split()[0], "%Y/%m/%d") + #return datetime.datetime.strptime(time, "%Y/%m/%d %H.%M.%S") + +def passesFilters(elt): + if OPTIONS.noDev and eltTagEquals(elt,'build-type','dev'): + return False + if OPTIONS.maxDays != None: + now = datetime.datetime.today() + now = datetime.datetime(now.year, now.month, now.day) + # 2010/08/31 15.38.00 + eltTime = decodeTime(elt.find('start-time').text) + diff = now - eltTime + #print eltTime, now, diff, diff.days + if diff.days > OPTIONS.maxDays: + return False + + return True + def readReports(files): #print files for file in files: @@ -198,9 +362,11 @@ def readReports(files): elem = tree.getroot() if elem.tag == RUN_REPORT_LIST: for sub in elem: - yield sub + if passesFilters(sub): + yield sub else: - yield elem + if passesFilters(elem): + yield elem if __name__ == "__main__": main() \ No newline at end of file