From 8683087756550dbfb9aa1ced04d6f0b2e7dbdb35 Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 31 Aug 2010 20:32:22 +0000 Subject: [PATCH] Suppl. tools for working with and displaying GATK run reports git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4176 348d0f76-0448-11de-a6fe-93d51630548a --- R/GATKRunReport.R | 52 +++++++++ R/titvFPEst.R | 74 +++++++++++++ python/analyzeRunReports.py | 203 ++++++++++++++++++++++++++++++++++++ 3 files changed, 329 insertions(+) create mode 100644 R/GATKRunReport.R create mode 100755 python/analyzeRunReports.py diff --git a/R/GATKRunReport.R b/R/GATKRunReport.R new file mode 100644 index 000000000..ea0de1c60 --- /dev/null +++ b/R/GATKRunReport.R @@ -0,0 +1,52 @@ +args = commandArgs(TRUE); + +onCMDLine = ! is.na(args[1]) + +if ( onCMDLine ) { + print(paste("Reading data from", args[1])) + d = read.table(args[1], header=T, sep="\t") +} # only read into d if its' available, otherwise assume the data is already loaded + +reportCountingPlot <- function(values, name, moreMargin = 0, ...) { + par(las=2) # make label text perpendicular to axis + par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin. + barplot(sort(table(values)), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", ...) +} + +reportHist <- function(values, name, ...) { + hist(values, main=name, 20, xlab="", col="cornflowerblue", ...) +} + +RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure +excepted <- subset(d, exception.msg != "NA") +badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME) + +if ( onCMDLine ) pdf(args[2]) +reportCountingPlot(d$walker.name, "Walker invocations") +reportCountingPlot(d$svn.version, "GATK SVN version") +reportCountingPlot(d$java.tmp.directory, "Java tmp directory") +reportCountingPlot(d$working.directory, "Working directory") +reportCountingPlot(d$user.name, "User") +reportCountingPlot(d$host.name, "host") +reportCountingPlot(d$java, "Java version") +reportCountingPlot(d$machine, "Machine") + +Gb <- 1024^3 +reportHist(d$total.memory / Gb, "Used memory") +reportHist(d$max.memory / Gb, "Max memory") + +min <- 60 +reportHist(log10(d$run.time / min), "Run time (log10[min])") + +exceptionColor = "red" +reportCountingPlot(excepted$walker.name, "Walker exceptions", col=exceptionColor) +reportCountingPlot(subset(excepted, run.time > RUNNING_GATK_RUNTIME)$walker.name, paste("Long-running walker exceptions (>",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor) +reportCountingPlot(subset(excepted, run.time < RUNNING_GATK_RUNTIME)$walker.name, paste("Start-up walker exceptions (<",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor) +reportCountingPlot(excepted$user.name, "Usernames generating exceptions", col=exceptionColor) +reportCountingPlot(excepted$exception.msg, "Exception messages", 12) +reportCountingPlot(excepted$exception.at, "Exception locations", 12) + +if ( onCMDLine ) dev.off() + + + diff --git a/R/titvFPEst.R b/R/titvFPEst.R index e6a0d4413..fbf13ccd7 100755 --- a/R/titvFPEst.R +++ b/R/titvFPEst.R @@ -49,3 +49,77 @@ cumhist <- function(d) { revcumsum <- function(x) { return(rev(cumsum(rev(x)))) } + +phred <- function(x) { + log10(max(x,10^(-9.9)))*-10 +} + +pOfB <- function(b, B, Q) { + #print(paste(b, B, Q)) + p = 1 - 10^(-Q/10) + if ( b == B ) + return(p) + else + return(1 - p) +} + +pOfG <- function(bs, qs, G) { + a1 = G[1] + a2 = G[2] + + log10p = 0 + for ( i in 1:length(bs) ) { + b = bs[i] + q = qs[i] + p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 + log10p = log10p + log10(p1) + } + + return(log10p) +} + +pOfGs <- function(nAs, nBs, Q) { + bs = c(rep("a", nAs), rep("t", nBs)) + qs = rep(Q, nAs + nBs) + G1 = c("a", "a") + G2 = c("a", "t") + G3 = c("t", "t") + + log10p1 = pOfG(bs, qs, G1) + log10p2 = pOfG(bs, qs, G2) + log10p3 = pOfG(bs, qs, G3) + Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) + + return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) +} + +QsampleExpected <- function(depth, Q) { + weightedAvg = 0 + for ( d in 1:(depth*3) ) { + Qsample = 0 + pOfD = dpois(d, depth) + for ( nBs in 0:d ) { + pOfnB = dbinom(nBs, d, 0.5) + nAs = d - nBs + Qsample = pOfGs(nAs, nBs, Q)$Qsample + #Qsample = 1 + weightedAvg = weightedAvg + Qsample * pOfD * pOfnB + print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) + } + } + + return(weightedAvg) +} + +plotQsamples <- function(depths, Qs, Qmax) { + cols = rainbow(length(Qs)) + plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") + + for ( i in 1:length(Qs) ) { + Q = Qs[i] + y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) + points(depths, y, col=cols[i], type="b") + } + + legend("topleft", paste("Q", Qs), fill=cols) +} \ No newline at end of file diff --git a/python/analyzeRunReports.py b/python/analyzeRunReports.py new file mode 100755 index 000000000..fd9d71ccc --- /dev/null +++ b/python/analyzeRunReports.py @@ -0,0 +1,203 @@ +import os.path +import sys +from optparse import OptionParser +from itertools import * +from xml.etree.ElementTree import * +import gzip + +MISSING_VALUE = "NA" +RUN_REPORT_LIST = "GATK-run-reports" + +def main(): + global OPTIONS + usage = "usage: %prog [options] mode file1 ... fileN" + parser = OptionParser(usage=usage) + parser.add_option("-v", "--verbose", dest="verbose", + action='store_true', default=False, + help="If provided, verbose progress will be enabled") + + parser.add_option("-o", "--o", dest="output", + type='string', default=None, + help="if provided, output will go here instead of stdout") + + (OPTIONS, args) = parser.parse_args() + if len(args) == 0: + parser.error("Requires at least GATKRunReport xml to analyze") + + stage = args[0] + files = resolveFiles(args[1:]) + + # open up the output file + if OPTIONS.output != None: + out = openFile(OPTIONS.output,'w') + else: + out = sys.stdout + + handler = getHandler(stage)(stage, out) + handler.initialize(files) + + # parse all of the incoming files + for report in readReports(files): + # todo -- add matching here + handler.processRecord(report) + + handler.finalize(files) + out.close() + +# +# Stage HANDLERS +# +class StageHandler: + def __init__(self, name, out): + self.name = name + self.out = out + + def getName(self): return self.name + + def initialize(self, args): + pass # print 'initialize' + + def processRecord(self, record): + pass # print 'processing record', record + + def finalize(self, args): + pass # print 'Finalize' + + +# a map from stage strings -> function to handle record +HANDLERS = dict() +def addHandler(name, handler): + HANDLERS[name] = handler + +def getHandler(stage): + return HANDLERS[stage] + +# def +class RecordAsTable(StageHandler): + def __init__(self, name, out): + StageHandler.__init__(self, name, out) + + def initialize(self, args): + self.fields = list() + self.formatters = dict() + + def id(elt): return elt.text + def toString(elt): return '"%s"' % elt.text + + def formatExceptionMsg(elt): + return '"%s"' % elt.find("message").text + + def formatExceptionAt(elt): + return '"%s"' % elt.find("stacktrace").find("string").text + + def add(names, func): + for name in names: + addComplex(name, [name], [func]) + + def addComplex(key, fields, funcs): + self.fields.extend(fields) + self.formatters[key] = zip(fields, funcs) + + add(["id", "walker-name", "svn-version"], id) + add(["start-time", "end-time"], toString) + add(["run-time", "java-tmp-directory", "working-directory", "user-name", "host-name"], id) + add(["java", "machine"], toString) + add(["max-memory", "total-memory", "iterations", "reads"], id) + addComplex("exception", ["exception-msg", "exception-at"], [formatExceptionMsg, formatExceptionAt]) + # add(["command-line"], toString) + + print >> self.out, "\t".join(self.fields) + + def processRecord(self, record): + parsed = parseReport(record, self.formatters) + + def oneField(field): + val = MISSING_VALUE + if field in parsed: + val = parsed[field] + return val + + print >> self.out, "\t".join([ oneField(field) for field in self.fields ]) + +def parseReport(report, allFormatters): + bindings = dict() + for elt in report: + if elt.tag in allFormatters: + fieldFormats = allFormatters[elt.tag] + # we actually care about this tag + for field, formatter in fieldFormats: + bindings[field] = formatter(elt) + return bindings + + +addHandler('table', RecordAsTable) + +class RecordAsXML(StageHandler): + def __init__(self, name, out): + StageHandler.__init__(self, name, out) + + def initialize(self, args): + print >> self.out, "<%s>" % RUN_REPORT_LIST + + def processRecord(self, record): + print >> self.out, tostring(record) + + def finalize(self, args): + print >> self.out, "" % RUN_REPORT_LIST + +addHandler('xml', RecordAsXML) + +class Archive(RecordAsXML): + def __init__(self, name, out): + RecordAsXML.__init__(self, name, out) + + def finalize(self, args): + for arg in args: + print 'Deleting file: ', arg + os.remove(arg) + +addHandler('archive', Archive) + + +# +# utilities +# +def openFile(filename, mode='r'): + if ( filename.endswith(".gz") ): + return gzip.open(filename, mode) + else: + return open(filename, mode) + +def resolveFiles(paths): + allFiles = list() + def resolve1(path): + if not os.path.exists(path): + raise Exception("Path doesn't exist: " + path) + elif os.path.isfile(path): + allFiles.append(path) + else: + def one(arg, dirname, files): + #print dirname, files + #print dirname + allFiles.extend(map( lambda x: os.path.join(path, x), files )) + #print files + + os.path.walk(path, one, None) + + map( resolve1, paths ) + return allFiles + +def readReports(files): + #print files + for file in files: + input = openFile(file) + tree = ElementTree(file=input) + elem = tree.getroot() + if elem.tag == RUN_REPORT_LIST: + for sub in elem: + yield sub + else: + yield elem + +if __name__ == "__main__": + main() \ No newline at end of file