From 8683087756550dbfb9aa1ced04d6f0b2e7dbdb35 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Tue, 31 Aug 2010 20:32:22 +0000
Subject: [PATCH] Suppl. tools for working with and displaying GATK run reports

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4176 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/GATKRunReport.R           |  52 +++++++++
 R/titvFPEst.R               |  74 +++++++++++++
 python/analyzeRunReports.py | 203 ++++++++++++++++++++++++++++++++++++
 3 files changed, 329 insertions(+)
 create mode 100644 R/GATKRunReport.R
 create mode 100755 python/analyzeRunReports.py

diff --git a/R/GATKRunReport.R b/R/GATKRunReport.R
new file mode 100644
index 000000000..ea0de1c60
--- /dev/null
+++ b/R/GATKRunReport.R
@@ -0,0 +1,52 @@
+args = commandArgs(TRUE);
+
+onCMDLine = ! is.na(args[1])
+
+if ( onCMDLine ) { 
+   print(paste("Reading data from", args[1]))
+   d = read.table(args[1], header=T, sep="\t")
+} # only read into d if its' available, otherwise assume the data is already loaded
+
+reportCountingPlot <- function(values, name, moreMargin = 0, ...) {
+    par(las=2) # make label text perpendicular to axis
+    par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin.
+    barplot(sort(table(values)), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", ...)
+}
+
+reportHist <- function(values, name, ...) {
+    hist(values, main=name, 20, xlab="", col="cornflowerblue", ...)
+}
+
+RUNNING_GATK_RUNTIME <- 60 * 5 #  5 minutes => bad failure
+excepted <- subset(d, exception.msg != "NA")
+badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME)
+
+if ( onCMDLine ) pdf(args[2])
+reportCountingPlot(d$walker.name, "Walker invocations")
+reportCountingPlot(d$svn.version, "GATK SVN version")
+reportCountingPlot(d$java.tmp.directory, "Java tmp directory")
+reportCountingPlot(d$working.directory, "Working directory")
+reportCountingPlot(d$user.name, "User")
+reportCountingPlot(d$host.name, "host")
+reportCountingPlot(d$java, "Java version")
+reportCountingPlot(d$machine, "Machine")
+
+Gb <- 1024^3
+reportHist(d$total.memory / Gb, "Used memory")
+reportHist(d$max.memory / Gb, "Max memory")
+
+min <- 60
+reportHist(log10(d$run.time / min), "Run time (log10[min])")
+
+exceptionColor = "red"
+reportCountingPlot(excepted$walker.name, "Walker exceptions", col=exceptionColor)
+reportCountingPlot(subset(excepted, run.time > RUNNING_GATK_RUNTIME)$walker.name, paste("Long-running walker exceptions (>",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor)
+reportCountingPlot(subset(excepted, run.time < RUNNING_GATK_RUNTIME)$walker.name, paste("Start-up walker exceptions (<",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor)
+reportCountingPlot(excepted$user.name, "Usernames generating exceptions", col=exceptionColor)
+reportCountingPlot(excepted$exception.msg, "Exception messages", 12)
+reportCountingPlot(excepted$exception.at, "Exception locations", 12)
+
+if ( onCMDLine ) dev.off()
+
+
+
diff --git a/R/titvFPEst.R b/R/titvFPEst.R
index e6a0d4413..fbf13ccd7 100755
--- a/R/titvFPEst.R
+++ b/R/titvFPEst.R
@@ -49,3 +49,77 @@ cumhist <- function(d) {
 revcumsum <- function(x) {
    return(rev(cumsum(rev(x))))
 }
+
+phred <- function(x) {
+    log10(max(x,10^(-9.9)))*-10
+}
+
+pOfB <- function(b, B, Q) {
+    #print(paste(b, B, Q))
+    p = 1 - 10^(-Q/10)
+    if ( b == B )
+        return(p) 
+    else
+        return(1 - p)
+}
+
+pOfG <- function(bs, qs, G) {
+    a1 = G[1]
+    a2 = G[2]
+    
+    log10p = 0
+    for ( i in 1:length(bs) ) {
+        b = bs[i]
+        q = qs[i]
+        p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
+        log10p = log10p + log10(p1)
+    }
+    
+    return(log10p)
+}
+
+pOfGs <- function(nAs, nBs, Q) {
+    bs = c(rep("a", nAs), rep("t", nBs))
+    qs = rep(Q, nAs + nBs)
+    G1 = c("a", "a")
+    G2 = c("a", "t")
+    G3 = c("t", "t")
+
+    log10p1 = pOfG(bs, qs, G1)
+    log10p2 = pOfG(bs, qs, G2)
+    log10p3 = pOfG(bs, qs, G3)
+    Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
+    
+    return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) 
+}
+
+QsampleExpected <- function(depth, Q) {
+    weightedAvg = 0
+    for ( d in 1:(depth*3) ) {
+        Qsample = 0
+        pOfD = dpois(d, depth)
+        for ( nBs in 0:d ) {
+            pOfnB = dbinom(nBs, d, 0.5)
+            nAs = d - nBs
+            Qsample = pOfGs(nAs, nBs, Q)$Qsample
+            #Qsample = 1
+            weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
+            print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
+        }
+    }
+    
+    return(weightedAvg)
+}
+
+plotQsamples <- function(depths, Qs, Qmax) {
+    cols = rainbow(length(Qs))
+    plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
+    
+    for ( i in 1:length(Qs) ) {
+        Q = Qs[i]
+        y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
+        points(depths, y, col=cols[i], type="b")
+    }
+
+    legend("topleft", paste("Q", Qs), fill=cols)
+}
\ No newline at end of file
diff --git a/python/analyzeRunReports.py b/python/analyzeRunReports.py
new file mode 100755
index 000000000..fd9d71ccc
--- /dev/null
+++ b/python/analyzeRunReports.py
@@ -0,0 +1,203 @@
+import os.path
+import sys
+from optparse import OptionParser
+from itertools import *
+from xml.etree.ElementTree import *
+import gzip
+
+MISSING_VALUE = "NA"
+RUN_REPORT_LIST = "GATK-run-reports"
+
+def main():
+    global OPTIONS
+    usage = "usage: %prog [options] mode file1 ... fileN"
+    parser = OptionParser(usage=usage)
+    parser.add_option("-v", "--verbose", dest="verbose",
+                        action='store_true', default=False,
+                        help="If provided, verbose progress will be enabled")         
+
+    parser.add_option("-o", "--o", dest="output",
+                        type='string', default=None,
+                        help="if provided, output will go here instead of stdout")
+         
+    (OPTIONS, args) = parser.parse_args()
+    if len(args) == 0:
+        parser.error("Requires at least GATKRunReport xml to analyze")
+
+    stage = args[0]
+    files = resolveFiles(args[1:])
+
+    # open up the output file
+    if OPTIONS.output != None:
+        out = openFile(OPTIONS.output,'w')
+    else:
+        out = sys.stdout
+
+    handler = getHandler(stage)(stage, out)
+    handler.initialize(files)
+
+    # parse all of the incoming files
+    for report in readReports(files):
+        # todo -- add matching here
+        handler.processRecord(report)
+
+    handler.finalize(files)
+    out.close()
+
+#
+# Stage HANDLERS
+#
+class StageHandler:
+    def __init__(self, name, out):
+        self.name = name
+        self.out = out
+        
+    def getName(self): return self.name
+        
+    def initialize(self, args):
+        pass # print 'initialize'
+        
+    def processRecord(self, record):
+        pass # print 'processing record', record
+        
+    def finalize(self, args):
+        pass # print 'Finalize'
+
+
+# a map from stage strings -> function to handle record
+HANDLERS = dict()
+def addHandler(name, handler):
+    HANDLERS[name] = handler
+    
+def getHandler(stage):
+    return HANDLERS[stage]
+    
+# def 
+class RecordAsTable(StageHandler):
+    def __init__(self, name, out):
+        StageHandler.__init__(self, name, out)
+        
+    def initialize(self, args):
+        self.fields = list()
+        self.formatters = dict()
+    
+        def id(elt): return elt.text
+        def toString(elt): return '"%s"' % elt.text
+        
+        def formatExceptionMsg(elt):
+            return '"%s"' % elt.find("message").text
+
+        def formatExceptionAt(elt):
+            return '"%s"' % elt.find("stacktrace").find("string").text
+        
+        def add(names, func):
+            for name in names:
+                addComplex(name, [name], [func])
+
+        def addComplex(key, fields, funcs):
+            self.fields.extend(fields)
+            self.formatters[key] = zip(fields, funcs)
+    
+        add(["id", "walker-name", "svn-version"], id)
+        add(["start-time", "end-time"], toString)      
+        add(["run-time", "java-tmp-directory", "working-directory", "user-name", "host-name"], id)
+        add(["java", "machine"], toString)
+        add(["max-memory", "total-memory", "iterations", "reads"], id)
+        addComplex("exception", ["exception-msg", "exception-at"], [formatExceptionMsg, formatExceptionAt])
+        # add(["command-line"], toString)    
+        
+        print >> self.out, "\t".join(self.fields)
+
+    def processRecord(self, record):
+        parsed = parseReport(record, self.formatters)
+
+        def oneField(field):
+            val = MISSING_VALUE
+            if field in parsed:
+                val = parsed[field]
+            return val
+
+        print >> self.out, "\t".join([ oneField(field) for field in self.fields ])
+
+def parseReport(report, allFormatters):
+    bindings = dict()
+    for elt in report:
+        if elt.tag in allFormatters:
+            fieldFormats = allFormatters[elt.tag]
+            # we actually care about this tag
+            for field, formatter in fieldFormats:
+                bindings[field] = formatter(elt)
+    return bindings
+
+
+addHandler('table', RecordAsTable)
+
+class RecordAsXML(StageHandler):
+    def __init__(self, name, out):
+        StageHandler.__init__(self, name, out)
+
+    def initialize(self, args):
+        print >> self.out, "<%s>" % RUN_REPORT_LIST
+        
+    def processRecord(self, record):
+        print >> self.out, tostring(record)
+
+    def finalize(self, args):
+        print >> self.out, "</%s>" % RUN_REPORT_LIST
+
+addHandler('xml', RecordAsXML)
+
+class Archive(RecordAsXML):
+    def __init__(self, name, out):
+        RecordAsXML.__init__(self, name, out)
+
+    def finalize(self, args):
+        for arg in args:
+            print 'Deleting file: ', arg
+            os.remove(arg)
+
+addHandler('archive', Archive)
+
+
+#
+# utilities
+#
+def openFile(filename, mode='r'):
+    if ( filename.endswith(".gz") ):
+        return gzip.open(filename, mode)
+    else:
+        return open(filename, mode)
+
+def resolveFiles(paths):
+    allFiles = list()
+    def resolve1(path):
+        if not os.path.exists(path):
+            raise Exception("Path doesn't exist: " + path)
+        elif os.path.isfile(path):
+            allFiles.append(path)
+        else:
+            def one(arg, dirname, files):
+                #print dirname, files
+                #print dirname
+                allFiles.extend(map( lambda x: os.path.join(path, x), files ))
+                #print files
+    
+            os.path.walk(path, one, None)
+
+    map( resolve1, paths )
+    return allFiles
+
+def readReports(files):
+    #print files
+    for file in files:
+        input = openFile(file)
+        tree = ElementTree(file=input)
+        elem = tree.getroot()
+        if elem.tag == RUN_REPORT_LIST:
+            for sub in elem:
+                yield sub
+        else:
+            yield elem
+    
+if __name__ == "__main__":
+    main()
\ No newline at end of file