Suppl. tools for working with and displaying GATK run reports
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4176 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
32c6b48106
commit
8683087756
|
|
@ -0,0 +1,52 @@
|
|||
args = commandArgs(TRUE);
|
||||
|
||||
onCMDLine = ! is.na(args[1])
|
||||
|
||||
if ( onCMDLine ) {
|
||||
print(paste("Reading data from", args[1]))
|
||||
d = read.table(args[1], header=T, sep="\t")
|
||||
} # only read into d if its' available, otherwise assume the data is already loaded
|
||||
|
||||
reportCountingPlot <- function(values, name, moreMargin = 0, ...) {
|
||||
par(las=2) # make label text perpendicular to axis
|
||||
par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin.
|
||||
barplot(sort(table(values)), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", ...)
|
||||
}
|
||||
|
||||
reportHist <- function(values, name, ...) {
|
||||
hist(values, main=name, 20, xlab="", col="cornflowerblue", ...)
|
||||
}
|
||||
|
||||
RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure
|
||||
excepted <- subset(d, exception.msg != "NA")
|
||||
badExcepted <- subset(excepted, run.time > RUNNING_GATK_RUNTIME)
|
||||
|
||||
if ( onCMDLine ) pdf(args[2])
|
||||
reportCountingPlot(d$walker.name, "Walker invocations")
|
||||
reportCountingPlot(d$svn.version, "GATK SVN version")
|
||||
reportCountingPlot(d$java.tmp.directory, "Java tmp directory")
|
||||
reportCountingPlot(d$working.directory, "Working directory")
|
||||
reportCountingPlot(d$user.name, "User")
|
||||
reportCountingPlot(d$host.name, "host")
|
||||
reportCountingPlot(d$java, "Java version")
|
||||
reportCountingPlot(d$machine, "Machine")
|
||||
|
||||
Gb <- 1024^3
|
||||
reportHist(d$total.memory / Gb, "Used memory")
|
||||
reportHist(d$max.memory / Gb, "Max memory")
|
||||
|
||||
min <- 60
|
||||
reportHist(log10(d$run.time / min), "Run time (log10[min])")
|
||||
|
||||
exceptionColor = "red"
|
||||
reportCountingPlot(excepted$walker.name, "Walker exceptions", col=exceptionColor)
|
||||
reportCountingPlot(subset(excepted, run.time > RUNNING_GATK_RUNTIME)$walker.name, paste("Long-running walker exceptions (>",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor)
|
||||
reportCountingPlot(subset(excepted, run.time < RUNNING_GATK_RUNTIME)$walker.name, paste("Start-up walker exceptions (<",RUNNING_GATK_RUNTIME,"seconds runtime)"), col=exceptionColor)
|
||||
reportCountingPlot(excepted$user.name, "Usernames generating exceptions", col=exceptionColor)
|
||||
reportCountingPlot(excepted$exception.msg, "Exception messages", 12)
|
||||
reportCountingPlot(excepted$exception.at, "Exception locations", 12)
|
||||
|
||||
if ( onCMDLine ) dev.off()
|
||||
|
||||
|
||||
|
||||
|
|
@ -49,3 +49,77 @@ cumhist <- function(d) {
|
|||
revcumsum <- function(x) {
|
||||
return(rev(cumsum(rev(x))))
|
||||
}
|
||||
|
||||
phred <- function(x) {
|
||||
log10(max(x,10^(-9.9)))*-10
|
||||
}
|
||||
|
||||
pOfB <- function(b, B, Q) {
|
||||
#print(paste(b, B, Q))
|
||||
p = 1 - 10^(-Q/10)
|
||||
if ( b == B )
|
||||
return(p)
|
||||
else
|
||||
return(1 - p)
|
||||
}
|
||||
|
||||
pOfG <- function(bs, qs, G) {
|
||||
a1 = G[1]
|
||||
a2 = G[2]
|
||||
|
||||
log10p = 0
|
||||
for ( i in 1:length(bs) ) {
|
||||
b = bs[i]
|
||||
q = qs[i]
|
||||
p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
|
||||
log10p = log10p + log10(p1)
|
||||
}
|
||||
|
||||
return(log10p)
|
||||
}
|
||||
|
||||
pOfGs <- function(nAs, nBs, Q) {
|
||||
bs = c(rep("a", nAs), rep("t", nBs))
|
||||
qs = rep(Q, nAs + nBs)
|
||||
G1 = c("a", "a")
|
||||
G2 = c("a", "t")
|
||||
G3 = c("t", "t")
|
||||
|
||||
log10p1 = pOfG(bs, qs, G1)
|
||||
log10p2 = pOfG(bs, qs, G2)
|
||||
log10p3 = pOfG(bs, qs, G3)
|
||||
Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
|
||||
|
||||
return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
|
||||
}
|
||||
|
||||
QsampleExpected <- function(depth, Q) {
|
||||
weightedAvg = 0
|
||||
for ( d in 1:(depth*3) ) {
|
||||
Qsample = 0
|
||||
pOfD = dpois(d, depth)
|
||||
for ( nBs in 0:d ) {
|
||||
pOfnB = dbinom(nBs, d, 0.5)
|
||||
nAs = d - nBs
|
||||
Qsample = pOfGs(nAs, nBs, Q)$Qsample
|
||||
#Qsample = 1
|
||||
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
|
||||
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
|
||||
}
|
||||
}
|
||||
|
||||
return(weightedAvg)
|
||||
}
|
||||
|
||||
plotQsamples <- function(depths, Qs, Qmax) {
|
||||
cols = rainbow(length(Qs))
|
||||
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
|
||||
|
||||
for ( i in 1:length(Qs) ) {
|
||||
Q = Qs[i]
|
||||
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
|
||||
points(depths, y, col=cols[i], type="b")
|
||||
}
|
||||
|
||||
legend("topleft", paste("Q", Qs), fill=cols)
|
||||
}
|
||||
|
|
@ -0,0 +1,203 @@
|
|||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from itertools import *
|
||||
from xml.etree.ElementTree import *
|
||||
import gzip
|
||||
|
||||
MISSING_VALUE = "NA"
|
||||
RUN_REPORT_LIST = "GATK-run-reports"
|
||||
|
||||
def main():
|
||||
global OPTIONS
|
||||
usage = "usage: %prog [options] mode file1 ... fileN"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-v", "--verbose", dest="verbose",
|
||||
action='store_true', default=False,
|
||||
help="If provided, verbose progress will be enabled")
|
||||
|
||||
parser.add_option("-o", "--o", dest="output",
|
||||
type='string', default=None,
|
||||
help="if provided, output will go here instead of stdout")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) == 0:
|
||||
parser.error("Requires at least GATKRunReport xml to analyze")
|
||||
|
||||
stage = args[0]
|
||||
files = resolveFiles(args[1:])
|
||||
|
||||
# open up the output file
|
||||
if OPTIONS.output != None:
|
||||
out = openFile(OPTIONS.output,'w')
|
||||
else:
|
||||
out = sys.stdout
|
||||
|
||||
handler = getHandler(stage)(stage, out)
|
||||
handler.initialize(files)
|
||||
|
||||
# parse all of the incoming files
|
||||
for report in readReports(files):
|
||||
# todo -- add matching here
|
||||
handler.processRecord(report)
|
||||
|
||||
handler.finalize(files)
|
||||
out.close()
|
||||
|
||||
#
|
||||
# Stage HANDLERS
|
||||
#
|
||||
class StageHandler:
|
||||
def __init__(self, name, out):
|
||||
self.name = name
|
||||
self.out = out
|
||||
|
||||
def getName(self): return self.name
|
||||
|
||||
def initialize(self, args):
|
||||
pass # print 'initialize'
|
||||
|
||||
def processRecord(self, record):
|
||||
pass # print 'processing record', record
|
||||
|
||||
def finalize(self, args):
|
||||
pass # print 'Finalize'
|
||||
|
||||
|
||||
# a map from stage strings -> function to handle record
|
||||
HANDLERS = dict()
|
||||
def addHandler(name, handler):
|
||||
HANDLERS[name] = handler
|
||||
|
||||
def getHandler(stage):
|
||||
return HANDLERS[stage]
|
||||
|
||||
# def
|
||||
class RecordAsTable(StageHandler):
|
||||
def __init__(self, name, out):
|
||||
StageHandler.__init__(self, name, out)
|
||||
|
||||
def initialize(self, args):
|
||||
self.fields = list()
|
||||
self.formatters = dict()
|
||||
|
||||
def id(elt): return elt.text
|
||||
def toString(elt): return '"%s"' % elt.text
|
||||
|
||||
def formatExceptionMsg(elt):
|
||||
return '"%s"' % elt.find("message").text
|
||||
|
||||
def formatExceptionAt(elt):
|
||||
return '"%s"' % elt.find("stacktrace").find("string").text
|
||||
|
||||
def add(names, func):
|
||||
for name in names:
|
||||
addComplex(name, [name], [func])
|
||||
|
||||
def addComplex(key, fields, funcs):
|
||||
self.fields.extend(fields)
|
||||
self.formatters[key] = zip(fields, funcs)
|
||||
|
||||
add(["id", "walker-name", "svn-version"], id)
|
||||
add(["start-time", "end-time"], toString)
|
||||
add(["run-time", "java-tmp-directory", "working-directory", "user-name", "host-name"], id)
|
||||
add(["java", "machine"], toString)
|
||||
add(["max-memory", "total-memory", "iterations", "reads"], id)
|
||||
addComplex("exception", ["exception-msg", "exception-at"], [formatExceptionMsg, formatExceptionAt])
|
||||
# add(["command-line"], toString)
|
||||
|
||||
print >> self.out, "\t".join(self.fields)
|
||||
|
||||
def processRecord(self, record):
|
||||
parsed = parseReport(record, self.formatters)
|
||||
|
||||
def oneField(field):
|
||||
val = MISSING_VALUE
|
||||
if field in parsed:
|
||||
val = parsed[field]
|
||||
return val
|
||||
|
||||
print >> self.out, "\t".join([ oneField(field) for field in self.fields ])
|
||||
|
||||
def parseReport(report, allFormatters):
|
||||
bindings = dict()
|
||||
for elt in report:
|
||||
if elt.tag in allFormatters:
|
||||
fieldFormats = allFormatters[elt.tag]
|
||||
# we actually care about this tag
|
||||
for field, formatter in fieldFormats:
|
||||
bindings[field] = formatter(elt)
|
||||
return bindings
|
||||
|
||||
|
||||
addHandler('table', RecordAsTable)
|
||||
|
||||
class RecordAsXML(StageHandler):
|
||||
def __init__(self, name, out):
|
||||
StageHandler.__init__(self, name, out)
|
||||
|
||||
def initialize(self, args):
|
||||
print >> self.out, "<%s>" % RUN_REPORT_LIST
|
||||
|
||||
def processRecord(self, record):
|
||||
print >> self.out, tostring(record)
|
||||
|
||||
def finalize(self, args):
|
||||
print >> self.out, "</%s>" % RUN_REPORT_LIST
|
||||
|
||||
addHandler('xml', RecordAsXML)
|
||||
|
||||
class Archive(RecordAsXML):
|
||||
def __init__(self, name, out):
|
||||
RecordAsXML.__init__(self, name, out)
|
||||
|
||||
def finalize(self, args):
|
||||
for arg in args:
|
||||
print 'Deleting file: ', arg
|
||||
os.remove(arg)
|
||||
|
||||
addHandler('archive', Archive)
|
||||
|
||||
|
||||
#
|
||||
# utilities
|
||||
#
|
||||
def openFile(filename, mode='r'):
|
||||
if ( filename.endswith(".gz") ):
|
||||
return gzip.open(filename, mode)
|
||||
else:
|
||||
return open(filename, mode)
|
||||
|
||||
def resolveFiles(paths):
|
||||
allFiles = list()
|
||||
def resolve1(path):
|
||||
if not os.path.exists(path):
|
||||
raise Exception("Path doesn't exist: " + path)
|
||||
elif os.path.isfile(path):
|
||||
allFiles.append(path)
|
||||
else:
|
||||
def one(arg, dirname, files):
|
||||
#print dirname, files
|
||||
#print dirname
|
||||
allFiles.extend(map( lambda x: os.path.join(path, x), files ))
|
||||
#print files
|
||||
|
||||
os.path.walk(path, one, None)
|
||||
|
||||
map( resolve1, paths )
|
||||
return allFiles
|
||||
|
||||
def readReports(files):
|
||||
#print files
|
||||
for file in files:
|
||||
input = openFile(file)
|
||||
tree = ElementTree(file=input)
|
||||
elem = tree.getroot()
|
||||
if elem.tag == RUN_REPORT_LIST:
|
||||
for sub in elem:
|
||||
yield sub
|
||||
else:
|
||||
yield elem
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in New Issue