Improve queue script jobreport visualization script

-- the Queue jobreport PDF script now provides a high-level summary of the de-scattered runtimes of each analysis, so that its easy to see where your script is spending its time across scatters.
This commit is contained in:
Mark DePristo 2013-05-07 12:11:46 -04:00
parent b153042a9d
commit 2b86ab02be
1 changed files with 42 additions and 11 deletions

View File

@ -3,6 +3,7 @@ library(ggplot2)
library(gplots)
library(tools)
library(reshape)
library(plyr)
#
# Standard command line switch. Can we loaded interactively for development
@ -14,7 +15,7 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
inputFileName = "Q-26618@gsa4.jobreport.txt"
inputFileName = "~/Desktop/broadLocal/projects/pipelinePerformance/FullProcessingPipeline.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
@ -35,13 +36,11 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
plotJobsGantt <- function(gatkReport, sortOverall, title, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=T), ]
} else {
title = "All jobs, sorted by start time"
allJobs = allJobs[order(allJobs$startTime, decreasing=T), ]
}
allJobs$index = 1:nrow(allJobs)
@ -54,11 +53,11 @@ plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + geom_text(aes(x=relStartTime, label=ganttName, hjust=0, vjust=-1), size=2)
p <- p + xlim(0, maxRelTime * 1.3)
p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS))
p <- p + ylab("Job number")
p <- p + opts(title=title)
p <- p + ggtitle(title)
print(p)
}
@ -182,6 +181,27 @@ plotTimeByHost <- function(gatkReportData) {
plotMe("Jittered points", geom_jitter)
}
mergeScattersForAnalysis <- function(table) {
#allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
ddply(table, .(analysisName, iteration), summarize,
jobName = analysisName[1],
exechosts = paste(length(exechosts), "hosts"),
formattedStartTime = "NA",
formattedDoneTime = "NA",
intermediate = intermediate[1],
startTime = min(startTime),
doneTime = min(startTime) + sum(runtime),
runtime = sum(runtime))
}
mergeScatters <- function(report) {
newReport = list()
for ( name in names(gatkReportData) ) {
newReport[[name]] = mergeScattersForAnalysis(gatkReportData[[name]])
}
newReport
}
# read the table
gatkReportData <- gsa.read.gatkreport(inputFileName)
@ -192,13 +212,24 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotJobsGantt(gatkReportData, T, "All jobs, by analysis, by start time", F)
plotJobsGantt(gatkReportData, F, "All jobs, sorted by start time", F)
plotProgressByTime(gatkReportData)
# plots summarizing overall costs, merging scattered counts
merged.by.scatter = mergeScatters(gatkReportData)
plotJobsGantt(merged.by.scatter, F, "Jobs merged by scatter by start time", T)
merged.as.df = do.call(rbind.data.frame, merged.by.scatter)[,c("analysisName", "runtime")]
merged.as.df$percent = merged.as.df$runtime / sum(merged.as.df$runtime) * 100
merged.as.df.formatted = data.frame(analysisName=merged.as.df$analysisName,runtime=prettyNum(merged.as.df$runtime), percent=prettyNum(merged.as.df$percent,digits=2))
textplot(merged.as.df.formatted[order(merged.as.df$runtime),], show.rownames=F)
title("Total runtime for each analysis")
plotTimeByHost(gatkReportData)
for ( group in gatkReportData ) {
print(group)
plotGroup(group)
#print(group)
plotGroup(group)
}
if ( ! is.na(outputPDF) ) {