gatk-3.8/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala

1006 lines
33 KiB
Scala
Raw Normal View History

/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.engine
import org.jgrapht.traverse.TopologicalOrderIterator
import org.jgrapht.graph.SimpleDirectedGraph
import scala.collection.JavaConversions._
import org.jgrapht.alg.CycleDetector
import org.jgrapht.EdgeFactory
import org.jgrapht.ext.DOTExporter
import java.io.File
import org.jgrapht.event.{TraversalListenerAdapter, EdgeTraversalEvent}
import org.broadinstitute.sting.queue.QException
import org.broadinstitute.sting.queue.function.{InProcessFunction, CommandLineFunction, QFunction}
import org.apache.commons.lang.StringUtils
import org.broadinstitute.sting.queue.util._
import collection.immutable.{TreeSet, TreeMap}
import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction}
/**
* The internal dependency tracker between sets of function input and output files.
*/
class QGraph extends Logging {
// Settings for this graph. Starts out unset (null); add(), run() and runJobs() all read it.
var settings: QGraphSettings = _
// When true, fillGraph logs the functions generated for each scatter-gather edge.
var debugMode = false
// True when settings.run is false, in which case run() only performs a dry run.
private def dryRun = !settings.run
// Running total of missing field values reported by validate().
private var numMissingValues = 0
// The dependency graph itself, built by newGraph (defined outside this view).
private val jobGraph = newGraph
// Orders function edges by the sequence formed from the negated graph depth followed by the
// function's addOrder, so deeper edges come first.
private val functionOrdering = Ordering.by[FunctionEdge, Iterable[Int]](edge => -graphDepth(edge) +: edge.function.addOrder)
// Orders files by their absolute path.
private val fileOrdering = Ordering.by[File,String](_.getAbsolutePath)
// A map of nodes by list of files.
private var nodeMap = TreeMap.empty[Iterable[File], QNode](Ordering.Iterable(fileOrdering))
// The next unique id for a node if not found in the nodeMap.
private var nextNodeId = 0
// Checked before most units of work in this class; nothing new is started once it is false.
// NOTE(review): presumably cleared by a shutdown path outside this view -- confirm.
private var running = true
// Monitor held by add() and run().
private val runningLock = new Object
// Edges started by runJobs() that have not yet been seen as done or failed.
private var runningJobs = Set.empty[FunctionEdge]
// Intermediate edges whose outputs deleteDoneIntermediates() may remove once every
// following function is done or skipped.
private var intermediatesJobs = Set.empty[FunctionEdge]
// Platform line separator, produced by formatting "%n".
private val nl = "%n".format()
// Creates the command line job manager from the configured runner name in runJobs().
private val commandLinePluginManager = new CommandLinePluginManager
// Assigned in runJobs(); null until then.
private var commandLineManager: CommandLineJobManager[CommandLineJobRunner] = _
// Creates runners for in-process functions (see newRunner).
private val inProcessManager = new InProcessJobManager
// Both job managers as an untyped list; the commandLineManager entry is null until runJobs() assigns it.
private def managers = List[Any](inProcessManager, commandLineManager)
// Mutable tally of how many function edges are in each state.
// Seeded by recheckDone() and then adjusted by runJobs() and checkRetryJobs().
private class StatusCounts {
// Edges waiting to be started.
var pending = 0
// Edges that have been started and are not yet done or failed.
var running = 0
// Edges that ended in failure (decremented again when a failed job is reset for retry).
var failed = 0
// Edges that are done; recheckDone() also counts skipped edges here.
var done = 0
}
// The single tally shared by the status update, run loop and retry logic.
private val statusCounts = new StatusCounts
/**
* Adds a QScript created CommandLineFunction to the graph.
* @param command Function to add to the graph.
*/
def add(command: QFunction): Unit = {
  // Applies the graph settings, freezes the function and links its input node to its
  // output node with a new function edge.
  def register(): Unit = {
    command.qSettings = settings.qSettings
    command.freeze
    val inputNode = getQNode(command.inputs.toList.sorted(fileOrdering))
    val outputNode = getQNode(command.outputs.toList.sorted(fileOrdering))
    addEdge(new FunctionEdge(command, inputNode, outputNode))
  }

  try {
    // Functions are only accepted while the graph is still running.
    runningLock.synchronized {
      if (running) register()
    }
  } catch {
    // Wrap any failure so the message names the offending function.
    case e: Exception =>
      throw new QException("Error adding function: " + command, e)
  }
}
/**
* Checks the functions for missing values and the graph for cyclic dependencies and then runs the functions in the graph.
*/
def run(): Unit = runningLock.synchronized {
  if (running) {
    IOUtils.checkTempDir(settings.qSettings.tempDirectory)
    fillGraph
    // Captured right after the graph is filled, since validation is what counts missing values.
    val isReady = numMissingValues == 0
    val nothingToRun = jobGraph.edgeSet.isEmpty

    if (nothingToRun) {
      logger.warn("Nothing to run! Were any Functions added?")
    } else if (settings.getStatus) {
      // Status mode: report only, start nothing.
      logger.info("Checking pipeline status.")
      logStatus()
    } else if (dryRun) {
      dryRunJobs()
      if (running && isReady) {
        logger.info("Dry run completed successfully!")
        logger.info("Re-run with \"-run\" to execute the functions.")
      }
    } else if (isReady) {
      logger.info("Running jobs.")
      runJobs()
    }

    if (numMissingValues > 0) {
      logger.error("Total missing values: " + numMissingValues)
    }
  }
}
private def fillGraph: Unit = {
  // First pass: build, optionally render, and validate the graph as the script described it.
  logger.info("Generating graph.")
  fill()
  if (settings.dotFile != null) {
    renderToDot(settings.dotFile)
  }
  validate()

  // Second pass: swap every scatter-gatherable edge for the functions it expands into.
  if (running && numMissingValues == 0) {
    logger.info("Generating scatter gather jobs.")
    val scatterGathers = jobGraph.edgeSet.filter(edge => scatterGatherable(edge))

    var addedFunctions = List.empty[QFunction]
    scatterGathers.foreach { scatterGather =>
      // The casts are safe: scatterGatherable() only accepts these exact types.
      val functionEdge = scatterGather.asInstanceOf[FunctionEdge]
      val scatterable = functionEdge.function.asInstanceOf[ScatterGatherableFunction]
      val functions = scatterable.generateFunctions()
      if (debugMode) {
        logger.debug("Scattered into %d parts: %n%s".format(functions.size, functions.mkString(nl)))
      }
      addedFunctions ++= functions
    }

    logger.info("Removing original jobs.")
    jobGraph.removeAllEdges(scatterGathers)
    prune()

    logger.info("Adding scatter gather jobs.")
    for (function <- addedFunctions) {
      if (running) add(function)
    }

    logger.info("Regenerating graph.")
    fill()
    // The expanded graph goes to its own dot file when one is configured, else to the original.
    val scatterGatherDotFile =
      if (settings.expandedDotFile != null) {
        settings.expandedDotFile
      } else {
        settings.dotFile
      }
    if (scatterGatherDotFile != null) {
      renderToDot(scatterGatherDotFile)
    }
    validate()
  }
}
private def scatterGatherable(edge: QEdge): Boolean = edge match {
  // Only a function edge whose function is scatter-gatherable and has the flag set qualifies.
  case functionEdge: FunctionEdge =>
    functionEdge.function match {
      case scatterGather: ScatterGatherableFunction => scatterGather.scatterGatherable
      case _ => false
    }
  case _ => false
}
/**
* Walks up the graph looking for the previous function edges.
* @param edge Graph edge to examine for the previous functions.
* @return A list of prior function edges.
*/
private def previousFunctions(edge: QEdge): List[FunctionEdge] = {
  val source = jobGraph.getEdgeSource(edge)
  asScalaSet(jobGraph.incomingEdgesOf(source)).toList.flatMap {
    // A function edge ends the search along this path.
    case functionEdge: FunctionEdge => List(functionEdge)
    // Any other edge is looked through to the functions that precede it.
    case upstream: QEdge => previousFunctions(upstream)
  }
}
/**
* Walks down the graph looking for the next function edges.
* @param edge Graph edge to examine for the next functions.
* @return A list of the following function edges.
*/
private def nextFunctions(edge: QEdge): List[FunctionEdge] = {
  val target = jobGraph.getEdgeTarget(edge)
  asScalaSet(jobGraph.outgoingEdgesOf(target)).toList.flatMap {
    // A function edge ends the search along this path.
    case functionEdge: FunctionEdge => List(functionEdge)
    // Any other edge is looked through to the functions that follow it.
    case downstream: QEdge => nextFunctions(downstream)
  }
}
/**
* Fills in the graph using mapping functions, then removes out of date
* jobs, then cleans up mapping functions and nodes that aren't needed.
*/
private def fill(): Unit = {
  // Add mapping edges first; pruning afterwards drops whichever ones turned out to be filler.
  fillIn()
  prune()
}
/**
* Looks through functions with multiple inputs and outputs and adds mapping functions for single inputs and outputs.
*/
private def fillIn(): Unit = {
  // Iterate over a copy: edgeSet is backed by the graph, which is modified while we walk it.
  val snapshot = asScalaSet(jobGraph.edgeSet).clone
  for (edge <- snapshot) {
    if (running) {
      edge match {
        case cmd: FunctionEdge =>
          addCollectionOutputs(cmd.outputs)
          addCollectionInputs(cmd.inputs)
        case _: MappingEdge =>
          // Mapping edges need no further mapping.
      }
    }
  }
}
private def getReadyJobs(): Set[FunctionEdge] = {
  // Ready means every preceding function is done and this one is still pending.
  def ready(candidate: FunctionEdge): Boolean =
    previousFunctions(candidate).forall(_.status == RunnerStatus.DONE) &&
      candidate.status == RunnerStatus.PENDING

  asScalaSet(jobGraph.edgeSet).collect {
    case candidate: FunctionEdge if ready(candidate) => candidate
  }.toSet
}
/**
* Removes mapping edges that aren't being used, and nodes that don't belong to anything.
*/
private def prune(): Unit = {
  // Removing filler edges can expose more filler, so repeat until a pass finds none
  // (or the graph stops running).
  var keepPruning = true
  while (keepPruning) {
    val filler = jobGraph.edgeSet.filter(edge => isFiller(edge))
    keepPruning =
      if (filler.size > 0) {
        jobGraph.removeAllEdges(filler)
        running
      } else {
        false
      }
  }

  if (running) {
    // Drop the vertices nothing refers to any more, along with their nodeMap entries.
    val orphans = jobGraph.vertexSet.filter(node => isOrphan(node))
    orphans.foreach { orphan =>
      jobGraph.removeVertex(orphan)
      nodeMap -= orphan.files
    }
  }
}
/**
* Validates that the functions in the graph have no missing values and that there are no cycles.
*/
private def validate(): Unit = {
  // Report and count the missing field values of every function edge.
  for (edge <- asScalaSet(jobGraph.edgeSet)) {
    if (running) {
      edge match {
        case cmd: FunctionEdge =>
          val missingFieldValues = cmd.function.missingFields
          val missingCount = missingFieldValues.size
          if (missingCount > 0) {
            numMissingValues += missingCount
            logger.error("Missing %s values for function: %s".format(missingCount, cmd.function.description))
            missingFieldValues.foreach(missing => logger.error(" " + missing))
          }
        case _: MappingEdge =>
          // Mapping edges carry no fields to check.
      }
    }
  }

  // A cyclic dependency can never be scheduled, so it aborts the run outright.
  val detector = new CycleDetector(jobGraph)
  if (detector.detectCycles) {
    logger.error("Cycles were detected in the graph:")
    asScalaSet(detector.findCycles).foreach(cycle => logger.error(" " + cycle))
    throw new QException("Cycles were detected in the graph.")
  }
}
/**
* Dry-runs the jobs by traversing the graph.
*/
private def dryRunJobs(): Unit = {
  if (settings.startFromScratch)
    logger.info("Will remove outputs from previous runs.")
  // Dry run: refresh statuses without deleting any existing outputs.
  updateGraphStatus(false)

  // Walk the graph in waves: log every ready function and mark it done, which in turn
  // makes the functions depending on it ready for the next wave.
  var readyJobs = getReadyJobs()
  while (running && readyJobs.size > 0) {
    logger.debug("+++++++")
    foreachFunction(readyJobs.toList, edge => {
      if (running) {
        logEdge(edge)
        edge.markAsDone
      }
    })
    readyJobs = getReadyJobs()
  }
}
private def logEdge(edge: FunctionEdge): Unit = {
  val function = edge.function

  logger.info("-------")
  if (logger.isDebugEnabled) {
    logger.debug("Inputs: " + edge.inputs)
  }
  logger.info(StringUtils.capitalize(edge.status.toString) + ": " + function.description)
  if (logger.isDebugEnabled) {
    logger.debug(function.commandDirectory + " > " + function.description)
  }
  logger.info("Log: " + function.jobOutputFile.getAbsolutePath)
  // The error file is optional; the output file is always logged.
  if (function.jobErrorFile != null) {
    logger.info("Error: " + function.jobErrorFile.getAbsolutePath)
  }
}
/**
* Logs job statuses by traversing the graph and looking for status-related files
*/
private def logStatus(): Unit = {
  // Refresh the statuses without deleting outputs, then emit each status line at info level.
  updateGraphStatus(false)
  doStatus(line => logger.info(line))
}
/**
* Runs the jobs by traversing the graph.
*/
// Main scheduling loop: starts ready functions, polls running ones, tallies the results,
// retries and emails failures, and deletes intermediate outputs that are no longer needed.
// Any throwable is logged and rethrown; the final status email is attempted either way.
private def runJobs() {
try {
// Pick the job runner: the bsub flag forces "Lsf706", otherwise fall back to "Shell" when none is set.
if (settings.bsub)
settings.jobRunner = "Lsf706"
else if (settings.jobRunner == null)
settings.jobRunner = "Shell"
commandLineManager = commandLinePluginManager.createByName(settings.jobRunner)
if (settings.startFromScratch)
logger.info("Removing outputs from previous runs.")
// Real run: refreshing the statuses is allowed to delete outputs of edges reset to pending.
updateGraphStatus(true)
// Ready jobs are kept sorted by functionOrdering, so the head is always the next one to start.
var readyJobs = TreeSet.empty[FunctionEdge](functionOrdering)
readyJobs ++= getReadyJobs()
runningJobs = Set.empty[FunctionEdge]
var lastRunningCheck = System.currentTimeMillis
var logNextStatusCounts = true
// Each pass: start what is ready, wait if nothing is, then poll and account for finished jobs.
while (running && readyJobs.size + runningJobs.size > 0) {
var startedJobs = Set.empty[FunctionEdge]
var doneJobs = Set.empty[FunctionEdge]
var failedJobs = Set.empty[FunctionEdge]
// Start ready jobs until there are none left or the next running-status check is due.
while (running && readyJobs.size > 0 && !readyRunningCheck(lastRunningCheck)) {
val edge = readyJobs.head
edge.runner = newRunner(edge.function)
edge.start()
startedJobs += edge
readyJobs -= edge
logNextStatusCounts = true
}
runningJobs ++= startedJobs
statusCounts.pending -= startedJobs.size
statusCounts.running += startedJobs.size
// Only log the counts when something changed since they were last logged.
if (logNextStatusCounts)
logStatusCounts
logNextStatusCounts = false
deleteDoneIntermediates(lastRunningCheck)
// Nothing left to start but jobs still running: sleep until the next status check is due.
if (readyJobs.size == 0 && runningJobs.size > 0)
Thread.sleep(nextRunningCheck(lastRunningCheck))
lastRunningCheck = System.currentTimeMillis
updateStatus()
// Sort the running jobs by their refreshed status.
// NOTE(review): any status other than DONE/FAILED/RUNNING would raise a MatchError here.
runningJobs.foreach(edge => edge.status match {
case RunnerStatus.DONE => doneJobs += edge
case RunnerStatus.FAILED => failedJobs += edge
case RunnerStatus.RUNNING => /* do nothing while still running */
})
runningJobs --= doneJobs
runningJobs --= failedJobs
// Finished intermediate jobs become candidates for output cleanup unless intermediates are kept.
if (!settings.keepIntermediates)
intermediatesJobs ++= doneJobs.filter(_.function.isIntermediate)
statusCounts.running -= doneJobs.size
statusCounts.running -= failedJobs.size
statusCounts.done += doneJobs.size
statusCounts.failed += failedJobs.size
if (doneJobs.size > 0 || failedJobs.size > 0)
logNextStatusCounts = true
// Report failures, then reset the retryable ones to pending so they are picked up again below.
if (running && failedJobs.size > 0) {
emailFailedJobs(failedJobs)
checkRetryJobs(failedJobs)
}
readyJobs ++= getReadyJobs()
}
logStatusCounts
// -1 makes readyRunningCheck false, so this final cleanup is not cut short by the check timer.
deleteDoneIntermediates(-1)
} catch {
// Catches every throwable, but only to log it before rethrowing.
case e =>
logger.error("Uncaught error running jobs.", e)
throw e
} finally {
emailStatus()
}
}
// True when a previous check time is known (positive) and the wait until the next check has elapsed.
private def readyRunningCheck(lastRunningCheck: Long): Boolean =
  if (lastRunningCheck <= 0) false
  else nextRunningCheck(lastRunningCheck) <= 0
// Milliseconds left until 30 seconds have passed since lastRunningCheck, never negative.
private def nextRunningCheck(lastRunningCheck: Long): Long = {
  val elapsed = System.currentTimeMillis - lastRunningCheck
  math.max(0L, (30 * 1000L) - elapsed)
}
// Logs the current pending / running / failed / done tallies on one info line.
private def logStatusCounts: Unit = {
  val counts = statusCounts
  logger.info("%d Pend, %d Run, %d Fail, %d Done".format(counts.pending, counts.running, counts.failed, counts.done))
}
/**
* Updates the status of edges in the graph.
* @param cleanOutputs If true will delete outputs when setting edges to pending.
*/
private def updateGraphStatus(cleanOutputs: Boolean): Unit = {
  if (settings.startFromScratch) {
    // Starting over: every function goes back to pending.
    foreachFunction(function => function.resetToPending(cleanOutputs))
  } else {
    // Otherwise work out which functions are already done or can be skipped.
    traverseFunctions(function => checkDone(function, cleanOutputs))
  }
  // The second pass runs in both cases: it tallies the statuses and collects cleanup candidates.
  traverseFunctions(function => recheckDone(function))
}
/**
* First pass that checks if an edge is done or if it's an intermediate edge if it can be skipped.
* This function may modify the status of previous edges if it discovers that the edge passed in
* is dependent on jobs that were previously marked as skipped.
* @param edge Edge to check to see if it's done or can be skipped.
* @param cleanOutputs If true will delete outputs when setting edges to pending.
*/
private def checkDone(edge: FunctionEdge, cleanOutputs: Boolean): Unit = {
  if (!edge.function.isIntermediate) {
    val previous = previousFunctions(edge)
    // A non-intermediate edge counts as done only if it is done itself and everything
    // feeding it is either done or skipped.
    val isDone = edge.status == RunnerStatus.DONE &&
      previous.forall(prior => prior.status == RunnerStatus.DONE || prior.status == RunnerStatus.SKIPPED)
    if (!isDone) {
      edge.resetToPending(cleanOutputs)
      // Skipped predecessors have to run after all, since this edge needs their outputs.
      resetPreviousSkipped(edge, previous, cleanOutputs)
    }
  } else if (edge.status != RunnerStatus.DONE) {
    // Intermediate edges are skipped by default; a later edge may reset them to pending.
    edge.markAsSkipped()
  }
}
/**
* Second pass which
* a) Updates the status counts based on the function statuses
* b) Checks if the edge is a completed intermediate edge then adds it to the set of candidates for cleanup
* @param edge Edge to check to see if it's done or skipped.
*/
private def recheckDone(edge: FunctionEdge): Unit = {
  // Tally the edge; skipped edges are counted as done. No other status is expected here.
  edge.status match {
    case RunnerStatus.PENDING => statusCounts.pending += 1
    case RunnerStatus.FAILED => statusCounts.failed += 1
    case RunnerStatus.DONE | RunnerStatus.SKIPPED => statusCounts.done += 1
  }

  val finished = edge.status == RunnerStatus.DONE || edge.status == RunnerStatus.SKIPPED
  if (finished) {
    logger.debug("Already done: " + edge.function.description)
    // Finished intermediate edges become candidates for output cleanup.
    val cleanupCandidate = !settings.keepIntermediates && edge.function.isIntermediate
    if (cleanupCandidate) {
      intermediatesJobs += edge
    }
  }
}
/**
* Continues deleting the outputs of intermediate jobs that are no longer needed until it's time to recheck running status.
* @param lastRunningCheck The last time the status was checked.
*/
private def deleteDoneIntermediates(lastRunningCheck: Long): Unit = {
  def finished(next: FunctionEdge): Boolean = {
    val status = next.status
    status == RunnerStatus.DONE || status == RunnerStatus.SKIPPED
  }

  // Select everything deletable up front, before any outputs are removed.
  val deletable = intermediatesJobs.filter(edge => nextFunctions(edge).forall(next => finished(next)))

  // Delete until the graph stops running or the next running-status check comes due.
  deletable.foreach { edge =>
    if (running && !readyRunningCheck(lastRunningCheck)) {
      logger.debug("Deleting intermediates:" + edge.function.description)
      edge.function.deleteOutputs()
      intermediatesJobs -= edge
    }
  }
}
/**
* Returns the graph depth for the function.
* @param edge Function edge to get the depth for.
* @return the graph depth for the function.
*/
private def graphDepth(edge: FunctionEdge): Int = {
  // A negative depth means "not computed yet"; the result is cached on the edge.
  if (edge.depth < 0) {
    val parentDepths = previousFunctions(edge).map(parent => graphDepth(parent))
    // An edge with no preceding functions has depth 0; otherwise one more than its deepest predecessor.
    edge.depth = if (parentDepths.isEmpty) 0 else parentDepths.max + 1
  }
  edge.depth
}
/**
* From the previous edges, resets any that are marked as skipped to pending.
* If those that are reset have skipped edges, those skipped edges are recursively also set
* to pending.
* @param edge Dependent edge.
* @param previous Previous edges that provide inputs to edge.
* @param cleanOutputs If true will clean up the output files when resetting skipped jobs to pending.
*/
private def resetPreviousSkipped(edge: FunctionEdge, previous: List[FunctionEdge], cleanOutputs: Boolean): Unit = {
  // Select the skipped predecessors before resetting any of them, so a reset made while
  // recursing cannot change which predecessors are visited.
  val skipped = previous.filter(prior => prior.status == RunnerStatus.SKIPPED)
  skipped.foreach { previousEdge =>
    previousEdge.resetToPending(cleanOutputs)
    resetPreviousSkipped(previousEdge, previousFunctions(previousEdge), cleanOutputs)
  }
}
// Creates the runner for a function from the manager matching its kind:
// command line functions use commandLineManager, in-process functions use inProcessManager.
// Throws a QException for any other kind of function.
private def newRunner(f: QFunction) = {
  f match {
    case cmd: CommandLineFunction =>
      commandLineManager.create(cmd)
    case inProc: InProcessFunction =>
      inProcessManager.create(inProc)
    case _ =>
      throw new QException("Unexpected function: " + f)
  }
}
// Emails the given failed functions to the status recipients; does nothing when none are configured.
private def emailFailedJobs(failed: Set[FunctionEdge]): Unit = {
  val recipients = settings.statusEmailTo
  if (recipients.size > 0) {
    val message = new EmailMessage
    message.from = settings.statusEmailFrom
    message.to = recipients
    message.subject = "Queue function: Failure: " + settings.qSettings.jobNamePrefix
    addFailedFunctions(message, failed.toList)
    message.trySend(settings.qSettings.emailSettings)
  }
}
// Resets each restartable failed job that still has retries left back to pending;
// logs that it is giving up on the rest. Does nothing when retries are disabled.
private def checkRetryJobs(failed: Set[FunctionEdge]): Unit = {
  val maxRetries = settings.retries
  if (maxRetries > 0) {
    failed.foreach { failedJob =>
      val canRetry = failedJob.function.jobRestartable && failedJob.retries < maxRetries
      if (canRetry) {
        failedJob.retries += 1
        failedJob.resetToPending(true)
        logger.info("Reset for retry attempt %d of %d: %s".format(
          failedJob.retries, maxRetries, failedJob.function.description))
        // The job moves from the failed tally back to the pending one.
        statusCounts.failed -= 1
        statusCounts.pending += 1
      } else {
        logger.info("Giving up after retrying %d times: %s".format(
          maxRetries, failedJob.function.description))
      }
    }
  }
}
// Sends the end-of-run status email, with the failed functions listed when there are any.
// Does nothing when the graph is no longer running or no recipients are configured.
private def emailStatus(): Unit = {
  if (running && settings.statusEmailTo.size > 0) {
    // Gather the failed edges in traversal order (prepend while walking, reverse once after).
    var failedReversed = List.empty[FunctionEdge]
    foreachFunction(edge => {
      if (edge.status == RunnerStatus.FAILED) {
        failedReversed ::= edge
      }
    })
    val failed = failedReversed.reverse

    val message = new EmailMessage
    message.from = settings.statusEmailFrom
    message.to = settings.statusEmailTo
    message.body = getStatus + nl
    if (failed.isEmpty) {
      message.subject = "Queue run: Success: " + settings.qSettings.jobNamePrefix
    } else {
      message.subject = "Queue run: Failure: " + settings.qSettings.jobNamePrefix
      addFailedFunctions(message, failed)
    }
    message.trySend(settings.qSettings.emailSettings)
  }
}
// Appends a "Failed functions" section to the message body and attaches the existing log files.
private def addFailedFunctions(emailMessage: EmailMessage, failed: List[FunctionEdge]): Unit = {
  val logs = failed.flatMap(edge => logFiles(edge))
  if (emailMessage.body == null)
    emailMessage.body = ""

  // stripMargin drops everything up to each '|', so the template lines need no indentation.
  val template = """
|Failed functions:
|
|%s
|
|Logs:
|%s%n
|""".stripMargin.trim
  val descriptions = failed.map(edge => failedDescription(edge)).mkString(nl+nl)
  val logPaths = logs.map(_.getAbsolutePath).mkString(nl)

  emailMessage.body += template.format(descriptions, logPaths)
  emailMessage.attachments = logs
}
// Describes a failed function, prefixed with an "Attempt x of y." line when retries are enabled.
private def failedDescription(failed: FunctionEdge): String = {
  val attemptLine =
    if (settings.retries > 0) "Attempt %d of %d.%n".format(failed.retries + 1, settings.retries + 1)
    else ""
  attemptLine + failed.function.description
}
// Returns the function's output and error log files, keeping only those that are set and exist on disk.
private def logFiles(edge: FunctionEdge): List[File] = {
  val candidates = List(edge.function.jobOutputFile, edge.function.jobErrorFile)
  candidates.filter(file => file != null && file.exists)
}
/**
* Tracks analysis status.
*/
private class AnalysisStatus(val analysisName: String) {
  // Separate tallies for the plain jobs and for the scatter and gather parts of the analysis.
  val jobs = new GroupStatus
  val scatter = new GroupStatus
  val gather = new GroupStatus

  // Each aggregate is the sum of the three groups.
  def total = jobs.total + scatter.total + gather.total
  def done = jobs.done + scatter.done + gather.done
  def failed = jobs.failed + scatter.failed + gather.failed
  def skipped = jobs.skipped + scatter.skipped + gather.skipped
}
/**
* Tracks status of a group of jobs.
*/
private class GroupStatus {
  /** Number of edges counted for the group, whatever their status. */
  var total: Int = 0

  /** Number of counted edges whose status was DONE. */
  var done: Int = 0

  /** Number of counted edges whose status was FAILED. */
  var failed: Int = 0

  /** Number of counted edges whose status was SKIPPED. */
  var skipped: Int = 0
}
/**
 * Collects the status report into a single string.
 * Every line reported by doStatus is appended, followed by nl.
 * @return The accumulated status lines.
 */
private def getStatus = {
  val report = new StringBuilder
  doStatus(line => {
    report.append(line)
    report.append(nl)
  })
  report.toString
}
/**
 * Summarizes the function edges per analysis name and reports one formatted line for each.
 *
 * Edges whose function has a null analysisName are left out. Lines are reported in the
 * order in which each analysis name is first encountered by foreachFunction. Every line
 * holds the left-justified analysis name and the bracketed summary status, followed by the
 * job counts when there is more than one job, and by the scatter ("s:") and gather ("g:")
 * counts when there is more than one scatter/gather edge in total.
 *
 * The summary status is FAILED if anything failed, else DONE if all gathers and all jobs
 * are done, else SKIPPED if everything is either done or skipped, else RUNNING if at least
 * one edge is done, else PENDING.
 *
 * @param statusFunc Callback invoked once per analysis with its formatted status line.
 */
private def doStatus(statusFunc: String => Unit) = {
  // Keyed by analysis name so each edge is a direct lookup; insertion order keeps the
  // report in first-seen order.
  // NOTE(review): assumes new AnalysisStatus(name).analysisName == name -- confirm.
  val statuses = new scala.collection.mutable.LinkedHashMap[String, AnalysisStatus]
  foreachFunction(edge => {
    val name = edge.function.analysisName
    if (name != null)
      updateAnalysisStatus(statuses.getOrElseUpdate(name, new AnalysisStatus(name)), edge)
  })

  // java.util.Formatter rejects "%-0s" because the '-' flag needs a width, so the name
  // column is never allowed to shrink below one character (all-empty analysis names).
  val nameWidth = statuses.keys.foldLeft(1)((width, name) => width max name.length)
  val lineFormat = "%-" + nameWidth + "s %7s"

  for (status <- statuses.values) {
    val summaryStatus =
      if (status.failed > 0)
        RunnerStatus.FAILED
      else if (status.gather.done == status.gather.total && status.jobs.done == status.jobs.total)
        RunnerStatus.DONE
      else if (status.done + status.skipped == status.total)
        RunnerStatus.SKIPPED
      else if (status.done > 0)
        RunnerStatus.RUNNING
      else
        RunnerStatus.PENDING

    val header = lineFormat.format(status.analysisName, "[" + summaryStatus.toString + "]")

    // Counts are only worth printing when a group has more than a single member.
    val jobsInfo =
      if (status.jobs.total > 1) formatGroupStatus(status.jobs) else ""
    val scatterGatherInfo =
      if (status.scatter.total + status.gather.total > 1)
        formatGroupStatus(status.scatter, "s:") + formatGroupStatus(status.gather, "g:")
      else
        ""

    statusFunc(header + jobsInfo + scatterGatherInfo)
  }
}
Queue now does job tracking (replace -run with -status in the command line). Produces output that looks like: INFO 20:58:17,827 QCommandLine - Checking pipeline status INFO 20:58:23,234 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_MergeIndels [DONE] INFO 20:58:23,236 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_158.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,237 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_929.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,238 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_SNP_calls [NOT DONE] 5t/0d/0r/5p/0f INFO 20:58:23,239 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_HandFilter [NOT DONE] INFO 20:58:23,240 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_1122.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,240 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_VariantRecalibrator [NOT DONE] INFO 20:58:23,241 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_913.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,242 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_2037.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,243 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_VariantEval [NOT DONE] INFO 20:58:23,244 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_Cluster [NOT DONE] INFO 20:58:23,245 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_106.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,246 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_Cluster_and_Indel_filter [NOT DONE] INFO 20:58:23,247 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_ApplyVariantCuts [NOT DONE] INFO 20:58:23,248 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_GenomicAnnotator [NOT DONE] INFO 20:58:23,248 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_1713.bam [DONE] 5t/5d/0r/0p/0f git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4340 348d0f76-0448-11de-a6fe-93d51630548a
2010-09-24 08:59:09 +08:00
/**
 * Counts an edge in the group of its analysis that matches the kind of function it runs.
 * Scatter and clone functions are tallied under scatter, gather functions under gather,
 * and every other function under jobs.
 * @param stats Analysis status whose group counters are updated.
 * @param edge Function edge to count.
 */
private def updateAnalysisStatus(stats: AnalysisStatus, edge: FunctionEdge) {
  val group = edge.function match {
    case _: ScatterFunction | _: CloneFunction => stats.scatter
    case _: GatherFunction => stats.gather
    case _ => stats.jobs
  }
  updateGroupStatus(group, edge)
}
Queue now does job tracking (replace -run with -status in the command line). Produces output that looks like: INFO 20:58:17,827 QCommandLine - Checking pipeline status INFO 20:58:23,234 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_MergeIndels [DONE] INFO 20:58:23,236 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_158.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,237 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_929.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,238 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_SNP_calls [NOT DONE] 5t/0d/0r/5p/0f INFO 20:58:23,239 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_HandFilter [NOT DONE] INFO 20:58:23,240 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_1122.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,240 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_VariantRecalibrator [NOT DONE] INFO 20:58:23,241 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_913.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,242 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_2037.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,243 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_VariantEval [NOT DONE] INFO 20:58:23,244 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_Cluster [NOT DONE] INFO 20:58:23,245 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_106.bam [DONE] 5t/5d/0r/0p/0f INFO 20:58:23,246 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_Cluster_and_Indel_filter [NOT DONE] INFO 20:58:23,247 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_ApplyVariantCuts [NOT DONE] INFO 20:58:23,248 QGraph$$anonfun$formatStatus$1 - Height_Hirschhorn_NHGRI.uncleaned_GenomicAnnotator [NOT DONE] INFO 20:58:23,248 QGraph$$anonfun$formatStatus$1 - IndelGenotyper_1713.bam [DONE] 5t/5d/0r/0p/0f git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4340 348d0f76-0448-11de-a6fe-93d51630548a
2010-09-24 08:59:09 +08:00
/**
 * Adds one edge to a group tally.
 * The total always grows by one; done, failed or skipped grows by one when the edge has
 * the matching status.
 * @param groupStatus Tally to update.
 * @param edge Function edge whose status is counted.
 */
private def updateGroupStatus(groupStatus: GroupStatus, edge: FunctionEdge) {
  groupStatus.total += 1
  edge.status match {
    case RunnerStatus.DONE =>
      groupStatus.done += 1
    case RunnerStatus.FAILED =>
      groupStatus.failed += 1
    case RunnerStatus.SKIPPED =>
      groupStatus.skipped += 1
    // Pending and running cannot be told apart right now, so both only count towards the total.
    case RunnerStatus.PENDING | RunnerStatus.RUNNING =>
  }
}
/**
 * Renders the total, done and failed counts of a group as a short string.
 * For example, prefix "s:" with 5 total, 3 done and 1 failed gives " s:5t/3d/1f".
 * The skipped count is not part of the output.
 * @param stats Group tally to render.
 * @param prefix Text placed directly before the total; empty by default.
 * @return The formatted counts, starting with a single space.
 */
private def formatGroupStatus(stats: GroupStatus, prefix: String = "") = {
  val template = " %s%dt/%dd/%df"
  template.format(prefix, stats.total, stats.done, stats.failed)
}
/**
 * Creates an empty directed graph of QNode vertices and QEdge edges.
 * The graph is given an edge factory that builds a MappingEdge between the two nodes,
 * so edges the graph has to create by itself are mapping edges.
 * @return A new graph
 */
private def newGraph = {
  val mappingEdgeFactory = new EdgeFactory[QNode, QEdge] {
    def createEdge(input: QNode, output: QNode) = new MappingEdge(input, output)
  }
  new SimpleDirectedGraph[QNode, QEdge](mappingEdgeFactory)
}
/**
 * Returns the node registered for a list of files, creating it on first use.
 * A new node takes nextNodeId as its id, is added to the job graph as a vertex and is
 * remembered in nodeMap; nextNodeId is then advanced. A debug message is logged whenever
 * the id about to be used is a multiple of 100.
 * @param files Files identifying the node.
 * @return The existing or newly created node.
 */
private def getQNode(files: List[File]) = {
  nodeMap.getOrElse(files, {
    if (nextNodeId % 100 == 0)
      logger.debug("adding QNode: " + nextNodeId)
    val created = new QNode(nextNodeId, files)
    nextNodeId += 1
    jobGraph.addVertex(created)
    nodeMap += files -> created
    created
  })
}
/**
 * Puts an edge into the job graph between its inputs node and its outputs node.
 * removeAllEdges is called for that pair of nodes first, so the new edge replaces
 * whatever was connecting them.
 * @param edge Edge to add.
 */
private def addEdge(edge: QEdge) {
  val source = edge.inputs
  val target = edge.outputs
  jobGraph.removeAllEdges(source, target)
  jobGraph.addEdge(source, target, edge)
}
/**
 * Adds input mappings between the individual files of a node and the node itself.
 * For each file the single-file node is looked up (or created) and, unless the graph
 * already contains that edge, a MappingEdge from the single-file node to inputs is added.
 * Nodes holding at most one file are left alone, and a file is skipped if running is
 * false when it is reached.
 * @param inputs Input node.
 */
private def addCollectionInputs(inputs: QNode) {
  val members = inputs.files
  if (members.size > 1)
    members.foreach(file => {
      if (running) {
        val single = getQNode(List(file))
        if (!jobGraph.containsEdge(single, inputs))
          addEdge(new MappingEdge(single, inputs))
      }
    })
}
/**
 * Adds output mappings between a node and its individual files.
 * For each file the single-file node is looked up (or created) and, unless the graph
 * already contains that edge, a MappingEdge from outputs to the single-file node is added.
 * Nodes holding at most one file are left alone, and a file is skipped if running is
 * false when it is reached.
 * @param outputs Output node.
 */
private def addCollectionOutputs(outputs: QNode) {
  val members = outputs.files
  if (members.size > 1)
    members.foreach(file => {
      if (running) {
        val single = getQNode(List(file))
        if (!jobGraph.containsEdge(outputs, single))
          addEdge(new MappingEdge(outputs, single))
      }
    })
}
/**
 * Tells whether an edge is a mapping edge that leads nowhere.
 * That is the case when the edge is a MappingEdge, its target node has no outgoing edges
 * and its source node has no incoming edges. Any other kind of edge is never filler.
 * @param edge Edge to check.
 * @return true if the edge is not needed in the graph.
 */
private def isFiller(edge: QEdge) = {
  edge match {
    case _: MappingEdge =>
      val target = jobGraph.getEdgeTarget(edge)
      // The source is only inspected when the target turned out to be a dead end.
      jobGraph.outgoingEdgesOf(target).isEmpty &&
        jobGraph.incomingEdgesOf(jobGraph.getEdgeSource(edge)).isEmpty
    case _ =>
      false
  }
}
/**
 * Tells whether a node has neither incoming nor outgoing edges in the job graph.
 * @param node Node (set of files) to check.
 * @return true if this set of files is not needed in the graph.
 */
private def isOrphan(node: QNode) = {
  jobGraph.incomingEdgesOf(node).isEmpty && jobGraph.outgoingEdgesOf(node).isEmpty
}
/**
 * Utility function for running a method over all function edges in the job graph.
 * Edges that are not a FunctionEdge are left out; the remaining edges are handed to the
 * list-based overload, which does the actual iteration.
 * @param f Function to run for each FunctionEdge.
 */
private def foreachFunction(f: (FunctionEdge) => Unit) {
  val functionEdges = jobGraph.edgeSet.toList.collect {
    case functionEdge: FunctionEdge => functionEdge
  }
  foreachFunction(functionEdges, f)
}
/**
 * Utility function for running a method over a list of function edges.
 * The edges are visited in the order given by functionOrdering, and an edge is skipped
 * if running is false at the moment it is reached.
 * @param edges Edges to traverse.
 * @param f Function to run for each FunctionEdge.
 */
private def foreachFunction(edges: List[FunctionEdge], f: (FunctionEdge) => Unit) {
  for (edge <- edges.sorted(functionOrdering)) {
    if (running)
      f(edge)
  }
}
/**
 * Utility function for running a method over all functions, but traversing the nodes in order of dependency.
 * A TopologicalOrderIterator over the job graph is drained; a listener attached to it calls f
 * for every traversed FunctionEdge while running is true. Mapping edges are ignored.
 * @param f Function to run for each FunctionEdge.
 */
private def traverseFunctions(f: (FunctionEdge) => Unit) {
  val edgeListener = new TraversalListenerAdapter[QNode, QEdge] {
    override def edgeTraversed(event: EdgeTraversalEvent[QNode, QEdge]) = {
      if (running) {
        event.getEdge match {
          case functionEdge: FunctionEdge => f(functionEdge)
          case map: MappingEdge => /* do nothing for mapping functions */
        }
      }
    }
  }
  val ordered = new TopologicalOrderIterator(this.jobGraph)
  ordered.addTraversalListener(edgeListener)
  // The visited nodes themselves are not needed; walking the iterator is what fires the listener.
  while (ordered.hasNext)
    ordered.next()
}
/**
 * Outputs the graph to a .dot file.
 * http://en.wikipedia.org/wiki/DOT_language
 *
 * Nodes are named with an IntegerNameProvider; edges are labelled with their dotString,
 * with double quotes escaped, or with an empty label when dotString is null.
 * The writer is closed even when the export fails.
 * @param file Path to output the .dot file.
 */
private def renderToDot(file: java.io.File) {
  // todo -- we need a nice way to visualize the key pieces of information about commands.  Perhaps a
  // todo -- visualizeString() command, or something that shows inputs / outputs
  val edgeNames = new org.jgrapht.ext.EdgeNameProvider[QEdge] {
    def getEdgeName(function: QEdge) = if (function.dotString == null) "" else function.dotString.replace("\"", "\\\"")
  }
  val exporter = new DOTExporter(new org.jgrapht.ext.IntegerNameProvider[QNode](), null, edgeNames)

  val out = new java.io.FileWriter(file)
  try {
    exporter.export(out, jobGraph)
  } finally {
    // Release the file handle no matter how the export ended.
    out.close()
  }
}
/**
 * Returns true if no functions have missing values nor a status of failed.
 * Missing values always mean failure; otherwise a dry run always counts as a success,
 * and a real run succeeds when no FunctionEdge in the job graph has the status FAILED.
 * @return true if no functions have missing values nor a status of failed.
 */
def success = {
  if (numMissingValues > 0) {
    false
  } else if (this.dryRun) {
    true
  } else {
    val anyFailed = this.jobGraph.edgeSet.exists {
      case functionEdge: FunctionEdge => functionEdge.status == RunnerStatus.FAILED
      case _ => false
    }
    !anyFailed
  }
}
/**
 * Passes every function edge whose status is FAILED to logEdge.
 */
def logFailed() {
  foreachFunction(edge => edge.status match {
    case RunnerStatus.FAILED => logEdge(edge)
    case _ =>
  })
}
/**
 * Asks each job manager to refresh the status of the currently running job runners it handles.
 * Null managers are skipped. A manager receives the runners of runningJobs whose class is
 * assignable to the manager's runnerType, and is only called when there is at least one.
 * The update is best effort: a failing manager does not stop the others, but the failure
 * is logged at debug level instead of vanishing without a trace.
 */
private def updateStatus() {
  val runners = runningJobs.map(_.runner)
  for (mgr <- managers) {
    if (mgr != null) {
      val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]]
      val managerRunners = runners
        .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass))
        .asInstanceOf[Set[JobRunner[QFunction]]]
      if (managerRunners.size > 0)
        try {
          manager.updateStatus(managerRunners)
        } catch {
          // Deliberately catches everything, as before: a status refresh must never abort the caller.
          case e: Throwable =>
            logger.debug("Unable to update the status of " + managerRunners.size + " job runner(s): " + e)
        }
    }
  }
}
/**
 * Returns true if the graph was shutdown instead of exiting on its own.
 * This is the negation of the running flag, which shutdown() sets to false.
 * @return true when running is false.
 */
def isShutdown = !running
/**
 * Kills any forked jobs still running.
 *
 * Sets running to false and then, while holding runningLock, empties runningJobs, asks every
 * non-null manager to stop the runners whose class is assignable to its runnerType, and calls
 * cleanup() on each of those runners. Stopping and cleaning up are best effort: a failure is
 * logged at debug level and the remaining runners and managers are still processed.
 */
def shutdown() {
  // Signal the main thread to shutdown.
  running = false

  // Wait for the thread to finish and exit normally.
  runningLock.synchronized {
    val runners = runningJobs.map(_.runner)
    runningJobs = Set.empty[FunctionEdge]
    for (mgr <- managers) {
      if (mgr != null) {
        val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]]
        val managerRunners = runners
          .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass))
          .asInstanceOf[Set[JobRunner[QFunction]]]
        if (managerRunners.size > 0)
          try {
            manager.tryStop(managerRunners)
          } catch {
            // Deliberately catches everything, as before: shutdown has to carry on with the cleanup.
            case e: Throwable =>
              logger.debug("Unable to stop " + managerRunners.size + " job runner(s): " + e)
          }
        for (runner <- managerRunners) {
          try {
            runner.cleanup()
          } catch {
            // One runner failing to clean up must not keep the others from being cleaned up.
            case e: Throwable =>
              logger.debug("Unable to clean up a job runner: " + e)
          }
        }
      }
    }
  }
}
}