Moved some shutdown logic from the LSF job runner into the QGraph.
Because of Java's type erasure JobManagers must provide runtime access to the runner class to shutdown. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5076 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6ac888d26a
commit
ce5b11317b
|
|
@ -3,5 +3,7 @@ package org.broadinstitute.sting.queue.engine
|
||||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||||
|
|
||||||
class InProcessJobManager extends JobManager[InProcessFunction, InProcessRunner] {
|
class InProcessJobManager extends JobManager[InProcessFunction, InProcessRunner] {
|
||||||
|
def runnerType = classOf[InProcessRunner]
|
||||||
|
def functionType = classOf[InProcessFunction]
|
||||||
def create(function: InProcessFunction) = new InProcessRunner(function)
|
def create(function: InProcessFunction) = new InProcessRunner(function)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,21 @@ import org.broadinstitute.sting.queue.function.QFunction
|
||||||
* Creates and stops JobRunners
|
* Creates and stops JobRunners
|
||||||
*/
|
*/
|
||||||
trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] {
|
trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] {
|
||||||
|
/** The class type of the runner. Available at runtime even after erasure. */
|
||||||
|
def functionType: Class[TFunction]
|
||||||
|
|
||||||
|
/** The class type of the functions processed by the runner. Available at runtime even after erasure. */
|
||||||
|
def runnerType: Class[TRunner]
|
||||||
|
|
||||||
|
/** Creates a new runner.
|
||||||
|
* @param function Function for the runner.
|
||||||
|
*/
|
||||||
def create(function: TFunction): TRunner
|
def create(function: TFunction): TRunner
|
||||||
def tryStop(runners: List[JobRunner[_]]) = {}
|
|
||||||
|
/**
|
||||||
|
* Stops a list of functions.
|
||||||
|
* @param runner Runners to stop.
|
||||||
|
*/
|
||||||
|
def tryStop(runners: List[TRunner]) {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@ import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||||
* Creates and stops Lsf706JobRunners
|
* Creates and stops Lsf706JobRunners
|
||||||
*/
|
*/
|
||||||
class Lsf706JobManager extends JobManager[CommandLineFunction, Lsf706JobRunner] {
|
class Lsf706JobManager extends JobManager[CommandLineFunction, Lsf706JobRunner] {
|
||||||
|
def runnerType = classOf[Lsf706JobRunner]
|
||||||
|
def functionType = classOf[CommandLineFunction]
|
||||||
def create(function: CommandLineFunction) = new Lsf706JobRunner(function)
|
def create(function: CommandLineFunction) = new Lsf706JobRunner(function)
|
||||||
override def tryStop(runners: List[JobRunner[_]]) = Lsf706JobRunner.tryStop(runners)
|
override def tryStop(runners: List[Lsf706JobRunner]) { Lsf706JobRunner.tryStop(runners) }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -182,32 +182,24 @@ object Lsf706JobRunner extends Logging {
|
||||||
* Tries to stop any running jobs.
|
* Tries to stop any running jobs.
|
||||||
* @param runners Runners to stop.
|
* @param runners Runners to stop.
|
||||||
*/
|
*/
|
||||||
def tryStop(runners: List[JobRunner[_]]) = {
|
def tryStop(runners: List[Lsf706JobRunner]) {
|
||||||
val lsfJobRunners = runners.filter(_.isInstanceOf[Lsf706JobRunner]).map(_.asInstanceOf[Lsf706JobRunner])
|
for (jobRunners <- runners.filterNot(_.jobId < 0).grouped(10)) {
|
||||||
if (lsfJobRunners.size > 0) {
|
try {
|
||||||
for (jobRunners <- lsfJobRunners.filterNot(_.jobId < 0).grouped(10)) {
|
val njobs = jobRunners.size
|
||||||
try {
|
val signalJobs = new signalBulkJobs
|
||||||
val njobs = jobRunners.size
|
signalJobs.jobs = {
|
||||||
val signalJobs = new signalBulkJobs
|
val jobIds = new Memory(8 * njobs)
|
||||||
signalJobs.jobs = {
|
jobIds.write(0, jobRunners.map(_.jobId).toArray, 0, njobs)
|
||||||
val p = new Memory(8 * njobs)
|
jobIds
|
||||||
p.write(0, jobRunners.map(_.jobId).toArray, 0, njobs)
|
}
|
||||||
p
|
signalJobs.njobs = njobs
|
||||||
}
|
signalJobs.signal = 9
|
||||||
signalJobs.njobs = njobs
|
|
||||||
signalJobs.signal = 9
|
|
||||||
|
|
||||||
if (LibBat.lsb_killbulkjobs(signalJobs) < 0)
|
if (LibBat.lsb_killbulkjobs(signalJobs) < 0)
|
||||||
throw new QException(LibBat.lsb_sperror("lsb_killbulkjobs failed"))
|
throw new QException(LibBat.lsb_sperror("lsb_killbulkjobs failed"))
|
||||||
} catch {
|
} catch {
|
||||||
case e =>
|
case e =>
|
||||||
logger.error("Unable to kill all jobs.", e)
|
logger.error("Unable to kill all jobs.", e)
|
||||||
}
|
|
||||||
try {
|
|
||||||
jobRunners.foreach(_.removeTemporaryFiles())
|
|
||||||
} catch {
|
|
||||||
case e => /* ignore */
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -177,14 +177,14 @@ class QGraph extends Logging {
|
||||||
case f: FunctionEdge =>
|
case f: FunctionEdge =>
|
||||||
this.previousFunctions(f).forall(_.status == RunnerStatus.DONE) && f.status == RunnerStatus.PENDING
|
this.previousFunctions(f).forall(_.status == RunnerStatus.DONE) && f.status == RunnerStatus.PENDING
|
||||||
case _ => false
|
case _ => false
|
||||||
}.map(_.asInstanceOf[FunctionEdge]).toList.sortWith(compare(_,_))
|
}.toList.asInstanceOf[List[FunctionEdge]].sortWith(compare(_,_))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def getRunningJobs = {
|
private def getRunningJobs = {
|
||||||
jobGraph.edgeSet.filter{
|
jobGraph.edgeSet.filter{
|
||||||
case f: FunctionEdge => f.status == RunnerStatus.RUNNING
|
case f: FunctionEdge => f.status == RunnerStatus.RUNNING
|
||||||
case _ => false
|
case _ => false
|
||||||
}.map(_.asInstanceOf[FunctionEdge]).toList.sortWith(compare(_,_))
|
}.toList.asInstanceOf[List[FunctionEdge]].sortWith(compare(_,_))
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -238,7 +238,14 @@ class QGraph extends Logging {
|
||||||
*/
|
*/
|
||||||
private def dryRunJobs() = {
|
private def dryRunJobs() = {
|
||||||
updateGraphStatus(false)
|
updateGraphStatus(false)
|
||||||
traverseFunctions(edge => logEdge(edge))
|
var readyJobs = getReadyJobs
|
||||||
|
while (!shuttingDown && readyJobs.size > 0) {
|
||||||
|
readyJobs.foreach(edge => {
|
||||||
|
logEdge(edge)
|
||||||
|
edge.markAsDone
|
||||||
|
})
|
||||||
|
readyJobs = getReadyJobs
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def logEdge(edge: FunctionEdge) = {
|
private def logEdge(edge: FunctionEdge) = {
|
||||||
|
|
@ -390,7 +397,7 @@ class QGraph extends Logging {
|
||||||
private def checkRetryJobs(failed: List[FunctionEdge]) = {
|
private def checkRetryJobs(failed: List[FunctionEdge]) = {
|
||||||
if (settings.retries > 0) {
|
if (settings.retries > 0) {
|
||||||
for (failedJob <- failed) {
|
for (failedJob <- failed) {
|
||||||
if (failedJob.retries < settings.retries) {
|
if (failedJob.function.jobRestartable && failedJob.retries < settings.retries) {
|
||||||
failedJob.retries += 1
|
failedJob.retries += 1
|
||||||
failedJob.resetToPending(true)
|
failedJob.resetToPending(true)
|
||||||
logger.info("Reset for retry attempt %d of %d: %s".format(
|
logger.info("Reset for retry attempt %d of %d: %s".format(
|
||||||
|
|
@ -660,10 +667,10 @@ class QGraph extends Logging {
|
||||||
*/
|
*/
|
||||||
private def foreachFunction(f: (FunctionEdge) => Unit) = {
|
private def foreachFunction(f: (FunctionEdge) => Unit) = {
|
||||||
jobGraph.edgeSet.toList
|
jobGraph.edgeSet.toList
|
||||||
.filter(_.isInstanceOf[FunctionEdge])
|
.filter(_.isInstanceOf[FunctionEdge])
|
||||||
.map(_.asInstanceOf[FunctionEdge])
|
.asInstanceOf[List[FunctionEdge]]
|
||||||
.sortWith(compare(_,_))
|
.sortWith(compare(_,_))
|
||||||
.foreach(f(_))
|
.foreach(f(_))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def compare(f1: FunctionEdge, f2: FunctionEdge): Boolean =
|
private def compare(f1: FunctionEdge, f2: FunctionEdge): Boolean =
|
||||||
|
|
@ -759,8 +766,25 @@ class QGraph extends Logging {
|
||||||
*/
|
*/
|
||||||
def shutdown() {
|
def shutdown() {
|
||||||
shuttingDown = true
|
shuttingDown = true
|
||||||
val runningJobs = getRunningJobs
|
val runners = getRunningJobs.map(_.runner)
|
||||||
if (commandLineManager != null && !runningJobs.isEmpty)
|
val manager = commandLineManager.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]]
|
||||||
commandLineManager.tryStop(runningJobs.map(_.runner))
|
if (manager != null) {
|
||||||
|
val managerRunners = runners
|
||||||
|
.filter(runner => manager.runnerType.isAssignableFrom(runner.getClass))
|
||||||
|
.asInstanceOf[List[JobRunner[QFunction]]]
|
||||||
|
if (managerRunners.size > 0)
|
||||||
|
try {
|
||||||
|
manager.tryStop(managerRunners)
|
||||||
|
} catch {
|
||||||
|
case e => /* ignore */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runners.foreach(runner =>
|
||||||
|
try {
|
||||||
|
runner.removeTemporaryFiles()
|
||||||
|
} catch {
|
||||||
|
case e => /* ignore */
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,5 +3,7 @@ package org.broadinstitute.sting.queue.engine
|
||||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||||
|
|
||||||
class ShellJobManager extends JobManager[CommandLineFunction, ShellJobRunner] {
|
class ShellJobManager extends JobManager[CommandLineFunction, ShellJobRunner] {
|
||||||
|
def runnerType = classOf[ShellJobRunner]
|
||||||
|
def functionType = classOf[CommandLineFunction]
|
||||||
def create(function: CommandLineFunction) = new ShellJobRunner(function)
|
def create(function: CommandLineFunction) = new ShellJobRunner(function)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue