Grid Engine backend to GATK-Queue, initial commit of implementation
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5788 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dfdef2d29b
commit
16db86e6cb
|
|
@ -36,7 +36,7 @@ class QSettings {
|
||||||
var jobNamePrefix: String = QSettings.processNamePrefix
|
var jobNamePrefix: String = QSettings.processNamePrefix
|
||||||
|
|
||||||
@Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false)
|
@Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false)
|
||||||
var jobProject: String = "Queue"
|
var jobProject: String = _
|
||||||
|
|
||||||
@Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false)
|
@Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false)
|
||||||
var jobQueue: String = _
|
var jobQueue: String = _
|
||||||
|
|
|
||||||
|
|
@ -353,6 +353,8 @@ class QGraph extends Logging {
|
||||||
try {
|
try {
|
||||||
if (settings.bsub)
|
if (settings.bsub)
|
||||||
settings.jobRunner = "Lsf706"
|
settings.jobRunner = "Lsf706"
|
||||||
|
else if (settings.qsub)
|
||||||
|
settings.jobRunner = "GridEngine"
|
||||||
else if (settings.jobRunner == null)
|
else if (settings.jobRunner == null)
|
||||||
settings.jobRunner = "Shell"
|
settings.jobRunner = "Shell"
|
||||||
commandLineManager = commandLinePluginManager.createByName(settings.jobRunner)
|
commandLineManager = commandLinePluginManager.createByName(settings.jobRunner)
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,9 @@ class QGraphSettings {
|
||||||
@Argument(fullName="bsub", shortName="bsub", doc="Equivalent to -jobRunner Lsf706", required=false)
|
@Argument(fullName="bsub", shortName="bsub", doc="Equivalent to -jobRunner Lsf706", required=false)
|
||||||
var bsub = false
|
var bsub = false
|
||||||
|
|
||||||
|
@Argument(fullName="qsub", shortName="qsub", doc="Equivalent to -jobRunner GridEngine", required=false)
|
||||||
|
var qsub = false
|
||||||
|
|
||||||
@Argument(fullName="status",shortName="status",doc="Get status of jobs for the qscript",required=false)
|
@Argument(fullName="status",shortName="status",doc="Get status of jobs for the qscript",required=false)
|
||||||
var getStatus = false
|
var getStatus = false
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,79 +24,129 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.queue.engine.gridengine
|
package org.broadinstitute.sting.queue.engine.gridengine
|
||||||
|
|
||||||
import org.broadinstitute.sting.queue.util.Logging
|
import org.broadinstitute.sting.queue.QException
|
||||||
|
import org.broadinstitute.sting.queue.util.{Logging,Retry}
|
||||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||||
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
|
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
|
||||||
|
import org.ggf.drmaa.{DrmaaException,JobInfo,JobTemplate,Session,SessionFactory}
|
||||||
|
import java.util.Collections
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs jobs on a Grid Engine compute cluster.
|
||||||
|
*/
|
||||||
class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLineJobRunner with Logging {
|
class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLineJobRunner with Logging {
|
||||||
// Run the static initializer for GridEngineJobRunner
|
// Run the static initializer for GridEngineJobRunner
|
||||||
GridEngineJobRunner
|
GridEngineJobRunner
|
||||||
|
|
||||||
def start() = {
|
/** Job Id of the currently executing job. */
|
||||||
// TODO: Copy settings from function to GridEngine syntax.
|
private var jobId: String = _
|
||||||
/*
|
|
||||||
val gridEngineJob = new ...
|
|
||||||
|
|
||||||
// Set the display name to 4000 characters of the description (or whatever the GE max is)
|
/** Last known status */
|
||||||
gridEngineJob.displayName = function.description.take(4000)
|
private var lastStatus: RunnerStatus.Value = _
|
||||||
|
|
||||||
// Set the output file for stdout
|
/** The last time the status was updated */
|
||||||
gridEngineJob.outputFile = function.jobOutputFile.getPath
|
protected var lastStatusUpdate: Long = _
|
||||||
|
|
||||||
// Set the current working directory
|
def start() {
|
||||||
gridEngineJob.workingDirectory = function.commandDirectory.getPath
|
GridEngineJobRunner.gridEngineSession.synchronized {
|
||||||
|
val gridEngineJob: JobTemplate = GridEngineJobRunner.gridEngineSession.createJobTemplate
|
||||||
|
|
||||||
// If the error file is set specify the separate output for stderr
|
// Force the remote environment to inherit local environment settings
|
||||||
if (function.jobErrorFile != null) {
|
var nativeSpecString: String = "-V"
|
||||||
gridEngineJob.errFile = function.jobErrorFile.getPath
|
|
||||||
|
// Set the display name to 1024 characters of the description
|
||||||
|
gridEngineJob.setJobName(function.description.take(1024))
|
||||||
|
|
||||||
|
// Set the output file for stdout
|
||||||
|
gridEngineJob.setOutputPath(":" + function.jobOutputFile.getPath)
|
||||||
|
|
||||||
|
// Set the current working directory
|
||||||
|
gridEngineJob.setWorkingDirectory(function.commandDirectory.getPath)
|
||||||
|
|
||||||
|
// If the error file is set specify the separate output for stderr
|
||||||
|
// Otherwise join with stdout
|
||||||
|
if (Option(function.jobErrorFile) != None) {
|
||||||
|
gridEngineJob.setErrorPath(":" + function.jobErrorFile.getPath)
|
||||||
|
} else {
|
||||||
|
gridEngineJob.setJoinFiles(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a project name is set specify the project name
|
||||||
|
if (Option(function.jobProject) != None) {
|
||||||
|
nativeSpecString += " -P " + function.jobProject
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the job queue is set specify the job queue
|
||||||
|
if (Option(function.jobQueue) != None) {
|
||||||
|
nativeSpecString += " -q " + function.jobQueue
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the memory limit is set (GB) specify the memory limit
|
||||||
|
if (function.memoryLimit.isDefined) {
|
||||||
|
val memLim: String = function.memoryLimit.get + "G"
|
||||||
|
nativeSpecString += " -l mem_free=" + memLim + ",h_vmem=" + memLim
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the priority is set (user specified Int) specify the priority
|
||||||
|
if (function.jobPriority.isDefined) {
|
||||||
|
nativeSpecString += " -p " + function.jobPriority.get
|
||||||
|
}
|
||||||
|
|
||||||
|
gridEngineJob.setNativeSpecification(nativeSpecString)
|
||||||
|
|
||||||
|
// Instead of running the function.commandLine, run "sh <jobScript>"
|
||||||
|
gridEngineJob.setRemoteCommand("sh")
|
||||||
|
gridEngineJob.setArgs(Collections.singletonList(jobScript.toString))
|
||||||
|
|
||||||
|
// Allow advanced users to update the request via QFunction.updateJobRun()
|
||||||
|
updateJobRun(gridEngineJob)
|
||||||
|
|
||||||
|
updateStatus(RunnerStatus.PENDING)
|
||||||
|
|
||||||
|
// Start the job and store the id so it can be killed in tryStop
|
||||||
|
try {
|
||||||
|
Retry.attempt(() => {
|
||||||
|
try {
|
||||||
|
jobId = GridEngineJobRunner.gridEngineSession.runJob(gridEngineJob)
|
||||||
|
} catch {
|
||||||
|
case de: DrmaaException => throw new QException("Unable to submit job: " + de.getLocalizedMessage)
|
||||||
|
}
|
||||||
|
}, 1, 5, 10)
|
||||||
|
} finally {
|
||||||
|
// Prevent memory leaks
|
||||||
|
GridEngineJobRunner.gridEngineSession.deleteJobTemplate(gridEngineJob)
|
||||||
|
}
|
||||||
|
logger.info("Submitted Grid Engine job id: " + jobId)
|
||||||
}
|
}
|
||||||
|
|
||||||
// If a project name is set specify the project name
|
|
||||||
if (function.jobProject != null) {
|
|
||||||
gridEngineJob.projectName = function.jobProject
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the job queue is set specify the job queue
|
|
||||||
if (function.jobQueue != null) {
|
|
||||||
gridEngineJob.queue = function.jobQueue
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the memory limit is set (GB) specify the memory limit
|
|
||||||
if (function.memoryLimit.isDefined) {
|
|
||||||
gridEngineJob.jobMemoryLimit = function.memoryLimit.get + "GB"
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the priority is set (user specified Int) specify the priority
|
|
||||||
if (function.jobPriority.isDefined) {
|
|
||||||
gridEngineJob.jobPriority = function.jobPriority.get
|
|
||||||
}
|
|
||||||
|
|
||||||
// Instead of running the function.commandLine, run "sh <jobScript>"
|
|
||||||
gridEngineJob.command = "sh " + jobScript
|
|
||||||
|
|
||||||
// Store the status so it can be returned in the status method.
|
|
||||||
myStatus = RunnerStatus.RUNNING
|
|
||||||
|
|
||||||
// Start the job and store the id so it can be killed in tryStop
|
|
||||||
myJobId = gridEngineJob.start()
|
|
||||||
*/
|
|
||||||
|
|
||||||
logger.warn("TODO: implement Grid Engine support")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Return the latest status: RUNNING, FAILED, or DONE
|
def status = this.lastStatus
|
||||||
def status = throw new RuntimeException("TODO: Grid Engine return status such as: " + RunnerStatus.FAILED)
|
|
||||||
|
private def updateStatus(updatedStatus: RunnerStatus.Value) {
|
||||||
|
this.lastStatus = updatedStatus
|
||||||
|
this.lastStatusUpdate = System.currentTimeMillis
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object GridEngineJobRunner extends Logging {
|
object GridEngineJobRunner extends Logging {
|
||||||
|
private val gridEngineSession = SessionFactory.getFactory.getSession
|
||||||
|
|
||||||
|
/** Amount of time a job can go without status before giving up. */
|
||||||
|
private val unknownStatusMaxSeconds = 5 * 60
|
||||||
|
|
||||||
initGridEngine()
|
initGridEngine()
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the Grid Engine library.
|
* Initialize the Grid Engine library.
|
||||||
*/
|
*/
|
||||||
private def initGridEngine() {
|
private def initGridEngine() {
|
||||||
// TODO: Init
|
gridEngineSession.synchronized {
|
||||||
logger.warn("TODO Grid Engine: Initialize here.")
|
try {
|
||||||
|
gridEngineSession.init("")
|
||||||
|
} catch {
|
||||||
|
case de: DrmaaException => throw new QException("init() failed", de)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -104,7 +154,14 @@ object GridEngineJobRunner extends Logging {
|
||||||
* @param runners Runners to update.
|
* @param runners Runners to update.
|
||||||
*/
|
*/
|
||||||
def updateStatus(runners: Set[GridEngineJobRunner]) {
|
def updateStatus(runners: Set[GridEngineJobRunner]) {
|
||||||
// TODO: Bulk update. If not possible this method can be removed here and in GridEngineJobManager.
|
var updatedRunners = Set.empty[GridEngineJobRunner]
|
||||||
|
gridEngineSession.synchronized {
|
||||||
|
runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner})
|
||||||
|
}
|
||||||
|
|
||||||
|
for (runner <- runners.diff(updatedRunners)) {
|
||||||
|
checkUnknownStatus(runner)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -112,6 +169,70 @@ object GridEngineJobRunner extends Logging {
|
||||||
* @param runners Runners to stop.
|
* @param runners Runners to stop.
|
||||||
*/
|
*/
|
||||||
def tryStop(runners: Set[GridEngineJobRunner]) {
|
def tryStop(runners: Set[GridEngineJobRunner]) {
|
||||||
// TODO: Stop runners. SIGTERM(15) is preferred to SIGKILL(9).
|
// Stop runners. SIGTERM(15) is preferred to SIGKILL(9).
|
||||||
|
// Only way to send SIGTERM is for the Sys Admin set the terminate_method
|
||||||
|
// resource of the designated queue to SIGTERM
|
||||||
|
gridEngineSession.synchronized {
|
||||||
|
for (runner <- runners.filterNot(runner => Option(runner.jobId) == None)) {
|
||||||
|
try {
|
||||||
|
gridEngineSession.control(runner.jobId, Session.TERMINATE)
|
||||||
|
} catch {
|
||||||
|
case e =>
|
||||||
|
logger.error("Unable to kill job " + runner.jobId, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gridEngineSession.exit()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def updateRunnerStatus(runner: GridEngineJobRunner): Boolean = {
|
||||||
|
var returnStatus: RunnerStatus.Value = null
|
||||||
|
|
||||||
|
try {
|
||||||
|
val jobStatus = gridEngineSession.getJobProgramStatus(runner.jobId);
|
||||||
|
jobStatus match {
|
||||||
|
case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.PENDING
|
||||||
|
case Session.DONE =>
|
||||||
|
val jobInfo: JobInfo = gridEngineSession.wait(runner.jobId, Session.TIMEOUT_NO_WAIT)
|
||||||
|
if ((jobInfo.hasExited && jobInfo.getExitStatus > 0)
|
||||||
|
|| jobInfo.hasSignaled
|
||||||
|
|| jobInfo.wasAborted)
|
||||||
|
returnStatus = RunnerStatus.FAILED
|
||||||
|
else
|
||||||
|
returnStatus = RunnerStatus.DONE
|
||||||
|
case Session.FAILED => returnStatus = RunnerStatus.FAILED
|
||||||
|
case Session.UNDETERMINED => logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId)
|
||||||
|
case _ => returnStatus = RunnerStatus.RUNNING
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// getJobProgramStatus will throw an exception once wait has run, as the
|
||||||
|
// job will be reaped. If the status is currently DONE or FAILED, return
|
||||||
|
// the status.
|
||||||
|
case de: DrmaaException =>
|
||||||
|
if (runner.lastStatus == RunnerStatus.DONE || runner.lastStatus == RunnerStatus.FAILED)
|
||||||
|
returnStatus = runner.lastStatus
|
||||||
|
else
|
||||||
|
logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de)
|
||||||
|
}
|
||||||
|
|
||||||
|
Option(returnStatus) match {
|
||||||
|
case Some(returnStatus) =>
|
||||||
|
runner.updateStatus(returnStatus)
|
||||||
|
return true
|
||||||
|
case None => return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def checkUnknownStatus(runner: GridEngineJobRunner) {
|
||||||
|
val unknownStatusSeconds = (System.currentTimeMillis - runner.lastStatusUpdate)
|
||||||
|
if (unknownStatusSeconds > (unknownStatusMaxSeconds * 1000L)) {
|
||||||
|
// Unknown status has been returned for a while now.
|
||||||
|
runner.updateStatus(RunnerStatus.FAILED)
|
||||||
|
logger.error("Unable to read Grid Engine status for %d minutes: job id %d: %s".format(unknownStatusSeconds/60, runner.jobId, runner.function.description))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
override def finalize() {
|
||||||
|
gridEngineSession.exit()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue