284 lines
9.6 KiB
Scala
284 lines
9.6 KiB
Scala
package org.broadinstitute.sting.queue.engine
|
|
|
|
import java.io.File
|
|
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
|
import org.broadinstitute.sting.queue.util._
|
|
import org.broadinstitute.sting.queue.QException
|
|
import org.broadinstitute.sting.jna.lsf.v7_0_6.{LibLsf, LibBat}
|
|
import org.broadinstitute.sting.utils.Utils
|
|
import org.broadinstitute.sting.jna.clibrary.LibC
|
|
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit}
|
|
import com.sun.jna.ptr.IntByReference
|
|
import com.sun.jna.{StringArray, NativeLong}
|
|
|
|
/**
 * Job runner that dispatches a command line function to an LSF cluster
 * through the LSF 7.0.6 native library bindings (LibBat / LibLsf).
 */
class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobRunner with Logging {

  // Reference the companion object so that its initializer runs.
  Lsf706JobRunner

  /** Id of the job submitted by this runner. Initially -1, then whatever lsb_submit returned. */
  private var jobId = -1L

  /** Status recorded by the most recent call to updateStatus. */
  private var lastStatus: RunnerStatus.Value = _

  /** Value of System.currentTimeMillis when the status was last recorded. */
  protected var lastStatusUpdate: Long = _

  /**
   * Dispatches the function on the LSF cluster.
   *
   * Fills in an LSF submit request from the settings on the function, writes the
   * exec script, marks this runner as RUNNING and then submits the request.
   * The whole sequence runs while holding the shared LSF library lock.
   * A single submit attempt throws a QException when lsb_submit returns a negative job id;
   * the attempt is handed to Retry.attempt.
   */
  def start() = {
    Lsf706JobRunner.lsfLibLock.synchronized {
      val request = new submit

      // Start with every resource limit at the library default.
      (0 until LibLsf.LSF_RLIM_NLIMITS).foreach { index =>
        request.rLimits(index) = LibLsf.DEFAULT_RLIMIT
      }

      // The output file is always set.
      request.outFile = function.jobOutputFile.getPath
      request.options |= LibBat.SUB_OUT_FILE

      // The remaining settings are only applied when the function provides them.
      Option(function.jobErrorFile).foreach { errorFile =>
        request.errFile = errorFile.getPath
        request.options |= LibBat.SUB_ERR_FILE
      }

      Option(function.jobProject).foreach { project =>
        request.projectName = project
        request.options |= LibBat.SUB_PROJECT_NAME
      }

      Option(function.jobQueue).foreach { queueName =>
        request.queue = queueName
        request.options |= LibBat.SUB_QUEUE
      }

      // Only pass a working directory when it differs from the current one.
      val currentDirectory = IOUtils.absolute(new File("."))
      if (currentDirectory != function.commandDirectory) {
        request.cwd = function.commandDirectory.getPath
        request.options3 |= LibBat.SUB3_CWD
      }

      for (memory <- function.memoryLimit) {
        request.resReq = "rusage[mem=" + memory + "]"
        request.options |= LibBat.SUB_RES_REQ
      }

      Option(function.description).foreach { description =>
        // The job name is cut off after 1000 characters.
        request.jobName = description.take(1000)
        request.options |= LibBat.SUB_JOB_NAME
      }

      for (priority <- function.jobPriority) {
        request.userPriority = priority
        request.options2 |= LibBat.SUB2_JOB_PRIORITY
      }

      // Use the run limit of the target queue (or of the default queue when none is set).
      request.rLimits(LibLsf.LSF_RLIMIT_RUN) = Lsf706JobRunner.getRlimitRun(function.jobQueue)

      writeExec()
      request.command = "sh " + exec

      // Allow advanced users to update the request.
      updateJobRun(request)

      updateStatus(RunnerStatus.RUNNING)

      // One submission attempt; a negative job id is reported as a QException.
      def submitOnce(): Unit = {
        val reply = new submitReply
        jobId = LibBat.lsb_submit(request, reply)
        if (jobId < 0)
          throw new QException(LibBat.lsb_sperror("Unable to submit job"))
      }

      Retry.attempt(() => submitOnce(), 1, 5, 10)
      logger.info("Submitted LSF job id: " + jobId)
    }
  }

  /** The status most recently recorded for this runner. */
  def status = lastStatus

  /**
   * Records a new status together with the time at which it was recorded.
   * @param updatedStatus The status to record.
   */
  private def updateStatus(updatedStatus: RunnerStatus.Value) = {
    lastStatus = updatedStatus
    lastStatusUpdate = System.currentTimeMillis
  }
}
|
|
|
|
/**
 * State and helpers shared by all Lsf706JobRunner instances: library initialization,
 * a cache of per-queue run limits, bulk status updates and job termination.
 * Calls into the native LSF library made here are wrapped in lsfLibLock.synchronized.
 */
object Lsf706JobRunner extends Logging {
  /** Lock held while calling into the native LSF library. */
  private val lsfLibLock = new Object

  /** Signal number passed to lsb_signaljob by tryStop. */
  private val SIGTERM = 15

  /** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. */
  private val retryExpiredSeconds = 5 * 60

  /** Amount of time, in seconds, a job can go without status before giving up. */
  private val unknownStatusMaxSeconds = 5 * 60

  // Runs as part of the object initializer; only depends on lsfLibLock declared above.
  init()

  /** The name of the default queue, or null until it has been looked up. */
  private var defaultQueue: String = _

  /** The run limits for each queue, cached by queue name. */
  private var queueRlimitRun = Map.empty[String,Int]

  /**
   * Initialize the Lsf library.
   * Throws a QException when lsb_init reports a failure.
   */
  private def init() = {
    lsfLibLock.synchronized {
      if (LibBat.lsb_init("Queue") < 0)
        throw new QException(LibBat.lsb_sperror("lsb_init() failed"))
    }
  }

  /**
   * Returns the run limit in seconds for the queue.
   * If the queue name is null returns the length of the default queue.
   * Results are cached, so LSF is only asked once per queue name.
   * Throws a QException when LSF returns no information for the queue.
   * @param queue Name of the queue or null for the default queue.
   * @return the run limit in seconds for the queue.
   */
  def getRlimitRun(queue: String): Int = {
    lsfLibLock.synchronized {
      if (queue == null) {
        if (defaultQueue != null) {
          // The default queue was resolved earlier, so its limit is already cached.
          queueRlimitRun(defaultQueue)
        } else {
          // Get the info on the default queue.
          val numQueues = new IntByReference(1)
          val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0)
          if (queueInfo == null)
            throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue"))
          defaultQueue = queueInfo.queue
          val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
          queueRlimitRun += defaultQueue -> limit
          limit
        }
      } else {
        queueRlimitRun.get(queue) match {
          case Some(limit) => limit
          case None =>
            // Cache miss. Go get the run limits from LSF.
            val queues = new StringArray(Array[String](queue))
            val numQueues = new IntByReference(1)
            val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0)
            if (queueInfo == null)
              throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue))
            val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
            queueRlimitRun += queue -> limit
            limit
        }
      }
    }
  }

  /**
   * Updates the status of a list of jobs.
   *
   * Reads the job info records returned by LSF and updates every runner whose job id
   * matches a record. Runners that were not matched (including the case where the job
   * info could not be opened at all) are passed to checkUnknownStatus.
   * @param runners Runners to update.
   */
  def updateStatus(runners: List[Lsf706JobRunner]): Unit = {
    var updatedRunners = List.empty[Lsf706JobRunner]

    Lsf706JobRunner.lsfLibLock.synchronized {
      val result = LibBat.lsb_openjobinfo(0L, null, null, null, null, LibBat.ALL_JOB)
      if (result < 0) {
        logger.error(LibBat.lsb_sperror("Unable to check LSF job info"))
      } else {
        try {
          val more = new IntByReference(result)
          while (more.getValue > 0) {
            val jobInfo = LibBat.lsb_readjobinfo(more)
            if (jobInfo == null) {
              logger.error(LibBat.lsb_sperror("Unable to read LSF job info"))
              // Stop reading; the remaining runners are handled as unknown below.
              more.setValue(0)
            } else {
              runners.find(runner => runner.jobId == jobInfo.jobId) match {
                case Some(runner) =>
                  updateRunnerStatus(runner, jobInfo)
                  // Prepend (constant time); the order is irrelevant for the diff below.
                  updatedRunners ::= runner
                case None => /* not our job */
              }
            }
          }
        } finally {
          LibBat.lsb_closejobinfo()
        }
      }
    }

    for (runner <- runners.diff(updatedRunners)) {
      checkUnknownStatus(runner)
    }
  }

  /**
   * Tries to stop any running jobs.
   * Failures are logged and do not prevent the remaining jobs from being signalled.
   * @param runners Runners to stop.
   */
  def tryStop(runners: List[Lsf706JobRunner]): Unit = {
    lsfLibLock.synchronized {
      // lsb_killbulkjobs does not seem to forward SIGTERM,
      // only SIGKILL, so send SIGTERM to the jobs one by one.
      // Runners with a negative job id are skipped.
      for (runner <- runners.filter(_.jobId >= 0)) {
        try {
          if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0)
            logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId))
        } catch {
          // Deliberately catch everything: stopping is best effort and one failing
          // native call must not keep the other jobs from being signalled.
          case e: Throwable =>
            logger.error("Unable to kill job " + runner.jobId, e)
        }
      }
    }
  }

  /**
   * Translates an LSF job info record into a runner status and records it on the runner.
   * @param runner Runner whose job id matched the record.
   * @param jobInfo Job info record read from LSF.
   */
  private def updateRunnerStatus(runner: Lsf706JobRunner, jobInfo: LibBat.jobInfoEnt): Unit = {
    val jobStatus = jobInfo.status
    val exitStatus = jobInfo.exitStatus
    val exitInfo = jobInfo.exitInfo
    val endTime = jobInfo.endTime

    logger.debug("Job Id %s status / exitStatus / exitInfo: 0x%02x / 0x%02x / 0x%02x".format(runner.jobId, jobStatus, exitStatus, exitInfo))

    runner.updateStatus(
      if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_DONE)) {
        // Done successfully.
        RunnerStatus.DONE
      } else if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_EXIT) && !willRetry(exitInfo, endTime)) {
        // Exited function that (probably) won't be retried.
        RunnerStatus.FAILED
      } else {
        // Note that we still saw the job in the system.
        RunnerStatus.RUNNING
      }
    )
  }

  /**
   * Handles a runner for which no job info was read during a status update.
   * Once the runner has gone without a status update for more than
   * unknownStatusMaxSeconds it is marked as FAILED and an error is logged.
   * @param runner Runner that was not found in the LSF job info.
   */
  private def checkUnknownStatus(runner: Lsf706JobRunner): Unit = {
    // TODO: Need a second pass through either of the two archive logs using lsb_geteventrecbyline() for disappeared jobs.
    // Can also tell if we wake up and the last time we saw status was greater than lsb_parameterinfo().cleanPeriod
    // LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct)
    // LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist)
    logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(runner.jobId))

    // Both timestamps come from System.currentTimeMillis, so the difference is in milliseconds.
    val unknownStatusMillis = System.currentTimeMillis - runner.lastStatusUpdate
    if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) {
      // Unknown status has been returned for a while now.
      runner.updateStatus(RunnerStatus.FAILED)
      // Convert milliseconds to minutes for the message.
      val unknownStatusMinutes = unknownStatusMillis / (60 * 1000L)
      logger.error("Unable to read LSF status for %d minutes: job id %d: %s".format(unknownStatusMinutes, runner.jobId, runner.function.description))
    }
  }

  /**
   * Returns true if LSF is expected to retry running the function.
   * A normal exit is never retried; for any other exit reason a retry is expected
   * as long as no more than retryExpiredSeconds have passed since the job ended.
   * @param exitInfo The reason the job exited.
   * @param endTime The time the job exited.
   * @return true if LSF is expected to retry running the function.
   */
  private def willRetry(exitInfo: Int, endTime: NativeLong): Boolean = {
    exitInfo match {
      case LibBat.EXIT_NORMAL => false
      case _ => {
        val seconds = LibC.difftime(LibC.time(null), endTime)
        (seconds <= retryExpiredSeconds)
      }
    }
  }
}
|