gatk-3.8/scala/src/org/broadinstitute/sting/queue/engine/LsfJobRunner.scala

220 lines
6.9 KiB
Scala

package org.broadinstitute.sting.queue.engine
import java.io.File
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.util._
import org.apache.commons.io.FileUtils
/**
* Runs jobs on an LSF compute cluster.
*/
class LsfJobRunner(val function: CommandLineFunction) extends DispatchJobRunner with Logging {
private var runStatus: RunnerStatus.Value = _
var job: LsfJob = new LsfJob
/** Which directory to use for the job status files. */
private def jobStatusDir = function.jobTempDir
/** A file to look for to validate that the function ran to completion. */
private var jobStatusPath: String = _
/** A temporary job done file to let Queue know that the process ran successfully. */
private lazy val jobDoneFile = new File(jobStatusPath + ".done")
/** A temporary job done file to let Queue know that the process exited with an error. */
private lazy val jobFailFile = new File(jobStatusPath + ".fail")
/** A generated exec shell script. */
private var exec: File = _
/** A generated pre-exec shell script. */
private var preExec: File = _
/** A generated post-exec shell script. */
private var postExec: File = _
/**
* Dispatches the function on the LSF cluster.
* @param function Command to run.
*/
def start() = {
try {
function.mkOutputDirectories()
// job.name = function.jobName TODO: Make setting the job name optional.
job.outputFile = function.jobOutputFile
job.errorFile = function.jobErrorFile
job.project = function.jobProject
job.queue = function.jobQueue
if (IOUtils.absolute(new File(".")) != function.commandDirectory)
job.workingDir = function.commandDirectory
job.extraBsubArgs ++= function.extraArgs
if (function.jobRestartable)
job.extraBsubArgs :+= "-r"
if (function.memoryLimit.isDefined)
job.extraBsubArgs ++= List("-R", "rusage[mem=" + function.memoryLimit.get + "]")
job.name = function.commandLine.take(1000)
exec = writeExec()
job.command = "sh " + exec
preExec = writePreExec()
job.preExecCommand = "sh " + preExec
postExec = writePostExec()
job.postExecCommand = "sh " + postExec
if (logger.isDebugEnabled) {
logger.debug("Starting: " + function.commandDirectory + " > " + job.bsubCommand.mkString(" "))
} else {
logger.info("Starting: " + job.bsubCommand.mkString(" "))
}
function.deleteLogs()
function.deleteOutputs()
runStatus = RunnerStatus.RUNNING
Retry.attempt(() => job.run(), 1, 5, 10)
jobStatusPath = IOUtils.absolute(new File(jobStatusDir, "." + job.bsubJobId)).toString
logger.info("Submitted LSF job id: " + job.bsubJobId)
} catch {
case e =>
runStatus = RunnerStatus.FAILED
try {
removeTemporaryFiles()
function.failOutputs.foreach(_.createNewFile())
writeStackTrace(e)
} catch {
case _ => /* ignore errors in the exception handler */
}
logger.error("Error: " + job.bsubCommand.mkString(" "), e)
}
}
/**
* Updates and returns the status by looking for job status files.
* After the job status files are detected they are cleaned up from
* the file system and the status is cached.
*
* Note, these temporary job status files are currently different from the
* .done files used to determine if a file has been created successfully.
*/
def status = {
try {
if (logger.isDebugEnabled) {
logger.debug("Done %s exists = %s".format(jobDoneFile, jobDoneFile.exists))
logger.debug("Fail %s exists = %s".format(jobFailFile, jobFailFile.exists))
}
if (jobFailFile.exists) {
removeTemporaryFiles()
runStatus = RunnerStatus.FAILED
logger.info("Error: " + job.bsubCommand.mkString(" "))
tailError()
} else if (jobDoneFile.exists) {
removeTemporaryFiles()
runStatus = RunnerStatus.DONE
logger.info("Done: " + job.bsubCommand.mkString(" "))
}
} catch {
case e =>
runStatus = RunnerStatus.FAILED
try {
removeTemporaryFiles()
function.failOutputs.foreach(_.createNewFile())
writeStackTrace(e)
} catch {
case _ => /* ignore errors in the exception handler */
}
logger.error("Error: " + job.bsubCommand.mkString(" "), e)
}
runStatus
}
/**
* Removes all temporary files used for this LSF job.
*/
def removeTemporaryFiles() = {
IOUtils.tryDelete(exec)
IOUtils.tryDelete(preExec)
IOUtils.tryDelete(postExec)
IOUtils.tryDelete(jobDoneFile)
IOUtils.tryDelete(jobFailFile)
}
/**
* Outputs the last lines of the error logs.
*/
private def tailError() = {
val errorFile = if (job.errorFile != null) job.errorFile else job.outputFile
if (FileUtils.waitFor(errorFile, 120)) {
val tailLines = IOUtils.tail(errorFile, 100)
val nl = "%n".format()
logger.error("Last %d lines of %s:%n%s".format(tailLines.size, errorFile, tailLines.mkString(nl)))
} else {
logger.error("Unable to access log file: %s".format(errorFile))
}
}
/**
* Writes an exec file to cleanup any status files and
* optionally mount any automount directories on the node.
* @return the file path to the pre-exec.
*/
private def writeExec() = {
IOUtils.writeTempFile(function.commandLine, ".exec", "", jobStatusDir)
}
/**
* Writes a pre-exec file to cleanup any status files and
* optionally mount any automount directories on the node.
* @return the file path to the pre-exec.
*/
private def writePreExec() = {
val preExec = new StringBuilder
preExec.append("rm -f '%s/'.$LSB_JOBID.done%n".format(jobStatusDir))
function.doneOutputs.foreach(file => preExec.append("rm -f '%s'%n".format(file)))
preExec.append("rm -f '%s/'.$LSB_JOBID.fail%n".format(jobStatusDir))
function.failOutputs.foreach(file => preExec.append("rm -f '%s'%n".format(file)))
mountCommand(function).foreach(command =>
preExec.append("%s%n".format(command)))
IOUtils.writeTempFile(preExec.toString, ".preExec", "", jobStatusDir)
}
/**
* Writes a post-exec file to create the status files.
* @return the file path to the post-exec.
*/
private def writePostExec() = {
val postExec = new StringBuilder
val touchDone = function.doneOutputs.map("touch '%s'%n".format(_)).mkString
val touchFail = function.failOutputs.map("touch '%s'%n".format(_)).mkString
postExec.append("""|
|if [ "${LSB_JOBPEND:-unset}" != "unset" ]; then
| exit 0
|fi
|
|JOB_STAT_ROOT='%s/'.$LSB_JOBID
|if [ "$LSB_JOBEXIT_STAT" == "0" ]; then
|%stouch "$JOB_STAT_ROOT".done
|else
|%stouch "$JOB_STAT_ROOT".fail
|fi
|""".stripMargin.format(jobStatusDir, touchDone, touchFail))
IOUtils.writeTempFile(postExec.toString, ".postExec", "", jobStatusDir)
}
}