Memory limits changed from Int to Double.
Updated LSF calls to read memory units from config along with tweaks to select hosts. Moved some common code from GridEngine and LSF to super classes.
This commit is contained in:
parent
15610ce0c3
commit
59eb1f4663
|
|
@ -34,7 +34,6 @@ import org.testng.annotations.Test;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.*;
|
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.*;
|
||||||
|
|
||||||
import javax.jws.soap.SOAPBinding;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -55,25 +54,25 @@ public class LibBatIntegrationTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReadConfEnv() {
|
public void testReadConfEnv() {
|
||||||
LibLsf.config_param[] unitsParam = (LibLsf.config_param[]) new LibLsf.config_param().toArray(4);
|
LibLsf.config_param[] configParams = (LibLsf.config_param[]) new LibLsf.config_param().toArray(4);
|
||||||
|
|
||||||
unitsParam[0].paramName = "LSF_UNIT_FOR_LIMITS";
|
configParams[0].paramName = "LSF_UNIT_FOR_LIMITS";
|
||||||
unitsParam[1].paramName = "LSF_CONFDIR";
|
configParams[1].paramName = "LSF_CONFDIR";
|
||||||
unitsParam[2].paramName = "MADE_UP_PARAMETER";
|
configParams[2].paramName = "MADE_UP_PARAMETER";
|
||||||
|
|
||||||
Structure.autoWrite(unitsParam);
|
Structure.autoWrite(configParams);
|
||||||
|
|
||||||
if (LibLsf.ls_readconfenv(unitsParam[0], null) != 0) {
|
if (LibLsf.ls_readconfenv(configParams[0], null) != 0) {
|
||||||
Assert.fail(LibLsf.ls_sysmsg());
|
Assert.fail(LibLsf.ls_sysmsg());
|
||||||
}
|
}
|
||||||
|
|
||||||
Structure.autoRead(unitsParam);
|
Structure.autoRead(configParams);
|
||||||
|
|
||||||
System.out.println("LSF_UNIT_FOR_LIMITS: " + unitsParam[0].paramValue);
|
System.out.println("LSF_UNIT_FOR_LIMITS: " + configParams[0].paramValue);
|
||||||
Assert.assertNotNull(unitsParam[1].paramValue);
|
Assert.assertNotNull(configParams[1].paramValue);
|
||||||
Assert.assertNull(unitsParam[2].paramValue);
|
Assert.assertNull(configParams[2].paramValue);
|
||||||
Assert.assertNull(unitsParam[3].paramName);
|
Assert.assertNull(configParams[3].paramName);
|
||||||
Assert.assertNull(unitsParam[3].paramValue);
|
Assert.assertNull(configParams[3].paramValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ class QSettings {
|
||||||
var jobPriority: Option[Int] = None
|
var jobPriority: Option[Int] = None
|
||||||
|
|
||||||
@Argument(fullName="default_memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false)
|
@Argument(fullName="default_memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false)
|
||||||
var memoryLimit: Option[Int] = None
|
var memoryLimit: Option[Double] = None
|
||||||
|
|
||||||
@Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false)
|
@Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false)
|
||||||
var runDirectory = new File(".")
|
var runDirectory = new File(".")
|
||||||
|
|
|
||||||
|
|
@ -33,12 +33,29 @@ import org.broadinstitute.sting.queue.util.{Logging, IOUtils}
|
||||||
*/
|
*/
|
||||||
trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging {
|
trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging {
|
||||||
|
|
||||||
|
/** The string representation of the identifier of the running job. */
|
||||||
|
def jobIdString: String = null
|
||||||
|
|
||||||
/** A generated exec shell script. */
|
/** A generated exec shell script. */
|
||||||
protected var jobScript: File = _
|
protected var jobScript: File = _
|
||||||
|
|
||||||
/** Which directory to use for the job status files. */
|
/** Which directory to use for the job status files. */
|
||||||
protected def jobStatusDir = function.jobTempDir
|
protected def jobStatusDir = function.jobTempDir
|
||||||
|
|
||||||
|
/** Amount of time a job can go without status before giving up. */
|
||||||
|
private val unknownStatusMaxSeconds = 5 * 60
|
||||||
|
|
||||||
|
/** Last known status */
|
||||||
|
protected var lastStatus: RunnerStatus.Value = _
|
||||||
|
|
||||||
|
/** The last time the status was updated */
|
||||||
|
protected var lastStatusUpdate: Long = _
|
||||||
|
|
||||||
|
final override def status = this.lastStatus
|
||||||
|
|
||||||
|
def residentRequestMB: Option[Double] = function.memoryLimit.map(_ * 1024)
|
||||||
|
def residentLimitMB: Option[Double] = residentRequestMB.map( _ * 1.2 )
|
||||||
|
|
||||||
override def init() {
|
override def init() {
|
||||||
super.init()
|
super.init()
|
||||||
var exec = new StringBuilder
|
var exec = new StringBuilder
|
||||||
|
|
@ -53,7 +70,21 @@ trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging {
|
||||||
}
|
}
|
||||||
exec.append(function.commandLine)
|
exec.append(function.commandLine)
|
||||||
|
|
||||||
this.jobScript = IOUtils.writeTempFile(exec.toString, ".exec", "", jobStatusDir)
|
this.jobScript = IOUtils.writeTempFile(exec.toString(), ".exec", "", jobStatusDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def updateStatus(updatedStatus: RunnerStatus.Value) {
|
||||||
|
this.lastStatus = updatedStatus
|
||||||
|
this.lastStatusUpdate = System.currentTimeMillis
|
||||||
|
}
|
||||||
|
|
||||||
|
override def checkUnknownStatus() {
|
||||||
|
val unknownStatusMillis = (System.currentTimeMillis - lastStatusUpdate)
|
||||||
|
if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) {
|
||||||
|
// Unknown status has been returned for a while now.
|
||||||
|
updateStatus(RunnerStatus.FAILED)
|
||||||
|
logger.error("Unable to read status for %0.2f minutes: job id %d: %s".format(unknownStatusMillis/(60 * 1000D), jobIdString, function.description))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
override def cleanup() {
|
override def cleanup() {
|
||||||
|
|
|
||||||
|
|
@ -44,9 +44,9 @@ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] {
|
||||||
/**
|
/**
|
||||||
* Updates the status on a list of functions.
|
* Updates the status on a list of functions.
|
||||||
* @param runners Runners to update.
|
* @param runners Runners to update.
|
||||||
|
* @return runners which were updated.
|
||||||
*/
|
*/
|
||||||
def updateStatus(runners: Set[TRunner]) {
|
def updateStatus(runners: Set[TRunner]): Set[TRunner] = Set.empty
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stops a list of functions.
|
* Stops a list of functions.
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,11 @@ trait JobRunner[TFunction <: QFunction] {
|
||||||
*/
|
*/
|
||||||
def status: RunnerStatus.Value
|
def status: RunnerStatus.Value
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the status has been unknown for an extended period of time.
|
||||||
|
*/
|
||||||
|
def checkUnknownStatus() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the function to be run.
|
* Returns the function to be run.
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -1005,7 +1005,10 @@ class QGraph extends Logging {
|
||||||
.asInstanceOf[Set[JobRunner[QFunction]]]
|
.asInstanceOf[Set[JobRunner[QFunction]]]
|
||||||
if (managerRunners.size > 0)
|
if (managerRunners.size > 0)
|
||||||
try {
|
try {
|
||||||
manager.updateStatus(managerRunners)
|
val updatedRunners = manager.updateStatus(managerRunners)
|
||||||
|
for (runner <- managerRunners.diff(updatedRunners)) {
|
||||||
|
runner.checkUnknownStatus()
|
||||||
|
}
|
||||||
} catch {
|
} catch {
|
||||||
case e => /* ignore */
|
case e => /* ignore */
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -40,12 +40,7 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine
|
||||||
|
|
||||||
/** Job Id of the currently executing job. */
|
/** Job Id of the currently executing job. */
|
||||||
private var jobId: String = _
|
private var jobId: String = _
|
||||||
|
override def jobIdString = jobId
|
||||||
/** Last known status */
|
|
||||||
private var lastStatus: RunnerStatus.Value = _
|
|
||||||
|
|
||||||
/** The last time the status was updated */
|
|
||||||
protected var lastStatusUpdate: Long = _
|
|
||||||
|
|
||||||
def start() {
|
def start() {
|
||||||
GridEngineJobRunner.gridEngineSession.synchronized {
|
GridEngineJobRunner.gridEngineSession.synchronized {
|
||||||
|
|
@ -82,11 +77,14 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine
|
||||||
nativeSpecString += " -q " + function.jobQueue
|
nativeSpecString += " -q " + function.jobQueue
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the memory limit is set (GB) specify the memory limit
|
// If the resident set size is requested pass on the memory request
|
||||||
if (function.memoryLimit.isDefined) {
|
if (residentRequestMB.isDefined) {
|
||||||
val memAvl: String = function.memoryLimit.get + "G"
|
nativeSpecString += " -l mem_free=%dM".format(residentRequestMB.get.ceil.toInt)
|
||||||
val memMax: String = (function.memoryLimit.get * 1.2 * 1024).ceil.toInt + "M"
|
}
|
||||||
nativeSpecString += " -l mem_free=" + memAvl + ",h_rss=" + memMax
|
|
||||||
|
// If the resident set size limit is defined specify the memory limit
|
||||||
|
if (residentLimitMB.isDefined) {
|
||||||
|
nativeSpecString += " -l h_rss=%dM".format(residentLimitMB.get.ceil.toInt)
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the priority is set (user specified Int) specify the priority
|
// If the priority is set (user specified Int) specify the priority
|
||||||
|
|
@ -121,21 +119,11 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine
|
||||||
logger.info("Submitted Grid Engine job id: " + jobId)
|
logger.info("Submitted Grid Engine job id: " + jobId)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def status = this.lastStatus
|
|
||||||
|
|
||||||
private def updateStatus(updatedStatus: RunnerStatus.Value) {
|
|
||||||
this.lastStatus = updatedStatus
|
|
||||||
this.lastStatusUpdate = System.currentTimeMillis
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object GridEngineJobRunner extends Logging {
|
object GridEngineJobRunner extends Logging {
|
||||||
private val gridEngineSession = SessionFactory.getFactory.getSession
|
private val gridEngineSession = SessionFactory.getFactory.getSession
|
||||||
|
|
||||||
/** Amount of time a job can go without status before giving up. */
|
|
||||||
private val unknownStatusMaxSeconds = 5 * 60
|
|
||||||
|
|
||||||
initGridEngine()
|
initGridEngine()
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -156,16 +144,14 @@ object GridEngineJobRunner extends Logging {
|
||||||
/**
|
/**
|
||||||
* Updates the status of a list of jobs.
|
* Updates the status of a list of jobs.
|
||||||
* @param runners Runners to update.
|
* @param runners Runners to update.
|
||||||
|
* @return runners which were updated.
|
||||||
*/
|
*/
|
||||||
def updateStatus(runners: Set[GridEngineJobRunner]) {
|
def updateStatus(runners: Set[GridEngineJobRunner]) = {
|
||||||
var updatedRunners = Set.empty[GridEngineJobRunner]
|
var updatedRunners = Set.empty[GridEngineJobRunner]
|
||||||
gridEngineSession.synchronized {
|
gridEngineSession.synchronized {
|
||||||
runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner})
|
runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner})
|
||||||
}
|
}
|
||||||
|
updatedRunners
|
||||||
for (runner <- runners.diff(updatedRunners)) {
|
|
||||||
checkUnknownStatus(runner)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -219,20 +205,11 @@ object GridEngineJobRunner extends Logging {
|
||||||
logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de)
|
logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de)
|
||||||
}
|
}
|
||||||
|
|
||||||
Option(returnStatus) match {
|
if (returnStatus != null) {
|
||||||
case Some(returnStatus) =>
|
runner.updateStatus(returnStatus)
|
||||||
runner.updateStatus(returnStatus)
|
true
|
||||||
return true
|
} else {
|
||||||
case None => return false
|
false
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private def checkUnknownStatus(runner: GridEngineJobRunner) {
|
|
||||||
val unknownStatusSeconds = (System.currentTimeMillis - runner.lastStatusUpdate)
|
|
||||||
if (unknownStatusSeconds > (unknownStatusMaxSeconds * 1000L)) {
|
|
||||||
// Unknown status has been returned for a while now.
|
|
||||||
runner.updateStatus(RunnerStatus.FAILED)
|
|
||||||
logger.error("Unable to read Grid Engine status for %d minutes: job id %d: %s".format(unknownStatusSeconds/60, runner.jobId, runner.function.description))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,6 @@ class Lsf706JobManager extends CommandLineJobManager[Lsf706JobRunner] {
|
||||||
def runnerType = classOf[Lsf706JobRunner]
|
def runnerType = classOf[Lsf706JobRunner]
|
||||||
def create(function: CommandLineFunction) = new Lsf706JobRunner(function)
|
def create(function: CommandLineFunction) = new Lsf706JobRunner(function)
|
||||||
|
|
||||||
override def updateStatus(runners: Set[Lsf706JobRunner]) { Lsf706JobRunner.updateStatus(runners) }
|
override def updateStatus(runners: Set[Lsf706JobRunner]) = { Lsf706JobRunner.updateStatus(runners) }
|
||||||
override def tryStop(runners: Set[Lsf706JobRunner]) { Lsf706JobRunner.tryStop(runners) }
|
override def tryStop(runners: Set[Lsf706JobRunner]) { Lsf706JobRunner.tryStop(runners) }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,8 @@ import org.broadinstitute.sting.utils.Utils
|
||||||
import org.broadinstitute.sting.jna.clibrary.LibC
|
import org.broadinstitute.sting.jna.clibrary.LibC
|
||||||
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit}
|
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit}
|
||||||
import com.sun.jna.ptr.IntByReference
|
import com.sun.jna.ptr.IntByReference
|
||||||
import com.sun.jna.{StringArray, NativeLong}
|
|
||||||
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
|
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
|
||||||
|
import com.sun.jna.{Structure, StringArray, NativeLong}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs jobs on an LSF compute cluster.
|
* Runs jobs on an LSF compute cluster.
|
||||||
|
|
@ -45,12 +45,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR
|
||||||
|
|
||||||
/** Job Id of the currently executing job. */
|
/** Job Id of the currently executing job. */
|
||||||
private var jobId = -1L
|
private var jobId = -1L
|
||||||
|
override def jobIdString = jobId.toString
|
||||||
/** Last known status */
|
|
||||||
private var lastStatus: RunnerStatus.Value = _
|
|
||||||
|
|
||||||
/** The last time the status was updated */
|
|
||||||
protected var lastStatusUpdate: Long = _
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dispatches the function on the LSF cluster.
|
* Dispatches the function on the LSF cluster.
|
||||||
|
|
@ -85,12 +80,19 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR
|
||||||
request.options |= LibBat.SUB_QUEUE
|
request.options |= LibBat.SUB_QUEUE
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the memory limit is set (GB) specify the memory limit
|
// If the resident set size is requested pass on the memory request
|
||||||
if (function.memoryLimit.isDefined) {
|
if (residentRequestMB.isDefined) {
|
||||||
request.resReq = "rusage[mem=" + function.memoryLimit.get + "]"
|
val memInUnits = Lsf706JobRunner.convertUnits(residentRequestMB.get)
|
||||||
|
request.resReq = "select[mem>%1$d] rusage[mem=%1$d]".format(memInUnits)
|
||||||
request.options |= LibBat.SUB_RES_REQ
|
request.options |= LibBat.SUB_RES_REQ
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the resident set size limit is defined specify the memory limit
|
||||||
|
if (residentLimitMB.isDefined) {
|
||||||
|
val memInUnits = Lsf706JobRunner.convertUnits(residentLimitMB.get)
|
||||||
|
request.rLimits(LibLsf.LSF_RLIMIT_RSS) = memInUnits
|
||||||
|
}
|
||||||
|
|
||||||
// If the priority is set (user specified Int) specify the priority
|
// If the priority is set (user specified Int) specify the priority
|
||||||
if (function.jobPriority.isDefined) {
|
if (function.jobPriority.isDefined) {
|
||||||
request.userPriority = function.jobPriority.get
|
request.userPriority = function.jobPriority.get
|
||||||
|
|
@ -122,11 +124,13 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def status = this.lastStatus
|
override def checkUnknownStatus() {
|
||||||
|
// TODO: Need a second pass through either of the two archive logs using lsb_geteventrecbyline() for disappeared jobs.
|
||||||
private def updateStatus(updatedStatus: RunnerStatus.Value) {
|
// Can also tell if we wake up and the last time we saw status was greater than lsb_parameterinfo().cleanPeriod
|
||||||
this.lastStatus = updatedStatus
|
// LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct)
|
||||||
this.lastStatusUpdate = System.currentTimeMillis
|
// LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist)
|
||||||
|
logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(jobId))
|
||||||
|
super.checkUnknownStatus()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -137,17 +141,8 @@ object Lsf706JobRunner extends Logging {
|
||||||
/** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. */
|
/** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. */
|
||||||
private val retryExpiredSeconds = 5 * 60
|
private val retryExpiredSeconds = 5 * 60
|
||||||
|
|
||||||
/** Amount of time a job can go without status before giving up. */
|
|
||||||
private val unknownStatusMaxSeconds = 5 * 60
|
|
||||||
|
|
||||||
initLsf()
|
initLsf()
|
||||||
|
|
||||||
/** The name of the default queue. */
|
|
||||||
private var defaultQueue: String = _
|
|
||||||
|
|
||||||
/** The run limits for each queue. */
|
|
||||||
private var queueRlimitRun = Map.empty[String,Int]
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the Lsf library.
|
* Initialize the Lsf library.
|
||||||
*/
|
*/
|
||||||
|
|
@ -161,8 +156,9 @@ object Lsf706JobRunner extends Logging {
|
||||||
/**
|
/**
|
||||||
* Bulk updates job statuses.
|
* Bulk updates job statuses.
|
||||||
* @param runners Runners to update.
|
* @param runners Runners to update.
|
||||||
|
* @return runners which were updated.
|
||||||
*/
|
*/
|
||||||
def updateStatus(runners: Set[Lsf706JobRunner]) {
|
def updateStatus(runners: Set[Lsf706JobRunner]) = {
|
||||||
var updatedRunners = Set.empty[Lsf706JobRunner]
|
var updatedRunners = Set.empty[Lsf706JobRunner]
|
||||||
|
|
||||||
Lsf706JobRunner.lsfLibLock.synchronized {
|
Lsf706JobRunner.lsfLibLock.synchronized {
|
||||||
|
|
@ -192,70 +188,7 @@ object Lsf706JobRunner extends Logging {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (runner <- runners.diff(updatedRunners)) {
|
updatedRunners
|
||||||
checkUnknownStatus(runner)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tries to stop any running jobs.
|
|
||||||
* @param runners Runners to stop.
|
|
||||||
*/
|
|
||||||
def tryStop(runners: Set[Lsf706JobRunner]) {
|
|
||||||
lsfLibLock.synchronized {
|
|
||||||
// lsb_killbulkjobs does not seem to forward SIGTERM,
|
|
||||||
// only SIGKILL, so send the Ctrl-C (SIGTERM) one by one.
|
|
||||||
for (runner <- runners.filterNot(_.jobId < 0)) {
|
|
||||||
try {
|
|
||||||
if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0)
|
|
||||||
logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId))
|
|
||||||
} catch {
|
|
||||||
case e =>
|
|
||||||
logger.error("Unable to kill job " + runner.jobId, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the run limit in seconds for the queue.
|
|
||||||
* If the queue name is null returns the length of the default queue.
|
|
||||||
* @param queue Name of the queue or null for the default queue.
|
|
||||||
* @return the run limit in seconds for the queue.
|
|
||||||
*/
|
|
||||||
private def getRlimitRun(queue: String) = {
|
|
||||||
lsfLibLock.synchronized {
|
|
||||||
if (queue == null) {
|
|
||||||
if (defaultQueue != null) {
|
|
||||||
queueRlimitRun(defaultQueue)
|
|
||||||
} else {
|
|
||||||
// Get the info on the default queue.
|
|
||||||
val numQueues = new IntByReference(1)
|
|
||||||
val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0)
|
|
||||||
if (queueInfo == null)
|
|
||||||
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue"))
|
|
||||||
defaultQueue = queueInfo.queue
|
|
||||||
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
|
|
||||||
queueRlimitRun += defaultQueue -> limit
|
|
||||||
limit
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
queueRlimitRun.get(queue) match {
|
|
||||||
case Some(limit) => limit
|
|
||||||
case None =>
|
|
||||||
// Cache miss. Go get the run limits from LSF.
|
|
||||||
val queues = new StringArray(Array[String](queue))
|
|
||||||
val numQueues = new IntByReference(1)
|
|
||||||
val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0)
|
|
||||||
if (queueInfo == null)
|
|
||||||
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue))
|
|
||||||
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
|
|
||||||
queueRlimitRun += queue -> limit
|
|
||||||
limit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private def updateRunnerStatus(runner: Lsf706JobRunner, jobInfo: LibBat.jobInfoEnt) {
|
private def updateRunnerStatus(runner: Lsf706JobRunner, jobInfo: LibBat.jobInfoEnt) {
|
||||||
|
|
@ -280,20 +213,6 @@ object Lsf706JobRunner extends Logging {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def checkUnknownStatus(runner: Lsf706JobRunner) {
|
|
||||||
// TODO: Need a second pass through either of the two archive logs using lsb_geteventrecbyline() for disappeared jobs.
|
|
||||||
// Can also tell if we wake up and the last time we saw status was greater than lsb_parameterinfo().cleanPeriod
|
|
||||||
// LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct)
|
|
||||||
// LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist)
|
|
||||||
logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(runner.jobId))
|
|
||||||
val unknownStatusMillis = (System.currentTimeMillis - runner.lastStatusUpdate)
|
|
||||||
if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) {
|
|
||||||
// Unknown status has been returned for a while now.
|
|
||||||
runner.updateStatus(RunnerStatus.FAILED)
|
|
||||||
logger.error("Unable to read LSF status for %0.2f minutes: job id %d: %s".format(unknownStatusMillis/(60 * 1000D), runner.jobId, runner.function.description))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true if LSF is expected to retry running the function.
|
* Returns true if LSF is expected to retry running the function.
|
||||||
* @param exitInfo The reason the job exited.
|
* @param exitInfo The reason the job exited.
|
||||||
|
|
@ -309,4 +228,86 @@ object Lsf706JobRunner extends Logging {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tries to stop any running jobs.
|
||||||
|
* @param runners Runners to stop.
|
||||||
|
*/
|
||||||
|
def tryStop(runners: Set[Lsf706JobRunner]) {
|
||||||
|
lsfLibLock.synchronized {
|
||||||
|
// lsb_killbulkjobs does not seem to forward SIGTERM,
|
||||||
|
// only SIGKILL, so send the Ctrl-C (SIGTERM) one by one.
|
||||||
|
for (runner <- runners.filterNot(_.jobId < 0)) {
|
||||||
|
try {
|
||||||
|
if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0)
|
||||||
|
logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId))
|
||||||
|
} catch {
|
||||||
|
case e =>
|
||||||
|
logger.error("Unable to kill job " + runner.jobId, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The name of the default queue. */
|
||||||
|
private lazy val defaultQueue: String = {
|
||||||
|
lsfLibLock.synchronized {
|
||||||
|
val numQueues = new IntByReference(1)
|
||||||
|
val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0)
|
||||||
|
if (queueInfo == null)
|
||||||
|
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue"))
|
||||||
|
queueInfo.queue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The run limits for each queue. */
|
||||||
|
private var queueRlimitRun = Map.empty[String,Int]
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the run limit in seconds for the queue.
|
||||||
|
* If the queue name is null returns the length of the default queue.
|
||||||
|
* @param queue Name of the queue or null for the default queue.
|
||||||
|
* @return the run limit in seconds for the queue.
|
||||||
|
*/
|
||||||
|
private def getRlimitRun(queueName: String) = {
|
||||||
|
lsfLibLock.synchronized {
|
||||||
|
val queue = if (queueName == null) defaultQueue else queueName
|
||||||
|
queueRlimitRun.get(queue) match {
|
||||||
|
case Some(limit) => limit
|
||||||
|
case None =>
|
||||||
|
// Cache miss. Go get the run limits from LSF.
|
||||||
|
val queues = new StringArray(Array(queue))
|
||||||
|
val numQueues = new IntByReference(1)
|
||||||
|
val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0)
|
||||||
|
if (queueInfo == null)
|
||||||
|
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue))
|
||||||
|
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
|
||||||
|
queueRlimitRun += queue -> limit
|
||||||
|
limit
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private lazy val unitDivisor: Double = {
|
||||||
|
lsfLibLock.synchronized {
|
||||||
|
val unitsParam: Array[LibLsf.config_param] = new LibLsf.config_param().toArray(2).asInstanceOf[Array[LibLsf.config_param]]
|
||||||
|
unitsParam(0).paramName = "LSF_UNIT_FOR_LIMITS"
|
||||||
|
|
||||||
|
Structure.autoWrite(unitsParam.asInstanceOf[Array[Structure]])
|
||||||
|
if (LibLsf.ls_readconfenv(unitsParam(0), null) != 0)
|
||||||
|
throw new QException(LibBat.lsb_sperror("ls_readconfenv() failed"))
|
||||||
|
Structure.autoRead(unitsParam.asInstanceOf[Array[Structure]])
|
||||||
|
|
||||||
|
unitsParam(0).paramValue match {
|
||||||
|
case "MB" => 1D
|
||||||
|
case "GB" => 1024D
|
||||||
|
case "TB" => 1024D * 1024
|
||||||
|
case "PB" => 1024D * 1024 * 1024
|
||||||
|
case "EB" => 1024D * 1024 * 1024 * 1024
|
||||||
|
case null => 1D
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def convertUnits(mb: Double) = (mb / unitDivisor).ceil.toInt
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -50,10 +50,10 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu
|
||||||
// Allow advanced users to update the job.
|
// Allow advanced users to update the job.
|
||||||
updateJobRun(job)
|
updateJobRun(job)
|
||||||
|
|
||||||
runStatus = RunnerStatus.RUNNING
|
updateStatus(RunnerStatus.RUNNING)
|
||||||
job.run()
|
job.run()
|
||||||
runStatus = RunnerStatus.DONE
|
updateStatus(RunnerStatus.FAILED)
|
||||||
}
|
}
|
||||||
|
|
||||||
def status = runStatus
|
override def checkUnknownStatus() {}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ trait CommandLineFunction extends QFunction with Logging {
|
||||||
def commandLine: String
|
def commandLine: String
|
||||||
|
|
||||||
/** Upper memory limit */
|
/** Upper memory limit */
|
||||||
var memoryLimit: Option[Int] = None
|
var memoryLimit: Option[Double] = None
|
||||||
|
|
||||||
/** Job project to run the command */
|
/** Job project to run the command */
|
||||||
var jobProject: String = _
|
var jobProject: String = _
|
||||||
|
|
@ -56,7 +56,7 @@ trait CommandLineFunction extends QFunction with Logging {
|
||||||
if (memoryLimit.isEmpty)
|
if (memoryLimit.isEmpty)
|
||||||
memoryLimit = qSettings.memoryLimit
|
memoryLimit = qSettings.memoryLimit
|
||||||
|
|
||||||
super.freezeFieldValues
|
super.freezeFieldValues()
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
|
||||||
/**
|
/**
|
||||||
* Memory limit for the java executable, or if None will use the default memoryLimit.
|
* Memory limit for the java executable, or if None will use the default memoryLimit.
|
||||||
*/
|
*/
|
||||||
var javaMemoryLimit: Option[Int] = None
|
var javaMemoryLimit: Option[Double] = None
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the java executable to run.
|
* Returns the java executable to run.
|
||||||
|
|
@ -61,8 +61,8 @@ trait JavaCommandLineFunction extends CommandLineFunction {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
override def freezeFieldValues = {
|
override def freezeFieldValues() {
|
||||||
super.freezeFieldValues
|
super.freezeFieldValues()
|
||||||
|
|
||||||
if (javaMemoryLimit.isEmpty && memoryLimit.isDefined)
|
if (javaMemoryLimit.isEmpty && memoryLimit.isDefined)
|
||||||
javaMemoryLimit = memoryLimit
|
javaMemoryLimit = memoryLimit
|
||||||
|
|
@ -72,7 +72,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
def javaOpts = "%s -Djava.io.tmpdir=%s"
|
def javaOpts = "%s -Djava.io.tmpdir=%s"
|
||||||
.format(optional(" -Xmx", javaMemoryLimit, "g"), jobTempDir)
|
.format(optional(" -Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m"), jobTempDir)
|
||||||
|
|
||||||
def commandLine = "java%s %s"
|
def commandLine = "java%s %s"
|
||||||
.format(javaOpts, javaExecutable)
|
.format(javaOpts, javaExecutable)
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
||||||
|
|
||||||
class HelloWorldPipelineTest {
|
class HelloWorldPipelineTest {
|
||||||
@Test
|
@Test
|
||||||
def testHelloWorld {
|
def testHelloWorld() {
|
||||||
val spec = new PipelineTestSpec
|
val spec = new PipelineTestSpec
|
||||||
spec.name = "HelloWorld"
|
spec.name = "HelloWorld"
|
||||||
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala"
|
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala"
|
||||||
|
|
@ -37,15 +37,23 @@ class HelloWorldPipelineTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testHelloWorldWithPrefix {
|
def testHelloWorldWithPrefix() {
|
||||||
val spec = new PipelineTestSpec
|
val spec = new PipelineTestSpec
|
||||||
spec.name = "HelloWorldWithPrefix"
|
spec.name = "HelloWorldWithPrefix"
|
||||||
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld"
|
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld"
|
||||||
PipelineTest.executeTest(spec)
|
PipelineTest.executeTest(spec)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testHelloWorldWithMemoryLimit() {
|
||||||
|
val spec = new PipelineTestSpec
|
||||||
|
spec.name = "HelloWorldWithPrefix"
|
||||||
|
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -memLimit 1.25"
|
||||||
|
PipelineTest.executeTest(spec)
|
||||||
|
}
|
||||||
|
|
||||||
@Test(enabled=false)
|
@Test(enabled=false)
|
||||||
def testHelloWorldWithPriority {
|
def testHelloWorldWithPriority() {
|
||||||
val spec = new PipelineTestSpec
|
val spec = new PipelineTestSpec
|
||||||
spec.name = "HelloWorldWithPriority"
|
spec.name = "HelloWorldWithPriority"
|
||||||
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100"
|
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue