diff --git a/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java index c301fffd5..1beb8fb86 100644 --- a/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java +++ b/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java @@ -5050,55 +5050,6 @@ public class LibBat { } - /** - * HACK: A version of the submit structure without autoread, so that - * jobInfoEnt doesn't try to populate the structure on return from lsb_readjobinfo. - * There are several reports of kernel crashes in strlen after a call to lsb_readjobinfo during the autoRead(). - * - * Example: - - Current thread (0x0000000050efd800): JavaThread "main" [_thread_in_native, id=22268, stack(0x0000000040dbf000,0x0000000040ec0000)] - - siginfo:si_signo=SIGSEGV: si_errno=0, si_code=128 (), si_addr=0x0000000000000000 - - Stack: [0x0000000040dbf000,0x0000000040ec0000], sp=0x0000000040ebc018, free space=3f40000000000000018k - Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) - C [libc.so.6+0x797c0] strlen+0x10 - j com.sun.jna.Pointer._getString(JZ)Ljava/lang/String;+0 - j com.sun.jna.Pointer.getString(JZ)Ljava/lang/String;+7 - j com.sun.jna.Pointer.getString(J)Ljava/lang/String;+90 - j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+630 - j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168 - j com.sun.jna.Structure.read()V+82 - j com.sun.jna.Structure.autoRead()V+8 - j com.sun.jna.Structure.updateStructureByReference(Ljava/lang/Class;Lcom/sun/jna/Structure;Lcom/sun/jna/Pointer;)Lcom/sun/jna/Structure;+68 - j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+74 - j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168 - j com.sun.jna.Structure.read()V+82 - j com.sun.jna.Structure.autoRead()V+8 - v ~StubRoutines::call_stub - V [libjvm.so+0x3e756d] - V [libjvm.so+0x5f6f59] - V [libjvm.so+0x3e73a5] - V [libjvm.so+0x420904] - V [libjvm.so+0x400ea5] - C [jna1670124220621463742.tmp+0x6feb] newJavaStructure+0xdb - C [jna1670124220621463742.tmp+0xb919] - C [jna1670124220621463742.tmp+0x11008] ffi_closure_unix64_inner+0x88 - C [jna1670124220621463742.tmp+0x11438] ffi_closure_unix64+0x46 - j org.broadinstitute.sting.queue.engine.Lsf706JobRunner.status()Lscala/Enumeration$Value;+36 - j org.broadinstitute.sting.queue.engine.FunctionEdge.status()Lscala/Enumeration$Value;+72 - j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Lorg/broadinstitute/sting/queue/engine/QEdge;)Z+44 - j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+5 - - * Because the error is in the second level call to autoRead(), and also in a structure that has a String, we are assuming - * that the error is on the submit structure even though this problem is very hard to reproduce consistently at the moment. - */ - public static class submitWithoutAutoRead extends submit { - public submitWithoutAutoRead() { - this.setAutoRead(false); - } - } /** @@ -6145,7 +6096,6 @@ public class LibBat { /** * \brief job information entry. - * HACK: The submit value in this structure currently has autoRead() set to false as a possible workaround for a SIGSEGV error. */ public static class jobInfoEnt extends Structure { public static class ByReference extends jobInfoEnt implements Structure.ByReference { @@ -6292,9 +6242,8 @@ public class LibBat { /** * < Structure for \ref lsb_submit call. - * HACK: Use a structure that has the same size, but has autoRead turned off. Hopes to work around kernel SIGSEGV. */ - public submitWithoutAutoRead submit; + public submit submit; /** * < Job exit status. diff --git a/scala/src/org/broadinstitute/sting/queue/engine/Lsf706JobRunner.scala b/scala/src/org/broadinstitute/sting/queue/engine/Lsf706JobRunner.scala index 55680081a..149f4f040 100644 --- a/scala/src/org/broadinstitute/sting/queue/engine/Lsf706JobRunner.scala +++ b/scala/src/org/broadinstitute/sting/queue/engine/Lsf706JobRunner.scala @@ -98,20 +98,20 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR * Updates and returns the status. */ def status = { - var jobStatus = LibBat.JOB_STAT_NULL + var jobStatus = LibBat.JOB_STAT_UNKWN var exitStatus = 0 var exitInfo = 0 var endTime: NativeLong = null - LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB) + val result = LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB) + if (result < 0) + throw new QException(LibBat.lsb_sperror("Unable to open LSF job info for job id: " + jobId)) try { - val jobInfo = LibBat.lsb_readjobinfo(null) - if (jobInfo == null) { - jobStatus = LibBat.JOB_STAT_UNKWN - exitStatus = 0 - exitInfo = 0 - endTime = null - } else { + if (result > 0) { + val more = new IntByReference(result) + val jobInfo = LibBat.lsb_readjobinfo(more) + if (jobInfo == null) + throw new QException(LibBat.lsb_sperror("lsb_readjobinfo returned null for job id: " + jobId)) jobStatus = jobInfo.status exitStatus = jobInfo.exitStatus exitInfo = jobInfo.exitInfo @@ -202,7 +202,7 @@ object Lsf706JobRunner extends Logging { val numQueues = new IntByReference(1) val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0) if (queueInfo == null) - throw new QException("Unable to get LSF queue info for the default queue.") + throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue")) defaultQueue = queueInfo.queue val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN) queueRlimitRun += defaultQueue -> limit @@ -217,7 +217,7 @@ object Lsf706JobRunner extends Logging { val numQueues = new IntByReference(1) val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0) if (queueInfo == null) - throw new QException("Unable to get LSF queue info for queue: " + queue) + throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue)) val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN) queueRlimitRun += queue -> limit limit