After code review with APSG, trying a patch for the SIGSEGV errors: the patch checks the LSF result code returned by lsb_openjobinfo instead of checking for a null return value from lsb_readjobinfo.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5220 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-02-08 21:08:22 +00:00
parent f3de9ee3e0
commit a8ab5a5fb9
2 changed files with 12 additions and 63 deletions

View File

@ -5050,55 +5050,6 @@ public class LibBat {
}
/**
* HACK: A version of the submit structure without autoread, so that
* jobInfoEnt doesn't try to populate the structure on return from lsb_readjobinfo.
* There are several reports of the JVM crashing with a SIGSEGV in strlen during autoRead(), after a call to lsb_readjobinfo.
*
* Example:
Current thread (0x0000000050efd800): JavaThread "main" [_thread_in_native, id=22268, stack(0x0000000040dbf000,0x0000000040ec0000)]
siginfo:si_signo=SIGSEGV: si_errno=0, si_code=128 (), si_addr=0x0000000000000000
Stack: [0x0000000040dbf000,0x0000000040ec0000], sp=0x0000000040ebc018, free space=3f40000000000000018k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
C [libc.so.6+0x797c0] strlen+0x10
j com.sun.jna.Pointer._getString(JZ)Ljava/lang/String;+0
j com.sun.jna.Pointer.getString(JZ)Ljava/lang/String;+7
j com.sun.jna.Pointer.getString(J)Ljava/lang/String;+90
j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+630
j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168
j com.sun.jna.Structure.read()V+82
j com.sun.jna.Structure.autoRead()V+8
j com.sun.jna.Structure.updateStructureByReference(Ljava/lang/Class;Lcom/sun/jna/Structure;Lcom/sun/jna/Pointer;)Lcom/sun/jna/Structure;+68
j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+74
j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168
j com.sun.jna.Structure.read()V+82
j com.sun.jna.Structure.autoRead()V+8
v ~StubRoutines::call_stub
V [libjvm.so+0x3e756d]
V [libjvm.so+0x5f6f59]
V [libjvm.so+0x3e73a5]
V [libjvm.so+0x420904]
V [libjvm.so+0x400ea5]
C [jna1670124220621463742.tmp+0x6feb] newJavaStructure+0xdb
C [jna1670124220621463742.tmp+0xb919]
C [jna1670124220621463742.tmp+0x11008] ffi_closure_unix64_inner+0x88
C [jna1670124220621463742.tmp+0x11438] ffi_closure_unix64+0x46
j org.broadinstitute.sting.queue.engine.Lsf706JobRunner.status()Lscala/Enumeration$Value;+36
j org.broadinstitute.sting.queue.engine.FunctionEdge.status()Lscala/Enumeration$Value;+72
j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Lorg/broadinstitute/sting/queue/engine/QEdge;)Z+44
j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
* Because the error is in the second level call to autoRead(), and also in a structure that has a String, we are assuming
* that the error is on the submit structure even though this problem is very hard to reproduce consistently at the moment.
*/
public static class submitWithoutAutoRead extends submit {
// Declares no fields of its own: everything is inherited from submit, and only the auto-read flag differs.
public submitWithoutAutoRead() {
// Turn off autoRead so this structure is not populated automatically on return from
// lsb_readjobinfo; the crash trace in the comment above shows the SIGSEGV occurring inside autoRead().
this.setAutoRead(false);
}
}
/**
@ -6145,7 +6096,6 @@ public class LibBat {
/**
* \brief job information entry.
* HACK: The submit value in this structure currently has autoRead() set to false as a possible workaround for a SIGSEGV error.
*/
public static class jobInfoEnt extends Structure {
public static class ByReference extends jobInfoEnt implements Structure.ByReference {
@ -6292,9 +6242,8 @@ public class LibBat {
/**
* < Structure for \ref lsb_submit call.
* HACK: Use a structure that has the same size, but has autoRead turned off. Hopes to work around kernel SIGSEGV.
*/
public submitWithoutAutoRead submit;
public submit submit;
/**
* < Job exit status.

View File

@ -98,20 +98,20 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR
* Updates and returns the status.
*/
def status = {
var jobStatus = LibBat.JOB_STAT_NULL
var jobStatus = LibBat.JOB_STAT_UNKWN
var exitStatus = 0
var exitInfo = 0
var endTime: NativeLong = null
LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB)
val result = LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB)
if (result < 0)
throw new QException(LibBat.lsb_sperror("Unable to open LSF job info for job id: " + jobId))
try {
val jobInfo = LibBat.lsb_readjobinfo(null)
if (jobInfo == null) {
jobStatus = LibBat.JOB_STAT_UNKWN
exitStatus = 0
exitInfo = 0
endTime = null
} else {
if (result > 0) {
val more = new IntByReference(result)
val jobInfo = LibBat.lsb_readjobinfo(more)
if (jobInfo == null)
throw new QException(LibBat.lsb_sperror("lsb_readjobinfo returned null for job id: " + jobId))
jobStatus = jobInfo.status
exitStatus = jobInfo.exitStatus
exitInfo = jobInfo.exitInfo
@ -202,7 +202,7 @@ object Lsf706JobRunner extends Logging {
val numQueues = new IntByReference(1)
val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0)
if (queueInfo == null)
throw new QException("Unable to get LSF queue info for the default queue.")
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue"))
defaultQueue = queueInfo.queue
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
queueRlimitRun += defaultQueue -> limit
@ -217,7 +217,7 @@ object Lsf706JobRunner extends Logging {
val numQueues = new IntByReference(1)
val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0)
if (queueInfo == null)
throw new QException("Unable to get LSF queue info for queue: " + queue)
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue))
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
queueRlimitRun += queue -> limit
limit