After code review with APSG, trying a patch for SIGSEGV errors: the patch checks the LSF result codes from lsb_openjobinfo instead of checking for a null return value from lsb_readjobinfo.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5220 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f3de9ee3e0
commit
a8ab5a5fb9
|
|
@ -5050,55 +5050,6 @@ public class LibBat {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* HACK: A version of the submit structure without autoread, so that
|
||||
* jobInfoEnt doesn't try to populate the structure on return from lsb_readjobinfo.
|
||||
* There are several reports of kernel crashes in strlen after a call to lsb_readjobinfo during the autoRead().
|
||||
*
|
||||
* Example:
|
||||
|
||||
Current thread (0x0000000050efd800): JavaThread "main" [_thread_in_native, id=22268, stack(0x0000000040dbf000,0x0000000040ec0000)]
|
||||
|
||||
siginfo:si_signo=SIGSEGV: si_errno=0, si_code=128 (), si_addr=0x0000000000000000
|
||||
|
||||
Stack: [0x0000000040dbf000,0x0000000040ec0000], sp=0x0000000040ebc018, free space=3f40000000000000018k
|
||||
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
|
||||
C [libc.so.6+0x797c0] strlen+0x10
|
||||
j com.sun.jna.Pointer._getString(JZ)Ljava/lang/String;+0
|
||||
j com.sun.jna.Pointer.getString(JZ)Ljava/lang/String;+7
|
||||
j com.sun.jna.Pointer.getString(J)Ljava/lang/String;+90
|
||||
j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+630
|
||||
j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168
|
||||
j com.sun.jna.Structure.read()V+82
|
||||
j com.sun.jna.Structure.autoRead()V+8
|
||||
j com.sun.jna.Structure.updateStructureByReference(Ljava/lang/Class;Lcom/sun/jna/Structure;Lcom/sun/jna/Pointer;)Lcom/sun/jna/Structure;+68
|
||||
j com.sun.jna.Pointer.getValue(JLjava/lang/Class;Ljava/lang/Object;)Ljava/lang/Object;+74
|
||||
j com.sun.jna.Structure.readField(Lcom/sun/jna/Structure$StructField;)Ljava/lang/Object;+168
|
||||
j com.sun.jna.Structure.read()V+82
|
||||
j com.sun.jna.Structure.autoRead()V+8
|
||||
v ~StubRoutines::call_stub
|
||||
V [libjvm.so+0x3e756d]
|
||||
V [libjvm.so+0x5f6f59]
|
||||
V [libjvm.so+0x3e73a5]
|
||||
V [libjvm.so+0x420904]
|
||||
V [libjvm.so+0x400ea5]
|
||||
C [jna1670124220621463742.tmp+0x6feb] newJavaStructure+0xdb
|
||||
C [jna1670124220621463742.tmp+0xb919]
|
||||
C [jna1670124220621463742.tmp+0x11008] ffi_closure_unix64_inner+0x88
|
||||
C [jna1670124220621463742.tmp+0x11438] ffi_closure_unix64+0x46
|
||||
j org.broadinstitute.sting.queue.engine.Lsf706JobRunner.status()Lscala/Enumeration$Value;+36
|
||||
j org.broadinstitute.sting.queue.engine.FunctionEdge.status()Lscala/Enumeration$Value;+72
|
||||
j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Lorg/broadinstitute/sting/queue/engine/QEdge;)Z+44
|
||||
j org.broadinstitute.sting.queue.engine.QGraph$$anonfun$getReadyJobs$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
|
||||
|
||||
* Because the error is in the second-level call to autoRead(), and also in a structure that has a String, we are assuming
|
||||
* that the error is on the submit structure even though this problem is very hard to reproduce consistently at the moment.
|
||||
*/
|
||||
public static class submitWithoutAutoRead extends submit {
|
||||
public submitWithoutAutoRead() {
|
||||
this.setAutoRead(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -6145,7 +6096,6 @@ public class LibBat {
|
|||
|
||||
/**
|
||||
* \brief job information entry.
|
||||
* HACK: The submit value in this structure currently has autoRead() set to false as a possible workaround for a SIGSEGV error.
|
||||
*/
|
||||
public static class jobInfoEnt extends Structure {
|
||||
public static class ByReference extends jobInfoEnt implements Structure.ByReference {
|
||||
|
|
@ -6292,9 +6242,8 @@ public class LibBat {
|
|||
|
||||
/**
|
||||
* < Structure for \ref lsb_submit call.
|
||||
* HACK: Use a structure that has the same size, but has autoRead turned off, in the hope of working around the kernel SIGSEGV.
|
||||
*/
|
||||
public submitWithoutAutoRead submit;
|
||||
public submit submit;
|
||||
|
||||
/**
|
||||
* < Job exit status.
|
||||
|
|
|
|||
|
|
@ -98,20 +98,20 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR
|
|||
* Updates and returns the status.
|
||||
*/
|
||||
def status = {
|
||||
var jobStatus = LibBat.JOB_STAT_NULL
|
||||
var jobStatus = LibBat.JOB_STAT_UNKWN
|
||||
var exitStatus = 0
|
||||
var exitInfo = 0
|
||||
var endTime: NativeLong = null
|
||||
|
||||
LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB)
|
||||
val result = LibBat.lsb_openjobinfo(jobId, null, null, null, null, LibBat.ALL_JOB)
|
||||
if (result < 0)
|
||||
throw new QException(LibBat.lsb_sperror("Unable to open LSF job info for job id: " + jobId))
|
||||
try {
|
||||
val jobInfo = LibBat.lsb_readjobinfo(null)
|
||||
if (jobInfo == null) {
|
||||
jobStatus = LibBat.JOB_STAT_UNKWN
|
||||
exitStatus = 0
|
||||
exitInfo = 0
|
||||
endTime = null
|
||||
} else {
|
||||
if (result > 0) {
|
||||
val more = new IntByReference(result)
|
||||
val jobInfo = LibBat.lsb_readjobinfo(more)
|
||||
if (jobInfo == null)
|
||||
throw new QException(LibBat.lsb_sperror("lsb_readjobinfo returned null for job id: " + jobId))
|
||||
jobStatus = jobInfo.status
|
||||
exitStatus = jobInfo.exitStatus
|
||||
exitInfo = jobInfo.exitInfo
|
||||
|
|
@ -202,7 +202,7 @@ object Lsf706JobRunner extends Logging {
|
|||
val numQueues = new IntByReference(1)
|
||||
val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0)
|
||||
if (queueInfo == null)
|
||||
throw new QException("Unable to get LSF queue info for the default queue.")
|
||||
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue"))
|
||||
defaultQueue = queueInfo.queue
|
||||
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
|
||||
queueRlimitRun += defaultQueue -> limit
|
||||
|
|
@ -217,7 +217,7 @@ object Lsf706JobRunner extends Logging {
|
|||
val numQueues = new IntByReference(1)
|
||||
val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0)
|
||||
if (queueInfo == null)
|
||||
throw new QException("Unable to get LSF queue info for queue: " + queue)
|
||||
throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue))
|
||||
val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN)
|
||||
queueRlimitRun += queue -> limit
|
||||
limit
|
||||
|
|
|
|||
Loading…
Reference in New Issue