diff --git a/services/worker/src/worker/executor.py b/services/worker/src/worker/executor.py index 62a8d01e86..ee4c4ec7c1 100644 --- a/services/worker/src/worker/executor.py +++ b/services/worker/src/worker/executor.py @@ -14,7 +14,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.queue import Queue from libcommon.utils import get_datetime -from mirakuru import OutputExecutor +from mirakuru import OutputExecutor, ProcessExitedWithError from worker import start_worker_loop from worker.config import AppConfig @@ -181,5 +181,16 @@ def kill_long_job(self, worker_loop_executor: OutputExecutor) -> None: def is_worker_alive(self, worker_loop_executor: OutputExecutor) -> bool: if worker_loop_executor.running(): return True - worker_loop_executor.stop() # raises an error if the worker returned exit code 1 + try: + worker_loop_executor.stop() # raises an error if the worker returned unexpected exit code + except ProcessExitedWithError as err: + explanation = f"exit code f{err.exit_code}" + if err.exit_code == -9: + explanation += " SIGKILL - surely an OOM" + error_msg = f"Worker crashed ({explanation})" + state = self.get_state() + if state and state["current_job_info"]: + error_msg += f"when running job_id={state['current_job_info']['job_id']}" + logging.error(error_msg) + raise return False