From be95c0bb5347122aead647d4251ff4e36c3a57b8 Mon Sep 17 00:00:00 2001 From: Aaron Berdy Date: Mon, 23 Oct 2023 17:33:30 -0700 Subject: [PATCH] feat: improve failure status reason --- base/jobs/docker/1.0/py3/requirements.txt | 2 +- pytorch/jobs/docker/2.0/py3/requirements.txt | 2 +- src/braket_container.py | 21 ++++++++++++++----- .../jobs/docker/2.13/py3/requirements.txt | 2 +- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/base/jobs/docker/1.0/py3/requirements.txt b/base/jobs/docker/1.0/py3/requirements.txt index dd951578..00a1860f 100644 --- a/base/jobs/docker/1.0/py3/requirements.txt +++ b/base/jobs/docker/1.0/py3/requirements.txt @@ -1,7 +1,7 @@ amazon-braket-default-simulator==1.20.1 amazon-braket-schemas==1.19.1 amazon-braket-pennylane-plugin==1.21.0 -amazon-braket-sdk==1.58.0 +amazon-braket-sdk==1.59.1 awscli==1.29.53 botocore==1.31.53 boto3==1.28.53 diff --git a/pytorch/jobs/docker/2.0/py3/requirements.txt b/pytorch/jobs/docker/2.0/py3/requirements.txt index 94584074..bf0e9a4c 100644 --- a/pytorch/jobs/docker/2.0/py3/requirements.txt +++ b/pytorch/jobs/docker/2.0/py3/requirements.txt @@ -1,7 +1,7 @@ amazon-braket-default-simulator==1.20.1 amazon-braket-schemas==1.19.1 amazon-braket-pennylane-plugin==1.21.0 -amazon-braket-sdk==1.58.0 +amazon-braket-sdk==1.59.1 awscli==1.29.53 botocore==1.31.53 boto3==1.28.53 diff --git a/src/braket_container.py b/src/braket_container.py index f6291d6e..9e52bd63 100644 --- a/src/braket_container.py +++ b/src/braket_container.py @@ -136,7 +136,7 @@ def unpack_code_and_add_to_path(local_s3_file: str, compression_type: str): sys.path.append(EXTRACTED_CUSTOMER_CODE_PATH) -def kick_off_customer_script(entry_point: str) -> multiprocessing.Process: +def kick_off_customer_script(entry_point: str, queue: multiprocessing.Queue) -> multiprocessing.Process: """ Runs the customer script as a separate process. @@ -151,7 +151,13 @@ def kick_off_customer_script(entry_point: str) -> multiprocessing.Process: customer_module = importlib.import_module(str_module) customer_method = getattr(customer_module, str_method) - process_kwargs = {"target": customer_method} + def wrapped_customer_method(queue, **kwargs): + try: + customer_method(**kwargs) + except Exception as exc: + queue.put(exc) + + process_kwargs = {"target": wrapped_customer_method, "args": (queue,)} function_args = try_bind_hyperparameters_to_customer_method(customer_method) if function_args is not None: @@ -186,7 +192,7 @@ def try_bind_hyperparameters_to_customer_method(customer_method: Callable): return function_args -def join_customer_script(customer_code_process: multiprocessing.Process): +def join_customer_script(customer_code_process: multiprocessing.Process, queue: multiprocessing.Queue): """ Joins the process running the customer code. @@ -195,6 +201,10 @@ def join_customer_script(customer_code_process: multiprocessing.Process): """ try: customer_code_process.join() + + if not queue.empty(): + exception = queue.get() + log_failure_and_exit(f"{type(exception).__name__}: {exception}") except Exception as e: log_failure_and_exit(f"Job did not exit gracefully.\nException: {e}") @@ -265,8 +275,9 @@ def run_customer_code_as_process(entry_point: str) -> int: int: The exit code of the customer code run. """ print("Running Code As Process") - customer_code_process = kick_off_customer_script(entry_point) - join_customer_script(customer_code_process) + queue = multiprocessing.Queue() + customer_code_process = kick_off_customer_script(entry_point, queue) + join_customer_script(customer_code_process, queue) print("Code Run Finished") return customer_code_process.exitcode diff --git a/tensorflow/jobs/docker/2.13/py3/requirements.txt b/tensorflow/jobs/docker/2.13/py3/requirements.txt index 4d138a85..1bd1b7fb 100644 --- a/tensorflow/jobs/docker/2.13/py3/requirements.txt +++ b/tensorflow/jobs/docker/2.13/py3/requirements.txt @@ -1,7 +1,7 @@ amazon-braket-default-simulator==1.20.1 amazon-braket-schemas==1.19.1 amazon-braket-pennylane-plugin==1.21.0 -amazon-braket-sdk==1.58.0 +amazon-braket-sdk==1.59.1 awscli==1.29.53 botocore==1.31.53 boto3==1.28.53