From 3073696a60f5915793a3a60177ea56c4e796755e Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Mon, 29 Jul 2024 21:27:04 +0000 Subject: [PATCH] [GSProcessing] Update EMRS image to 7.1.0, add file in image to ensure we recognize execution env. --- .../docker/0.3.1/emr-serverless/Dockerfile.cpu | 4 +++- .../graphstorm_processing/distributed_executor.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu b/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu index b3b511f6b4..e8db91b4bc 100644 --- a/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu +++ b/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu @@ -1,5 +1,5 @@ ARG ARCH=x86_64 -FROM public.ecr.aws/emr-serverless/spark/emr-7.0.0:20240206-${ARCH} as base +FROM public.ecr.aws/emr-serverless/spark/emr-7.1.0:20240528-${ARCH} as base USER root ENV PYTHON_VERSION=3.9.18 @@ -40,6 +40,8 @@ else \ python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \ fi +# We use this file as an indicator of the execution environment +RUN touch /usr/lib/spark/code/EMR_SERVERLESS_EXECUTION # GSProcessing codebase COPY code/ /usr/lib/spark/code/ diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 0b2e6e5b21..c374056f56 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -573,10 +573,10 @@ def main(): format="[GSPROCESSING] %(asctime)s %(levelname)-8s %(message)s", ) - # Determine if we're running within a SageMaker container + # Determine execution environment if os.path.exists("/opt/ml/config/processingjobconfig.json"): execution_env = ExecutionEnv.SAGEMAKER - elif os.path.exists("/emr-serverless-config.json"): + elif os.path.exists("/usr/lib/spark/code/EMR_SERVERLESS_EXECUTION"): execution_env = ExecutionEnv.EMR_SERVERLESS elif os.path.exists("/usr/lib/spark/code/EMR_EXECUTION"): execution_env = ExecutionEnv.EMR_ON_EC2