diff --git a/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu b/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu index b3b511f6b4..e8db91b4bc 100644 --- a/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu +++ b/graphstorm-processing/docker/0.3.1/emr-serverless/Dockerfile.cpu @@ -1,5 +1,5 @@ ARG ARCH=x86_64 -FROM public.ecr.aws/emr-serverless/spark/emr-7.0.0:20240206-${ARCH} as base +FROM public.ecr.aws/emr-serverless/spark/emr-7.1.0:20240528-${ARCH} as base USER root ENV PYTHON_VERSION=3.9.18 @@ -40,6 +40,8 @@ else \ python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \ fi +# We use this file as an indicator of the execution environment +RUN touch /usr/lib/spark/code/EMR_SERVERLESS_EXECUTION # GSProcessing codebase COPY code/ /usr/lib/spark/code/ diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 0b2e6e5b21..c374056f56 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -573,10 +573,10 @@ def main(): format="[GSPROCESSING] %(asctime)s %(levelname)-8s %(message)s", ) - # Determine if we're running within a SageMaker container + # Determine execution environment if os.path.exists("/opt/ml/config/processingjobconfig.json"): execution_env = ExecutionEnv.SAGEMAKER - elif os.path.exists("/emr-serverless-config.json"): + elif os.path.exists("/usr/lib/spark/code/EMR_SERVERLESS_EXECUTION"): execution_env = ExecutionEnv.EMR_SERVERLESS elif os.path.exists("/usr/lib/spark/code/EMR_EXECUTION"): execution_env = ExecutionEnv.EMR_ON_EC2