[GSProcessing] Add support for EMR on EC2 execution.

awslabs · Apr 23, 2024 · 7536074 · 7536074
1 parent b0a8c77
commit 7536074
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 10 deletions.
diff --git a/docs/source/gs-processing/gs-processing-getting-started.rst b/docs/source/gs-processing/gs-processing-getting-started.rst
@@ -145,11 +145,12 @@ distributed training pipeline.
 Running on AWS resources
 ------------------------
 
-GSProcessing supports Amazon SageMaker and EMR Serverless as execution environments.
+GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as execution environments.
 To run distributed jobs on AWS resources we will have to build a Docker image
 and push it to the Amazon Elastic Container Registry, which we cover in
-:doc:`usage/distributed-processing-setup` and run a SageMaker Processing
-job which we describe in :doc:`usage/amazon-sagemaker`, or EMR Serverless
+:doc:`usage/distributed-processing-setup`. We can then run either a SageMaker Processing
+job which we describe in :doc:`usage/amazon-sagemaker`, an EMR on EC2 job which
+we describe in :doc:`usage/emr`, or an EMR Serverless
 job that is covered in :doc:`usage/emr-serverless`.
 
 

diff --git a/docs/source/gs-processing/usage/distributed-processing-setup.rst b/docs/source/gs-processing/usage/distributed-processing-setup.rst
@@ -87,14 +87,14 @@ Once Docker and Poetry are installed, and your AWS credentials are set up,
 we can use the provided scripts
 in the ``graphstorm-processing/docker`` directory to build the image.
 
-GSProcessing supports Amazon SageMaker and EMR Serverless as
+GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as
 execution environments, so we need to choose which image we want
 to build first.
 
 The ``build_gsprocessing_image.sh`` script can build the image
 locally and tag it, provided the intended execution environment,
 using the ``-e/--environment`` argument. The supported environments
-are ``sagemaker`` and ``emr-serverless``.
+are ``sagemaker``, ``emr``, and ``emr-serverless``.
 For example, assuming our current directory is where
 we cloned ``graphstorm/graphstorm-processing``, we can use
 the following to build the SageMaker image:
@@ -131,7 +131,7 @@ You can find detailed instructions on creating a VPC for EMR Serverless in the A
 Support for arm64 architecture
 ------------------------------
 
-For EMR Serverless images, it is possible to build images for the ``arm64`` architecture,
+For EMR on EC2 and EMR Serverless images, it is possible to build images for the ``arm64`` architecture,
 which can lead to improved runtime and cost compared to ``x86_64``. For more details
 on EMR Serverless architecture options see the
 `official docs <https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/architecture.html>`_.

diff --git a/docs/source/gs-processing/usage/emr-serverless.rst b/docs/source/gs-processing/usage/emr-serverless.rst
@@ -69,7 +69,7 @@ you are using would be:
                     "ecr:BatchGetImage",
                     "ecr:DescribeImages"
                 ],
-                "Resource": "<enter-ecr-repository-arn-here>"
+                "Resource": "arn:aws:ecr:<REGION>:<ACCOUNT>:repository/graphstorm-processing-emr-serverless"
             }
         ]
     }

diff --git a/graphstorm-processing/docker/0.3.0/emr/Dockerfile.cpu b/graphstorm-processing/docker/0.3.0/emr/Dockerfile.cpu
@@ -0,0 +1,76 @@
+# GSProcessing image for EMR on EC2 (CPU).
+#
+# Build stages:
+#   base    - Corretto 17 JDK, build dependencies and a pyenv-managed Python
+#   runtime - GSProcessing Python requirements, optional Hugging Face model
+#             cache, and the contents of the code/ directory
+#   prod    - runtime + GSProcessing installed from the pre-built wheel
+#   test    - runtime + GSProcessing installed from the source tree
+#
+# TODO: Pin image version
+FROM public.ecr.aws/amazoncorretto/amazoncorretto:17 AS base
+
+ENV PYTHON_VERSION=3.9.18
+
+# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
+# import of source modules.
+# PYTHONUNBUFFERED: force stdout and stderr to be unbuffered. Good for logging.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+# NOTE(review): ${HOME} in an ENV instruction is substituted by Docker from
+# previously declared ENV/ARG values, not by a shell. If the base image does
+# not declare HOME, PYENV_ROOT resolves to "/.pyenv" -- confirm this is intended.
+ENV PYENV_ROOT="${HOME}/.pyenv"
+ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}"
+
+# Make PySpark driver and executors use the pyenv-managed interpreter
+ENV PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python
+ENV PYSPARK_PYTHON=${PYENV_ROOT}/shims/python
+
+# pyenv and Spark/YARN dependencies.
+# openssl-devel is erased before openssl11-devel is installed
+# (presumably because the two packages conflict -- TODO confirm).
+RUN yum erase -y openssl-devel && \
+    yum install -y \
+        bzip2-devel \
+        gcc \
+        git \
+        hostname \
+        java-17-amazon-corretto-headless \
+        libffi-devel \
+        make \
+        ncurses-devel \
+        openssl11-devel \
+        readline-devel \
+        sqlite-devel \
+        sudo \
+        tar \
+        xz-devel && \
+    rm -rf /var/cache/yum
+
+# Install Python through pyenv
+RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} --single-branch && \
+    pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION}
+
+FROM base AS runtime
+
+WORKDIR /usr/lib/spark/code/
+
+
+# Install GSProcessing requirements to pyenv Python
+COPY requirements.txt requirements.txt
+# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
+# https://github.com/moby/buildkit/issues/1512
+RUN pip3 install -r /usr/lib/spark/code/requirements.txt \
+    && rm -rf /root/.cache
+
+# Install Huggingface model cache if it is necessary
+# This needs to happen after the transformers library has been installed above.
+# The commands are chained with && so that a failed download fails the build
+# instead of silently producing an image with an incomplete model cache.
+ARG MODEL=""
+ENV HF_HOME=/usr/lib/spark/.cache/huggingface/hub
+RUN if [ -z "${MODEL}" ]; then \
+        echo "Skip installing model cache"; \
+    else \
+        echo "Installing model cache for $MODEL" && \
+        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')" && \
+        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
+    fi
+
+
+# GSProcessing codebase
+COPY code/ /usr/lib/spark/code/
+
+# Production image: install the pre-built wheel, then delete it to save space
+FROM runtime AS prod
+RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
+    rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache
+
+# Test image: install GSProcessing from the copied source directory
+FROM runtime AS test
+RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ && rm -rf /root/.cache
diff --git a/graphstorm-processing/docker/build_gsprocessing_image.sh b/graphstorm-processing/docker/build_gsprocessing_image.sh
@@ -15,7 +15,7 @@ Available options:
 
 -h, --help          Print this help and exit
 -x, --verbose       Print script debug info (set -x)
--e, --environment   Image execution environment. Must be one of 'emr-serverless' or 'sagemaker'. Required.
+-e, --environment   Image execution environment. Must be one of 'emr', 'emr-serverless' or 'sagemaker'. Required.
 -a, --architecture  Image architecture. Must be one of 'x86_64' or 'arm64'. Default is 'x86_64'.
                     Note that only x86_64 architecture is supported for SageMaker.
 -t, --target        Docker image target, must be one of 'prod' or 'test'. Default is 'prod'.
@@ -102,7 +102,7 @@ parse_params() {
   args=("$@")
 
   # check required params and arguments
-  [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [emr-serverless|sagemaker]"
+  [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [emr|emr-serverless|sagemaker]"
 
   return 0
 }
@@ -117,6 +117,13 @@ cleanup() {
 
 parse_params "$@"
 
+if [[ ${EXEC_ENV} == "emr" || ${EXEC_ENV} == "emr-serverless" || ${EXEC_ENV} == "sagemaker" ]]; then
+    :  # Do nothing
+else
+    die "--environment parameter needs to be one of 'emr', 'emr-serverless' or 'sagemaker', got ${EXEC_ENV}"
+fi
+
+
 if [[ ${TARGET} == "prod" || ${TARGET} == "test" ]]; then
     :  # Do nothing
 else

diff --git a/graphstorm-processing/docker/push_gsprocessing_image.sh b/graphstorm-processing/docker/push_gsprocessing_image.sh
@@ -100,7 +100,7 @@ cleanup() {
 
 parse_params "${@}"
 
-if [[ ${EXEC_ENV} == "sagemaker" || ${EXEC_ENV} == "emr-serverless" ]]; then
+if [[ ${EXEC_ENV} == "sagemaker" || ${EXEC_ENV} == "emr-serverless" || ${EXEC_ENV} == "emr" ]]; then
     :  # Do nothing
 else
     die "--environment parameter needs to be one of 'emr', 'emr-serverless' or 'sagemaker', got ${EXEC_ENV}"