diff --git a/docs/source/gs-processing/gs-processing-getting-started.rst b/docs/source/gs-processing/gs-processing-getting-started.rst index 327e2ac111..048adc1de2 100644 --- a/docs/source/gs-processing/gs-processing-getting-started.rst +++ b/docs/source/gs-processing/gs-processing-getting-started.rst @@ -145,11 +145,12 @@ distributed training pipeline. Running on AWS resources ------------------------ -GSProcessing supports Amazon SageMaker and EMR Serverless as execution environments. +GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as execution environments. To run distributed jobs on AWS resources we will have to build a Docker image and push it to the Amazon Elastic Container Registry, which we cover in -:doc:`usage/distributed-processing-setup` and run a SageMaker Processing -job which we describe in :doc:`usage/amazon-sagemaker`, or EMR Serverless +:doc:`usage/distributed-processing-setup`. We can then run either a SageMaker Processing +job which we describe in :doc:`usage/amazon-sagemaker`, an EMR on EC2 job which +we describe in :doc:`usage/emr`, or an EMR Serverless job that is covered in :doc:`usage/emr-serverless`. diff --git a/docs/source/gs-processing/usage/distributed-processing-setup.rst b/docs/source/gs-processing/usage/distributed-processing-setup.rst index 00d15399c0..0aad7af9ce 100644 --- a/docs/source/gs-processing/usage/distributed-processing-setup.rst +++ b/docs/source/gs-processing/usage/distributed-processing-setup.rst @@ -87,14 +87,14 @@ Once Docker and Poetry are installed, and your AWS credentials are set up, we can use the provided scripts in the ``graphstorm-processing/docker`` directory to build the image. -GSProcessing supports Amazon SageMaker and EMR Serverless as +GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as execution environments, so we need to choose which image we want to build first. 
The ``build_gsprocessing_image.sh`` script can build the image locally and tag it, provided the intended execution environment, using the ``-e/--environment`` argument. The supported environments -are ``sagemaker`` and ``emr-serverless``. +are ``sagemaker``, ``emr``, and ``emr-serverless``. For example, assuming our current directory is where we cloned ``graphstorm/graphstorm-processing``, we can use the following to build the SageMaker image: @@ -131,7 +131,7 @@ You can find detailed instructions on creating a VPC for EMR Serverless in the A Support for arm64 architecture ------------------------------ -For EMR Serverless images, it is possible to build images for the ``arm64`` architecture, +For EMR on EC2 and EMR Serverless images, it is possible to build images for the ``arm64`` architecture, which can lead to improved runtime and cost compared to ``x86_64``. For more details on EMR Serverless architecture options see the `official docs <https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/architecture.html>`_. diff --git a/docs/source/gs-processing/usage/emr-serverless.rst b/docs/source/gs-processing/usage/emr-serverless.rst index a9ce1d7364..e92608581c 100644 --- a/docs/source/gs-processing/usage/emr-serverless.rst +++ b/docs/source/gs-processing/usage/emr-serverless.rst @@ -69,7 +69,7 @@ you are using would be: "ecr:BatchGetImage", "ecr:DescribeImages" ], - "Resource": "" + "Resource": "arn:aws:ecr:<REGION>:<ACCOUNT_ID>:repository/graphstorm-processing-emr-serverless" } ] } diff --git a/graphstorm-processing/docker/0.3.0/emr/Dockerfile.cpu b/graphstorm-processing/docker/0.3.0/emr/Dockerfile.cpu new file mode 100644 index 0000000000..9feed22b3b --- /dev/null +++ b/graphstorm-processing/docker/0.3.0/emr/Dockerfile.cpu @@ -0,0 +1,76 @@ +# TODO: Pin image version +FROM public.ecr.aws/amazoncorretto/amazoncorretto:17 as base + +ENV PYTHON_VERSION=3.9.18 + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+# NOTE(review): HOME is not declared via ENV/ARG in this file, so unless the base image
+# exports it, ${HOME} expands to "" here and pyenv lands in /.pyenv. Confirm this is intended.
+ENV PYENV_ROOT="${HOME}/.pyenv"
+ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}"
+
+ENV PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python
+ENV PYSPARK_PYTHON=${PYENV_ROOT}/shims/python
+
+# pyenv and Spark/YARN dependencies
+RUN yum erase -y openssl-devel && \
+    yum install -y \
+        bzip2-devel \
+        gcc \
+        git \
+        hostname \
+        java-17-amazon-corretto-headless \
+        libffi-devel \
+        make \
+        ncurses-devel \
+        openssl11-devel \
+        readline-devel \
+        sqlite-devel \
+        sudo \
+        tar \
+        xz-devel && \
+    yum clean all && \
+    rm -rf /var/cache/yum
+
+# Install Python through pyenv (shallow clone: only the current tree is needed to build Python)
+RUN git clone --depth 1 --single-branch https://github.com/pyenv/pyenv.git ${PYENV_ROOT} && \
+    pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION}
+
+FROM base AS runtime
+
+WORKDIR /usr/lib/spark/code/
+
+# Install GSProcessing requirements to pyenv Python
+COPY requirements.txt requirements.txt
+# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
+# https://github.com/moby/buildkit/issues/1512
+RUN pip3 install --no-cache-dir -r /usr/lib/spark/code/requirements.txt
+
+# Install Huggingface model cache if it is necessary
+# This needs to happen after the transformers library has been installed above
+ARG MODEL=""
+ENV HF_HOME=/usr/lib/spark/.cache/huggingface/hub
+# Downloads are chained with && so that a failed tokenizer download fails the build
+RUN if [ -z "${MODEL}" ]; then \
+        echo "Skip installing model cache"; \
+    else \
+        echo "Installing model cache for $MODEL" && \
+        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')" && \
+        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
+    fi
+
+# GSProcessing codebase
+COPY code/ /usr/lib/spark/code/
+
+FROM runtime AS prod
+RUN python3 -m pip install --no-cache-dir --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
+    rm /usr/lib/spark/code/graphstorm_processing-*.whl
+
+FROM runtime AS
test +RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ && rm -rf /root/.cache diff --git a/graphstorm-processing/docker/build_gsprocessing_image.sh b/graphstorm-processing/docker/build_gsprocessing_image.sh index 5b53ace508..aa87b67460 100644 --- a/graphstorm-processing/docker/build_gsprocessing_image.sh +++ b/graphstorm-processing/docker/build_gsprocessing_image.sh @@ -15,7 +15,7 @@ Available options: -h, --help Print this help and exit -x, --verbose Print script debug info (set -x) --e, --environment Image execution environment. Must be one of 'emr-serverless' or 'sagemaker'. Required. +-e, --environment Image execution environment. Must be one of 'emr', 'emr-serverless' or 'sagemaker'. Required. -a, --architecture Image architecture. Must be one of 'x86_64' or 'arm64'. Default is 'x86_64'. Note that only x86_64 architecture is supported for SageMaker. -t, --target Docker image target, must be one of 'prod' or 'test'. Default is 'prod'. @@ -102,7 +102,7 @@ parse_params() { args=("$@") # check required params and arguments - [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [emr-serverless|sagemaker]" + [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [emr|emr-serverless|sagemaker]" return 0 } @@ -117,6 +117,13 @@ cleanup() { parse_params "$@" +# Abort early unless the requested execution environment is a supported one +case "${EXEC_ENV}" in + emr | emr-serverless | sagemaker) ;; + *) die "--environment parameter needs to be one of 'emr', 'emr-serverless' or 'sagemaker', got ${EXEC_ENV}" ;; +esac + + if [[ ${TARGET} == "prod" || ${TARGET} == "test" ]]; then : # Do nothing else diff --git a/graphstorm-processing/docker/push_gsprocessing_image.sh b/graphstorm-processing/docker/push_gsprocessing_image.sh index 7d1116db1a..7df2f820dd 100644 --- a/graphstorm-processing/docker/push_gsprocessing_image.sh +++ b/graphstorm-processing/docker/push_gsprocessing_image.sh @@ -100,7 +100,7 @@ cleanup() { 
parse_params "${@}" -if [[ ${EXEC_ENV} == "sagemaker" || ${EXEC_ENV} == "emr-serverless" ]]; then +if [[ ${EXEC_ENV} == "sagemaker" || ${EXEC_ENV} == "emr-serverless" || ${EXEC_ENV} == "emr" ]]; then : # Do nothing else die "--environment parameter needs to be one of 'emr', 'emr-serverless' or 'sagemaker', got ${EXEC_ENV}"