diff --git a/docs/source/gs-processing/usage/distributed-processing-setup.rst b/docs/source/gs-processing/usage/distributed-processing-setup.rst index b50880a48d..261c0ce9a9 100644 --- a/docs/source/gs-processing/usage/distributed-processing-setup.rst +++ b/docs/source/gs-processing/usage/distributed-processing-setup.rst @@ -114,9 +114,13 @@ for more information. Support for arm64 architecture ------------------------------ -You might have noticed that we include the image's architecture, ``x86_64`` in the image name. For EMR Serverless images, it is possible to build images that support ``arm64`` instances, -which can lead to improved runtime and cost compared to ``x86_64``. To build ``arm64`` images +which can lead to improved runtime and cost compared to ``x86_64``. You can build an ``arm64`` +image natively by installing Docker and following the above process on an ARM instance such +as ``M6G`` or ``M7G``. See the `AWS documentation `_ +for instances powered by the Graviton processor. + +To build ``arm64`` images on an ``x86_64`` host you need to enable multi-platform builds for Docker. The easiest way to do so is to use QEMU emulation. To install the QEMU related libraries you can run @@ -159,9 +163,10 @@ To build an EMR Serverless GSProcessing image for the ``arm64`` architecture you Building images under emulation using QEMU can be significantly slower than native builds (more than 20 minutes to build the GSProcessing ``arm64`` image). - To speed up the build process you can look into using ``buildx`` with multiple native nodes, - or cross-compilation. - See `the official Docker documentation `_ for details. + To speed up the build process you can build on an ARM instance, + look into using ``buildx`` with multiple native nodes, or use cross-compilation. + See `the official Docker documentation `_ + for details. 
Push the image to the Amazon Elastic Container Registry (ECR) ------------------------------------------------------------- diff --git a/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu b/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu deleted file mode 100644 index c39e21bbf6..0000000000 --- a/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu +++ /dev/null @@ -1,45 +0,0 @@ -# syntax=docker/dockerfile:experimental -FROM 153931337802.dkr.ecr.us-west-2.amazonaws.com/sagemaker-spark-processing:3.4-cpu-py39-v1.0 AS base - -# Python won’t try to write .pyc or .pyo files on the import of source modules -# Force stdin, stdout and stderr to be totally unbuffered. Good for logging -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib" -ENV PATH=/opt/conda/bin:$PATH - -# GSProcessing requirements -RUN pipenv install pip==23.1.2 setuptools wheel spacy==3.6.0 pyspark==3.4.1 \ - pyarrow==13.0.0 joblib==1.3.1 psutil==5.9.5 pandas==1.3.5 \ - boto3==1.28.38 protobuf==3.20.3 mock==5.1.0 \ - && rm -rf /root/.cache -# Do a pipenv sync so our base libs are independent from our editable code, making them cacheable -RUN pipenv sync --system && python3 -m spacy download en_core_web_lg \ - && rm -rf /root/.cache - -# Graphloader codebase -COPY code/ /usr/lib/spark/code/ -WORKDIR /usr/lib/spark/code/ - -# Base container assumes this is the workdir -ENV SPARK_HOME /usr/lib/spark -WORKDIR $SPARK_HOME - -# Ensure our python3 installation is the one used -RUN echo 'alias python3=python3.9' >> ~/.bashrc - -# Starts framework -ENTRYPOINT ["bash", "/usr/lib/spark/code/docker-entry.sh"] - -FROM base AS prod -RUN python3 -m pip install /usr/lib/spark/code/graphstorm_processing-*.whl && \ - rm /usr/lib/spark/code/graphstorm_processing-*.whl -CMD ["gs-processing"] - -FROM base AS test -RUN 
python3 -m pip install /usr/lib/spark/code/graphstorm-processing/ -CMD ["sh", "-c", "pytest ./code/tests/"] diff --git a/graphstorm-processing/tests/resources/small_heterogeneous_graph/gsprocessing-config.json b/graphstorm-processing/tests/resources/small_heterogeneous_graph/gsprocessing-config.json index 48b3b2deb8..1ea789b69b 100644 --- a/graphstorm-processing/tests/resources/small_heterogeneous_graph/gsprocessing-config.json +++ b/graphstorm-processing/tests/resources/small_heterogeneous_graph/gsprocessing-config.json @@ -21,7 +21,7 @@ ], "separator": "," }, - "type": "movies", + "type": "movie", "column": "~id" }, {