diff --git a/graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu b/graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu index b256975673..ef4fd93591 100644 --- a/graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu +++ b/graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu @@ -31,7 +31,7 @@ RUN yum erase -y openssl-devel && \ sudo \ xz-devel && \ rm -rf /var/cache/yum -RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} && \ +RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} --single-branch && \ pyenv install ${PYTHON_VERSION} && \ pyenv global ${PYTHON_VERSION} @@ -41,7 +41,7 @@ WORKDIR /usr/lib/spark/code/ COPY requirements.txt requirements.txt # Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed: # https://github.com/moby/buildkit/issues/1512 -RUN pip install -r /usr/lib/spark/code/requirements.txt \ +RUN pip install --no-cache-dir -r /usr/lib/spark/code/requirements.txt \ && rm -rf /root/.cache # GSProcessing codebase @@ -63,7 +63,8 @@ RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*. 
rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache FROM runtime AS test -RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ && rm -rf /root/.cache +RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ mock && \ + rm -rf /root/.cache USER hadoop:hadoop WORKDIR /home/hadoop diff --git a/graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu b/graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu index ae9f873762..6e3df724ea 100644 --- a/graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu +++ b/graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu @@ -11,32 +11,17 @@ ENV LC_ALL=C.UTF-8 ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib" ENV PATH=/opt/conda/bin:$PATH +ENV PIP_NO_CACHE_DIR=1 -# Install GSProcessing requirements to pipenv Python -RUN pipenv install \ - boto3==1.28.38 \ - joblib==1.3.1 \ - mock==5.1.0 \ - pandas==1.3.5 \ - pip==23.1.2 \ - protobuf==3.20.3 \ - psutil==5.9.5 \ - pyarrow==13.0.0 \ - pyspark==3.4.1 \ - scipy==1.11.3 \ - setuptools \ - transformers==4.37.1 \ - spacy==3.6.0 \ - torch==2.1.0 \ - wheel \ - && rm -rf /root/.cache -# Do a pipenv sync so our base libs are independent from our editable code, making them cacheable -RUN pipenv sync --system && python3 -m spacy download en_core_web_lg \ +WORKDIR /usr/lib/spark/code/ + +# Install GSProcessing dependencies to system Python 3.9 +COPY requirements.txt requirements.txt +RUN /usr/local/bin/python3.9 -m pip install --no-cache-dir -r /usr/lib/spark/code/requirements.txt \ && rm -rf /root/.cache # Graphloader codebase COPY code/ /usr/lib/spark/code/ -WORKDIR /usr/lib/spark/code/ # Base container assumes this is the workdir ENV SPARK_HOME /usr/lib/spark @@ -60,10 +45,11 @@ fi ENTRYPOINT ["bash", "/usr/lib/spark/code/docker-entry.sh"] FROM base AS prod -RUN python3 -m pip install /usr/lib/spark/code/graphstorm_processing-*.whl && \ - rm 
/usr/lib/spark/code/graphstorm_processing-*.whl +RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \ + rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache CMD ["gs-processing"] FROM base AS test -RUN python3 -m pip install /usr/lib/spark/code/graphstorm-processing/ -CMD ["sh", "-c", "pytest ./code/tests/"] +RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ mock && \ + rm -rf /root/.cache +CMD ["sh", "-c", "pytest /usr/lib/spark/code/graphstorm-processing/tests/"] diff --git a/graphstorm-processing/docker/README.md b/graphstorm-processing/docker/README.md index 7da32a74d9..1e78ab613f 100644 --- a/graphstorm-processing/docker/README.md +++ b/graphstorm-processing/docker/README.md @@ -12,10 +12,9 @@ with Amazon SageMaker see docs/source/usage/distributed-processing-setup.rst. ## Building the image To build the image you will run `bash build_gsprocessing_image.sh` -script that has one required parameter, `--target` that can take -one of two values, `prod` and `test` that determine whether we -include the source and tests on the image (when `test` is used), -or just install the libary on the image (when `prod` is used). +script that has one required parameter, `--environment` that +determines the intended execution environment of the image. +We currently support either `sagemaker` or `emr-serverless`. The script copies the necessary code, optionally builds and packages the library as a `wheel` file and builds and tags the image. @@ -23,23 +22,38 @@ the library as a `wheel` file and builds and tags the image. You can get the other parameters of the script using `bash build_gsprocessing_image.sh -h/--help` that include: -* `-p, --path` Path to graphstorm-processing directory, default is one level above this script. -* `-i, --image` Docker image name, default is 'graphstorm-processing'. 
-* `-v, --version` Docker version tag, default is the library's current version (`poetry version --short`) -* `-b, --build` Docker build directory, default is '/tmp/` - - +* `-e, --environment` Intended execution environment, must be one of `sagemaker` or `emr-serverless`. Required. +* `-p, --path` Path to graphstorm-processing directory, default is one level above this script. +* `-i, --image` Docker image name, default is 'graphstorm-processing'. +* `-v, --version` Docker version tag, default is the library's current version (`poetry version --short`) +* `-b, --build` Docker build directory, default is `/tmp/` +* `-a, --architecture` Target architecture for the image. Both execution environments support `x86_64`, while + EMR Serverless also supports `arm64`. +* `-s, --suffix` A suffix to add to the image tag, e.g. `-test` will name the image + `graphstorm-processing-${ENVIRONMENT}:${VERSION}-${ARCH}-test`. +* `-t, --target` Target of the image. Use `test` if you intend to use the image for testing + new library functionality, otherwise `prod`. Default: `prod` ## Pushing the image After having built the image you will run `bash push_gsprocessing_image.sh` to push the image to ECR. By default the script will optionally create -a repository on ECR named `graphstorm-processing` in the `us-west-2` region +a repository on ECR named `graphstorm-processing-${ENVIRONMENT}` in the `us-west-2` region and push the image we just built to it. You can change these default values using the other parameters of the script: -* `-i, --image` Docker image name, default is 'graphstorm-processing'. -* `-v, --version` Docker version tag, default is the library's current version (`poetry version --short`) -* `-r, --region` AWS Region to which we'll push the image. By default will get from aws-cli configuration. -* `-a, --account` AWS Account ID. By default will get from aws-cli configuration. 
\ No newline at end of file +* `-e, --environment` Intended execution environment, must be one of `sagemaker` or `emr-serverless`. Required. +* `-i, --image` Docker image name prefix, default is `graphstorm-processing-${ENVIRONMENT}`. +* `-v, --version` Docker version tag, default is the library's current version (`poetry version --short`) +* `-r, --region` AWS Region to which we'll push the image. By default will get from aws-cli configuration. +* `-a, --account` AWS Account ID. By default will get from aws-cli configuration. + +## Testing the image + +If you build the image with the argument `--target test` the +build script will include the source and tests on the image. + +To run the unit tests inside a container you have created, which helps ensure the deployed container will +behave as expected, you can run `docker run -it --rm --name gsp graphstorm-processing-${ENV}:0.2.2-${ARCH}${SUFFIX}` +which will execute the library's unit tests inside a local instance of the provided image. diff --git a/graphstorm-processing/docker/build_gsprocessing_image.sh b/graphstorm-processing/docker/build_gsprocessing_image.sh index d2b71e43da..e92edfebf5 100644 --- a/graphstorm-processing/docker/build_gsprocessing_image.sh +++ b/graphstorm-processing/docker/build_gsprocessing_image.sh @@ -7,9 +7,9 @@ SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P) usage() { cat <