diff --git a/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu b/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu new file mode 100644 index 0000000000..c39e21bbf6 --- /dev/null +++ b/graphstorm-processing/docker/0.2.1/sagemaker/Dockerfile.cpu @@ -0,0 +1,45 @@ +# syntax=docker/dockerfile:experimental +FROM 153931337802.dkr.ecr.us-west-2.amazonaws.com/sagemaker-spark-processing:3.4-cpu-py39-v1.0 AS base + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib" +ENV PATH=/opt/conda/bin:$PATH + +# GSProcessing requirements +RUN pipenv install pip==23.1.2 setuptools wheel spacy==3.6.0 pyspark==3.4.1 \ + pyarrow==13.0.0 joblib==1.3.1 psutil==5.9.5 pandas==1.3.5 \ + boto3==1.28.38 protobuf==3.20.3 mock==5.1.0 \ + && rm -rf /root/.cache +# Do a pipenv sync so our base libs are independent from our editable code, making them cacheable +RUN pipenv sync --system && python3 -m spacy download en_core_web_lg \ + && rm -rf /root/.cache + +# Graphloader codebase +COPY code/ /usr/lib/spark/code/ +WORKDIR /usr/lib/spark/code/ + +# Base container assumes this is the workdir +ENV SPARK_HOME /usr/lib/spark +WORKDIR $SPARK_HOME + +# Ensure our python3 installation is the one used +RUN echo 'alias python3=python3.9' >> ~/.bashrc + +# Starts framework +ENTRYPOINT ["bash", "/usr/lib/spark/code/docker-entry.sh"] + +FROM base AS prod +RUN python3 -m pip install /usr/lib/spark/code/graphstorm_processing-*.whl && \ + rm /usr/lib/spark/code/graphstorm_processing-*.whl +CMD ["gs-processing"] + +FROM base AS test +RUN python3 -m pip install /usr/lib/spark/code/graphstorm-processing/ +CMD ["sh", "-c", "pytest ./code/tests/"]