From 6acb80ab477f72a5c49c81e3555020ff0995c8fc Mon Sep 17 00:00:00 2001 From: Qing Lan Date: Sat, 9 Dec 2023 15:25:27 -0800 Subject: [PATCH] [Docker] upgrade cuda 12.1 support for DJLServing (#1370) --- .github/workflows/docker-nightly-publish.yml | 2 +- .github/workflows/integration.yml | 8 +++---- serving/docker/Dockerfile | 4 ++-- serving/docker/README.md | 2 +- serving/docker/docker-compose.yml | 6 ++--- ...u118.Dockerfile => pytorch-gpu.Dockerfile} | 24 +++++++++---------- serving/docker/scripts/pull_and_retag.sh | 2 +- serving/docker/scripts/security_patch.sh | 3 +-- tests/integration/download_models.sh | 6 ++--- 9 files changed, 28 insertions(+), 29 deletions(-) rename serving/docker/{pytorch-cu118.Dockerfile => pytorch-gpu.Dockerfile} (85%) diff --git a/.github/workflows/docker-nightly-publish.yml b/.github/workflows/docker-nightly-publish.yml index e22cc4706..bb0592bf7 100644 --- a/.github/workflows/docker-nightly-publish.yml +++ b/.github/workflows/docker-nightly-publish.yml @@ -19,7 +19,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [ cpu, cpu-full, pytorch-inf2, pytorch-cu118, tensorrt-llm ] + arch: [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm ] steps: - name: Clean disk space run: | diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index b01372ff5..63a2350c9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -170,24 +170,24 @@ jobs: working-directory: benchmark run: ./gradlew installOnLinux - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh pytorch-cu118 ${{ github.event.inputs.djl-version }} + run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }} - name: Download models and dockers working-directory: tests/integration run: | docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG mkdir logs - ./download_models.sh pytorch-cu118 + ./download_models.sh pytorch-gpu - name: 
Test Python model working-directory: tests/integration run: | - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \ + ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \ serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip ./test_client.sh image/jpg models/kitten.jpg docker rm -f $(docker ps -aq) - name: Test PyTorch model working-directory: tests/integration run: | - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \ + ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \ serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip ./test_client.sh image/jpg models/kitten.jpg docker rm -f $(docker ps -aq) diff --git a/serving/docker/Dockerfile b/serving/docker/Dockerfile index 27b8723ec..9a8561617 100644 --- a/serving/docker/Dockerfile +++ b/serving/docker/Dockerfile @@ -60,7 +60,7 @@ LABEL djl-version=$djl_version FROM base AS cpu-full -ARG torch_version=2.0.1 +ARG torch_version=2.1.1 COPY scripts scripts/ RUN scripts/install_python.sh && \ @@ -76,5 +76,5 @@ RUN scripts/install_python.sh && \ apt-get clean -y && rm -rf /var/lib/apt/lists/* \ LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.cpu-full="true" -LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.cpu-full="true" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.cpu-full="true" LABEL torch-version=$torch_version diff --git a/serving/docker/README.md b/serving/docker/README.md index 3eead9eec..bbc38b91e 100644 --- a/serving/docker/README.md +++ b/serving/docker/README.md @@ -42,7 +42,7 @@ mkdir models cd models curl -O https://resources.djl.ai/test-models/pytorch/bert_qa_jit.tar.gz -docker run -it --runtime=nvidia --shm-size 2g -v $PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.23.0-pytorch-cu118 +docker run -it --runtime=nvidia --shm-size 2g -v 
$PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.26.0-pytorch-gpu ``` ### AWS Inferentia diff --git a/serving/docker/docker-compose.yml b/serving/docker/docker-compose.yml index acd6e5034..062a1f10f 100644 --- a/serving/docker/docker-compose.yml +++ b/serving/docker/docker-compose.yml @@ -22,12 +22,12 @@ services: context: . dockerfile: deepspeed.Dockerfile image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}deepspeed${NIGHTLY}" - pytorch-cu118: + pytorch-gpu: build: context: . target: base - dockerfile: pytorch-cu118.Dockerfile - image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-cu118${NIGHTLY}" + dockerfile: pytorch-gpu.Dockerfile + image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-gpu${NIGHTLY}" tensorrt-llm: build: context: . diff --git a/serving/docker/pytorch-cu118.Dockerfile b/serving/docker/pytorch-gpu.Dockerfile similarity index 85% rename from serving/docker/pytorch-cu118.Dockerfile rename to serving/docker/pytorch-gpu.Dockerfile index 0ac4b98e8..5799e0934 100644 --- a/serving/docker/pytorch-cu118.Dockerfile +++ b/serving/docker/pytorch-gpu.Dockerfile @@ -9,15 +9,15 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for # the specific language governing permissions and limitations under the License. 
-ARG version=11.8.0-cudnn8-devel-ubuntu20.04 +ARG version=12.1.1-cudnn8-devel-ubuntu22.04 FROM nvidia/cuda:$version as base -ARG djl_version=0.24.0~SNAPSHOT -ARG cuda_version=cu118 -ARG torch_version=2.0.1 -ARG torch_vision_version=0.15.2 -ARG python_version=3.9 +ARG djl_version=0.26.0~SNAPSHOT +ARG cuda_version=cu121 +ARG torch_version=2.1.1 +ARG torch_vision_version=0.16.1 +ARG python_version=3.10 RUN mkdir -p /opt/djl/conf && \ mkdir -p /opt/ml/model @@ -31,10 +31,10 @@ ENV MODEL_SERVER_HOME=/opt/djl ENV DJL_CACHE_DIR=/tmp/.djl.ai ENV HUGGINGFACE_HUB_CACHE=/tmp ENV TRANSFORMERS_CACHE=/tmp -ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.9/dist-packages/torch/lib +ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib ENV PYTORCH_PRECXX11=true ENV PYTORCH_VERSION=${torch_version} -ENV PYTORCH_FLAVOR=cu118-precxx11 +ENV PYTORCH_FLAVOR=cu121-precxx11 # TODO: remove TORCH_CUDNN_V8_API_DISABLED once PyTorch bug is fixed ENV TORCH_CUDNN_V8_API_DISABLED=1 ENV JAVA_OPTS="-Xmx1g -Xms1g -XX:+ExitOnOutOfMemoryError -Dai.djl.default_engine=PyTorch" @@ -53,9 +53,9 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh && \ scripts/install_djl_serving.sh $djl_version ${torch_version} && \ scripts/install_python.sh ${python_version} && \ scripts/install_s5cmd.sh x64 && \ - pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 && \ + pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu121 && \ scripts/patch_oss_dlc.sh python && \ - scripts/security_patch.sh pytorch-cu118 && \ + scripts/security_patch.sh pytorch-gpu && \ useradd -m -d /home/djl djl && \ chown -R djl:djl /opt/djl && \ rm -rf scripts && pip3 cache purge && \ @@ -69,8 +69,8 @@ CMD ["serve"] LABEL maintainer="djl-dev@amazon.com" LABEL dlc_major_version="1" -LABEL 
com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-cu118="true" -LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.pytorch-cu118="true" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-gpu="true" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.pytorch-gpu="true" LABEL com.amazonaws.sagemaker.capabilities.multi-models="true" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" LABEL djl-version=$djl_version diff --git a/serving/docker/scripts/pull_and_retag.sh b/serving/docker/scripts/pull_and_retag.sh index d2bc913fb..64c34bdc7 100755 --- a/serving/docker/scripts/pull_and_retag.sh +++ b/serving/docker/scripts/pull_and_retag.sh @@ -2,7 +2,7 @@ version=$1 repo=$2 -images="cpu aarch64 cpu-full pytorch-inf2 pytorch-cu118 deepspeed tensorrt-llm" +images="cpu aarch64 cpu-full pytorch-inf2 pytorch-gpu deepspeed tensorrt-llm" for image in $images; do if [[ ! "$version" == "nightly" ]]; then diff --git a/serving/docker/scripts/security_patch.sh b/serving/docker/scripts/security_patch.sh index 97b26dcc1..fa835c5c2 100755 --- a/serving/docker/scripts/security_patch.sh +++ b/serving/docker/scripts/security_patch.sh @@ -4,8 +4,7 @@ IMAGE_NAME=$1 apt-get update -if [[ "$IMAGE_NAME" == "deepspeed" ]] || \ - [[ "$IMAGE_NAME" == "pytorch-cu118" ]]; then +if [[ "$IMAGE_NAME" == "deepspeed" ]]; then apt-get upgrade -y dpkg e2fsprogs libdpkg-perl libpcre2-8-0 libpcre3 openssl libsqlite3-0 libsepol1 libdbus-1-3 curl elif [[ "$IMAGE_NAME" == "cpu" ]]; then apt-get upgrade -y libpcre2-8-0 libdbus-1-3 curl diff --git a/tests/integration/download_models.sh b/tests/integration/download_models.sh index 89f98483d..cdcc57470 100755 --- a/tests/integration/download_models.sh +++ b/tests/integration/download_models.sh @@ -2,7 +2,7 @@ set -e -platform=$1 # expected values are "cpu" "cpu-full" "pytorch-cu118" "pytorch-inf2" "aarch64" +platform=$1 # expected values are "cpu" "cpu-full" "pytorch-gpu" "pytorch-inf2" 
"aarch64" rm -rf models mkdir models && cd models @@ -38,7 +38,7 @@ download() { } case $platform in -cpu | cpu-full | pytorch-cu118) +cpu | cpu-full | pytorch-gpu) download "${general_platform_models_urls[@]}" ;; pytorch-inf2) @@ -48,7 +48,7 @@ aarch64) download "${aarch_models_urls[@]}" ;; *) - echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-cu118, pytorch-inf2, aarch64" + echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-gpu, pytorch-inf2, aarch64" exit 1 ;; esac