[Docker] upgrade cuda 12.1 support for DJLServing (#1370)

deepjavalibrary · Dec 9, 2023 · 6acb80a
1 parent c7986f2
commit 6acb80a
Show file tree

Hide file tree

Showing 9 changed files with 28 additions and 29 deletions.
diff --git a/.github/workflows/docker-nightly-publish.yml b/.github/workflows/docker-nightly-publish.yml
@@ -19,7 +19,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        arch: [ cpu, cpu-full, pytorch-inf2, pytorch-cu118, tensorrt-llm ]
+        arch: [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm ]
     steps:
       - name: Clean disk space
         run: |

diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -170,24 +170,24 @@ jobs:
         working-directory: benchmark
         run: ./gradlew installOnLinux
       - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-cu118 ${{ github.event.inputs.djl-version }}
+        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }}
       - name: Download models and dockers
         working-directory: tests/integration
         run: |
           docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
           mkdir logs
-          ./download_models.sh pytorch-cu118
+          ./download_models.sh pytorch-gpu
       - name: Test Python model
         working-directory: tests/integration
         run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
           serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
           ./test_client.sh image/jpg models/kitten.jpg
           docker rm -f $(docker ps -aq)
       - name: Test PyTorch model
         working-directory: tests/integration
         run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
           serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
           ./test_client.sh image/jpg models/kitten.jpg
           docker rm -f $(docker ps -aq)

diff --git a/serving/docker/Dockerfile b/serving/docker/Dockerfile
@@ -60,7 +60,7 @@ LABEL djl-version=$djl_version
 
 FROM base AS cpu-full
 
-ARG torch_version=2.0.1
+ARG torch_version=2.1.1
 
 COPY scripts scripts/
 RUN scripts/install_python.sh && \
@@ -76,5 +76,5 @@ RUN scripts/install_python.sh && \
     apt-get clean -y && rm -rf /var/lib/apt/lists/* \
 
 LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.cpu-full="true"
-LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.cpu-full="true"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.cpu-full="true"
 LABEL torch-version=$torch_version
diff --git a/serving/docker/README.md b/serving/docker/README.md
@@ -42,7 +42,7 @@ mkdir models
 cd models
 curl -O https://resources.djl.ai/test-models/pytorch/bert_qa_jit.tar.gz
 
-docker run -it --runtime=nvidia --shm-size 2g -v $PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.23.0-pytorch-cu118
+docker run -it --runtime=nvidia --shm-size 2g -v $PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.26.0-pytorch-gpu
 ```
 
 ### AWS Inferentia

diff --git a/serving/docker/docker-compose.yml b/serving/docker/docker-compose.yml
@@ -22,12 +22,12 @@ services:
       context: .
       dockerfile: deepspeed.Dockerfile
     image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}deepspeed${NIGHTLY}"
-  pytorch-cu118:
+  pytorch-gpu:
     build:
       context: .
       target: base
-      dockerfile: pytorch-cu118.Dockerfile
-    image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-cu118${NIGHTLY}"
+      dockerfile: pytorch-gpu.Dockerfile
+    image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-gpu${NIGHTLY}"
   tensorrt-llm:
     build:
       context: .

diff --git a/serving/docker/pytorch-cu118.Dockerfile b/serving/docker/pytorch-gpu.Dockerfile
@@ -9,15 +9,15 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
-ARG version=11.8.0-cudnn8-devel-ubuntu20.04
+ARG version=12.1.1-cudnn8-devel-ubuntu22.04
 
 FROM nvidia/cuda:$version as base
 
-ARG djl_version=0.24.0~SNAPSHOT
-ARG cuda_version=cu118
-ARG torch_version=2.0.1
-ARG torch_vision_version=0.15.2
-ARG python_version=3.9
+ARG djl_version=0.26.0~SNAPSHOT
+ARG cuda_version=cu121
+ARG torch_version=2.1.1
+ARG torch_vision_version=0.16.1
+ARG python_version=3.10
 
 RUN mkdir -p /opt/djl/conf && \
     mkdir -p /opt/ml/model
@@ -31,10 +31,10 @@ ENV MODEL_SERVER_HOME=/opt/djl
 ENV DJL_CACHE_DIR=/tmp/.djl.ai
 ENV HUGGINGFACE_HUB_CACHE=/tmp
 ENV TRANSFORMERS_CACHE=/tmp
-ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.9/dist-packages/torch/lib
+ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib
 ENV PYTORCH_PRECXX11=true
 ENV PYTORCH_VERSION=${torch_version}
-ENV PYTORCH_FLAVOR=cu118-precxx11
+ENV PYTORCH_FLAVOR=cu121-precxx11
 # TODO: remove TORCH_CUDNN_V8_API_DISABLED once PyTorch bug is fixed
 ENV TORCH_CUDNN_V8_API_DISABLED=1
 ENV JAVA_OPTS="-Xmx1g -Xms1g -XX:+ExitOnOutOfMemoryError -Dai.djl.default_engine=PyTorch"
@@ -53,9 +53,9 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh && \
     scripts/install_djl_serving.sh $djl_version ${torch_version} && \
     scripts/install_python.sh ${python_version} && \
     scripts/install_s5cmd.sh x64 && \
-    pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 && \
+    pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu121 && \
     scripts/patch_oss_dlc.sh python && \
-    scripts/security_patch.sh pytorch-cu118 && \
+    scripts/security_patch.sh pytorch-gpu && \
     useradd -m -d /home/djl djl && \
     chown -R djl:djl /opt/djl && \
     rm -rf scripts && pip3 cache purge && \
@@ -69,8 +69,8 @@ CMD ["serve"]
 
 LABEL maintainer="djl-dev@amazon.com"
 LABEL dlc_major_version="1"
-LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-cu118="true"
-LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.pytorch-cu118="true"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-gpu="true"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.pytorch-cu121="true"
 LABEL com.amazonaws.sagemaker.capabilities.multi-models="true"
 LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
 LABEL djl-version=$djl_version

diff --git a/serving/docker/scripts/pull_and_retag.sh b/serving/docker/scripts/pull_and_retag.sh
@@ -2,7 +2,7 @@
 
 version=$1
 repo=$2
-images="cpu aarch64 cpu-full pytorch-inf2 pytorch-cu118 deepspeed tensorrt-llm"
+images="cpu aarch64 cpu-full pytorch-inf2 pytorch-gpu deepspeed tensorrt-llm"
 
 for image in $images; do
     if [[ ! "$version" == "nightly" ]]; then

diff --git a/serving/docker/scripts/security_patch.sh b/serving/docker/scripts/security_patch.sh
@@ -4,8 +4,7 @@ IMAGE_NAME=$1
 
 apt-get update
 
-if [[ "$IMAGE_NAME" == "deepspeed" ]] || \
-   [[ "$IMAGE_NAME" == "pytorch-cu118" ]]; then
+if [[ "$IMAGE_NAME" == "deepspeed" ]]; then
   apt-get upgrade -y dpkg e2fsprogs libdpkg-perl libpcre2-8-0 libpcre3 openssl libsqlite3-0 libsepol1 libdbus-1-3 curl
 elif [[ "$IMAGE_NAME" == "cpu" ]]; then
   apt-get upgrade -y libpcre2-8-0 libdbus-1-3 curl

diff --git a/tests/integration/download_models.sh b/tests/integration/download_models.sh
@@ -2,7 +2,7 @@
 
 set -e
 
-platform=$1 # expected values are "cpu" "cpu-full" "pytorch-cu118" "pytorch-inf2" "aarch64"
+platform=$1 # expected values are "cpu" "cpu-full" "pytorch-gpu" "pytorch-inf2" "aarch64"
 
 rm -rf models
 mkdir models && cd models
@@ -38,7 +38,7 @@ download() {
 }
 
 case $platform in
-cpu | cpu-full | pytorch-cu118)
+cpu | cpu-full | pytorch-gpu)
   download "${general_platform_models_urls[@]}"
   ;;
 pytorch-inf2)
@@ -48,7 +48,7 @@ aarch64)
   download "${aarch_models_urls[@]}"
   ;;
 *)
-  echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-cu118, pytorch-inf2, aarch64"
+  echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-gpu, pytorch-inf2, aarch64"
   exit 1
   ;;
 esac