Skip to content

Commit

Permalink
[Docker] upgrade cuda 12.1 support for DJLServing (#1370)
Browse files (browse the repository at this point in the history)
  • Loading branch information
Qing Lan authored Dec 9, 2023
1 parent c7986f2 commit 6acb80a
Show file tree
Hide file tree
Showing 9 changed files with 28 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-nightly-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
arch: [ cpu, cpu-full, pytorch-inf2, pytorch-cu118, tensorrt-llm ]
arch: [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm ]
steps:
- name: Clean disk space
run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,24 +170,24 @@ jobs:
working-directory: benchmark
run: ./gradlew installOnLinux
- name: Build container name
run: ./serving/docker/scripts/docker_name_builder.sh pytorch-cu118 ${{ github.event.inputs.djl-version }}
run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }}
- name: Download models and dockers
working-directory: tests/integration
run: |
docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
mkdir logs
./download_models.sh pytorch-cu118
./download_models.sh pytorch-gpu
- name: Test Python model
working-directory: tests/integration
run: |
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
./test_client.sh image/jpg models/kitten.jpg
docker rm -f $(docker ps -aq)
- name: Test PyTorch model
working-directory: tests/integration
run: |
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-cu118 \
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
./test_client.sh image/jpg models/kitten.jpg
docker rm -f $(docker ps -aq)
Expand Down
4 changes: 2 additions & 2 deletions serving/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ LABEL djl-version=$djl_version

FROM base AS cpu-full

ARG torch_version=2.0.1
ARG torch_version=2.1.1

COPY scripts scripts/
RUN scripts/install_python.sh && \
Expand All @@ -76,5 +76,5 @@ RUN scripts/install_python.sh && \
apt-get clean -y && rm -rf /var/lib/apt/lists/* \

LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.cpu-full="true"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.cpu-full="true"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.cpu-full="true"
LABEL torch-version=$torch_version
2 changes: 1 addition & 1 deletion serving/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ mkdir models
cd models
curl -O https://resources.djl.ai/test-models/pytorch/bert_qa_jit.tar.gz

docker run -it --runtime=nvidia --shm-size 2g -v $PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.23.0-pytorch-cu118
docker run -it --runtime=nvidia --shm-size 2g -v $PWD:/opt/ml/model -p 8080:8080 deepjavalibrary/djl-serving:0.26.0-pytorch-gpu
```

### AWS Inferentia
Expand Down
6 changes: 3 additions & 3 deletions serving/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ services:
context: .
dockerfile: deepspeed.Dockerfile
image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}deepspeed${NIGHTLY}"
pytorch-cu118:
pytorch-gpu:
build:
context: .
target: base
dockerfile: pytorch-cu118.Dockerfile
image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-cu118${NIGHTLY}"
dockerfile: pytorch-gpu.Dockerfile
image: "deepjavalibrary/djl-serving:${RELEASE_VERSION}pytorch-gpu${NIGHTLY}"
tensorrt-llm:
build:
context: .
Expand Down
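serving/docker/pytorch-cu118.Dockerfile → serving/docker/pytorch-gpu.Dockerfile (file header missing from this capture; name inferred from the docker-compose.yml change above)
Original file line number Diff line number Diff line change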
Expand Up @@ -9,15 +9,15 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
ARG version=11.8.0-cudnn8-devel-ubuntu20.04
ARG version=12.1.1-cudnn8-devel-ubuntu22.04

FROM nvidia/cuda:$version as base

ARG djl_version=0.24.0~SNAPSHOT
ARG cuda_version=cu118
ARG torch_version=2.0.1
ARG torch_vision_version=0.15.2
ARG python_version=3.9
ARG djl_version=0.26.0~SNAPSHOT
ARG cuda_version=cu121
ARG torch_version=2.1.1
ARG torch_vision_version=0.16.1
ARG python_version=3.10

RUN mkdir -p /opt/djl/conf && \
mkdir -p /opt/ml/model
Expand All @@ -31,10 +31,10 @@ ENV MODEL_SERVER_HOME=/opt/djl
ENV DJL_CACHE_DIR=/tmp/.djl.ai
ENV HUGGINGFACE_HUB_CACHE=/tmp
ENV TRANSFORMERS_CACHE=/tmp
ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.9/dist-packages/torch/lib
ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib
ENV PYTORCH_PRECXX11=true
ENV PYTORCH_VERSION=${torch_version}
ENV PYTORCH_FLAVOR=cu118-precxx11
ENV PYTORCH_FLAVOR=cu121-precxx11
# TODO: remove TORCH_CUDNN_V8_API_DISABLED once PyTorch bug is fixed
ENV TORCH_CUDNN_V8_API_DISABLED=1
ENV JAVA_OPTS="-Xmx1g -Xms1g -XX:+ExitOnOutOfMemoryError -Dai.djl.default_engine=PyTorch"
Expand All @@ -53,9 +53,9 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh && \
scripts/install_djl_serving.sh $djl_version ${torch_version} && \
scripts/install_python.sh ${python_version} && \
scripts/install_s5cmd.sh x64 && \
pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 && \
pip3 install numpy && pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu121 && \
scripts/patch_oss_dlc.sh python && \
scripts/security_patch.sh pytorch-cu118 && \
scripts/security_patch.sh pytorch-gpu && \
useradd -m -d /home/djl djl && \
chown -R djl:djl /opt/djl && \
rm -rf scripts && pip3 cache purge && \
Expand All @@ -69,8 +69,8 @@ CMD ["serve"]

LABEL maintainer="[email protected]"
LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-cu118="true"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-25-0.pytorch-cu118="true"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.pytorch-gpu="true"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.v0-26-0.pytorch-cu121="true"
LABEL com.amazonaws.sagemaker.capabilities.multi-models="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
LABEL djl-version=$djl_version
Expand Down
2 changes: 1 addition & 1 deletion serving/docker/scripts/pull_and_retag.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

version=$1
repo=$2
images="cpu aarch64 cpu-full pytorch-inf2 pytorch-cu118 deepspeed tensorrt-llm"
images="cpu aarch64 cpu-full pytorch-inf2 pytorch-gpu deepspeed tensorrt-llm"

for image in $images; do
if [[ ! "$version" == "nightly" ]]; then
Expand Down
3 changes: 1 addition & 2 deletions serving/docker/scripts/security_patch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ IMAGE_NAME=$1

apt-get update

if [[ "$IMAGE_NAME" == "deepspeed" ]] || \
[[ "$IMAGE_NAME" == "pytorch-cu118" ]]; then
if [[ "$IMAGE_NAME" == "deepspeed" ]]; then
apt-get upgrade -y dpkg e2fsprogs libdpkg-perl libpcre2-8-0 libpcre3 openssl libsqlite3-0 libsepol1 libdbus-1-3 curl
elif [[ "$IMAGE_NAME" == "cpu" ]]; then
apt-get upgrade -y libpcre2-8-0 libdbus-1-3 curl
Expand Down
6 changes: 3 additions & 3 deletions tests/integration/download_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -e

platform=$1 # expected values are "cpu" "cpu-full" "pytorch-cu118" "pytorch-inf2" "aarch64"
platform=$1 # expected values are "cpu" "cpu-full" "pytorch-gpu" "pytorch-inf2" "aarch64"

rm -rf models
mkdir models && cd models
Expand Down Expand Up @@ -38,7 +38,7 @@ download() {
}

case $platform in
cpu | cpu-full | pytorch-cu118)
cpu | cpu-full | pytorch-gpu)
download "${general_platform_models_urls[@]}"
;;
pytorch-inf2)
Expand All @@ -48,7 +48,7 @@ aarch64)
download "${aarch_models_urls[@]}"
;;
*)
echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-cu118, pytorch-inf2, aarch64"
echo "Bad argument. Expecting one of the values: cpu, cpu-full, pytorch-gpu, pytorch-inf2, aarch64"
exit 1
;;
esac

0 comments on commit 6acb80a

Please sign in to comment.