refactor lmi and trt dockerfiles to install python requirements from requirements file
siddvenk committed Nov 12, 2024
1 parent 8481f56 commit ad4f874
Showing 5 changed files with 128 additions and 104 deletions.
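The net effect of the refactor: pinned dependency versions move out of Dockerfile ARGs and long inline pip install commands into per-image requirements files that are copied in at build time. A minimal sketch of the pattern the commit adopts (file name taken from the lmi diff below; the inline details are illustrative, not the exact removed lines):

# Before: every pin is a build arg feeding one long inline install
#   ARG transformers_version=4.45.2
#   RUN pip3 install transformers==${transformers_version} ...
# After: pins live in a requirements file copied into the image
COPY requirements-lmi.txt ./requirements.txt
RUN pip3 install -r requirements.txt && pip3 cache purge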
73 changes: 10 additions & 63 deletions serving/docker/lmi.Dockerfile
@@ -16,31 +16,8 @@ ARG djl_version
ARG djl_serving_version
# Base Deps
ARG python_version=3.11
ARG torch_version=2.5.1
ARG torch_vision_version=0.20.1
ARG djl_torch_version=2.4.0
ARG onnx_version=1.19.0
ARG pydantic_version=2.9.2
ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl"
# HF Deps
ARG protobuf_version=3.20.3
ARG transformers_version=4.45.2
ARG accelerate_version=1.0.1
ARG bitsandbytes_version=0.44.1
ARG optimum_version=1.23.2
ARG auto_gptq_version=0.7.1
ARG datasets_version=3.0.1
ARG autoawq_version=0.2.5
ARG tokenizers_version=0.20.1
# LMI-Dist Deps
ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl"
ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl"
# %2B is the url escape for the '+' character
ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl"
ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"
ARG peft_version=0.13.2

ARG sagemaker_fast_model_loader_wheel="https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl"

EXPOSE 8080

@@ -105,46 +82,16 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq libaio-
&& pip3 cache purge \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --index-url https://download.pytorch.org/whl/cu124
RUN pip3 install \
${seq_scheduler_wheel} \
peft==${peft_version} \
protobuf==${protobuf_version} \
transformers==${transformers_version} \
hf-transfer \
zstandard \
datasets==${datasets_version} \
mpi4py \
sentencepiece \
tiktoken \
blobfile \
einops \
accelerate==${accelerate_version} \
bitsandbytes==${bitsandbytes_version} \
auto-gptq==${auto_gptq_version} \
pandas \
pyarrow \
jinja2 \
retrying \
opencv-contrib-python-headless \
safetensors \
scipy \
onnx \
sentence_transformers \
onnxruntime \
autoawq==${autoawq_version} \
tokenizers==${tokenizers_version} \
pydantic==${pydantic_version} \
${djl_converter_wheel} \
optimum==${optimum_version} \
${flash_infer_wheel} \
${vllm_wheel} \
${lmi_dist_wheel} \
torch==${torch_version} \
torchvision==${torch_vision_version} \
${sagemaker_fast_model_loader_wheel} \
&& git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 \
&& pip3 cache purge
COPY requirements-lmi.txt ./requirements.txt
RUN pip3 install -r requirements.txt
# TODO: Migrate to llmcompressor, this repo is deprecated
RUN git clone https://github.com/neuralmagic/AutoFP8.git && \
cd AutoFP8 && \
git reset --hard 4b2092c && \
pip3 install . --no-deps && \
cd .. && \
rm -rf AutoFP8 && \
pip3 cache purge

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*
42 changes: 42 additions & 0 deletions serving/docker/requirements-lmi.txt
@@ -0,0 +1,42 @@
torch==2.5.1
torchvision==0.20.1
# sequence scheduler for hf accelerate rolling batch
https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
peft==0.13.2
protobuf==3.20.3
transformers==4.45.2
hf-transfer
zstandard
datasets==3.0.1
mpi4py
sentencepiece
tiktoken
blobfile
einops
accelerate==1.0.1
bitsandbytes==0.44.1
auto-gptq==0.7.1
pandas
pyarrow
jinja2
retrying
opencv-contrib-python-headless
safetensors
scipy
onnx
sentence_transformers
onnxruntime
autoawq==0.2.5
tokenizers==0.20.1
pydantic==2.9.2
# djl converter wheel for converting hf models to onnx/rust
https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl
optimum==1.23.2
# flashinfer for vllm
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
# vllm wheel using PT 2.5.1
https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl
# lmi-dist wheel - need to change this to the one built by ci once ready
https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl
# sagemaker fast model loader wheel
https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl
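
pip requirements files accept direct wheel URLs alongside name==version pins, which is how the prebuilt vllm, flashinfer, lmi-dist, and fast-model-loader wheels above are pinned; the %2B in those URLs is the URL escape for the '+' that separates a version from its local build tag. For example, the entry

https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl

resolves to vllm 0.6.3.post1+cu124.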
24 changes: 24 additions & 0 deletions serving/docker/requirements-neuron.txt
@@ -0,0 +1,24 @@
--extra-index-url https://pip.repos.neuron.amazonaws.com
accelerate==0.29.2
safetensors
torchvision==0.16.2
neuronx-cc==2.15.128.0
torch-neuronx==2.1.2.2.3.0
transformers-neuronx==0.12.313
torch_xla==2.1.4
neuronx-cc-stubs==2.15.128.0
huggingface-hub==0.25.2
neuronx_distributed==0.9.0
protobuf
sentencepiece
jinja2
diffusers==0.28.2
opencv-contrib-python-headless
pillow
pydantic==2.6.1
optimum
optimum-neuron==0.0.24
tiktoken
blobfile
transformers==4.45.2
https://publish.djl.ai/neuron_vllm/vllm-0.6.2%2Bnightly-py3-none-any.whl
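
Note that pip options can be embedded at the top of a requirements file: the --extra-index-url line above lets packages that only exist in the AWS Neuron repository (neuronx-cc, torch-neuronx, transformers-neuronx, and friends) be found, while everything else still resolves from PyPI. A minimal illustration of the same pattern:

--extra-index-url https://pip.repos.neuron.amazonaws.com
neuronx-cc==2.15.128.0
safetensors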
38 changes: 38 additions & 0 deletions serving/docker/requirements-trt.txt
@@ -0,0 +1,38 @@
torch==2.4.0
transformers==4.42.4
accelerate==0.32.1
peft==0.13.2
sentencepiece
mpi4py
cuda-python==12.5
onnx
polygraphy
pynvml==11.5.0
datasets==2.19.1
pydantic==2.6.1
scipy
torchprofile
bitsandbytes
ninja
transformers_stream_generator
einops
tiktoken
jinja2
graphviz
blobfile
colored
h5py
strenum
pulp
flax
easydict
tensorrt==10.3.0
janus==1.0.0
nvidia-modelopt==0.15.0
numpy==1.26.4
# TRTLLM toolkit wheel
https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-0.12.0%2Bnightly-py3-none-any.whl
# TRTLLM wheel - contains necessary patch fix not available in OSS
https://publish.djl.ai/tensorrt-llm/v0.12.0/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl
# Triton toolkit wheel - pybindings for trtllm
https://publish.djl.ai/tritonserver/r24.04/tritontoolkit-24.4-py310-none-any.whl
55 changes: 14 additions & 41 deletions serving/docker/tensorrt-llm.Dockerfile
@@ -13,27 +13,8 @@ ARG version=12.5.1-devel-ubuntu22.04
FROM nvidia/cuda:$version
ARG cuda_version=cu125
ARG python_version=3.10
ARG TORCH_VERSION=2.4.0
ARG djl_version
ARG djl_serving_version
ARG transformers_version=4.44.2
ARG accelerate_version=0.32.1
ARG tensorrtlibs_version=10.1.0
# %2B is the url escape for the '+' character
ARG trtllm_toolkit_version=0.12.0%2Bnightly
ARG trtllm_version=v0.12.0
ARG cuda_python_version=12.5
ARG peft_version=0.10.0
ARG triton_version=r24.04
ARG trtllm_toolkit_wheel="https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-${trtllm_toolkit_version}-py3-none-any.whl"
ARG trtllm_wheel="https://publish.djl.ai/tensorrt-llm/${trtllm_version}/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl"
ARG triton_toolkit_wheel="https://publish.djl.ai/tritonserver/${triton_version}/tritontoolkit-24.4-py310-none-any.whl"
ARG pydantic_version=2.6.1
ARG modelopt_version=0.15.0
ARG janus_version=1.0.0
ARG pynvml_verison=11.5.0
ARG numpy_version=1.26.4
ARG datasets_version=2.19.1

EXPOSE 8080

@@ -68,33 +49,30 @@ COPY partition /opt/djl/partition
COPY distribution[s]/ ./
RUN mv *.deb djl-serving_all.deb || true

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-5 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install OpenMPI and other deps
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y g++ wget unzip openmpi-bin libopenmpi-dev libffi-dev git-lfs rapidjson-dev graphviz && \
scripts/install_python.sh ${python_version} && \
pip3 cache purge && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install PyTorch
# Qwen needs transformers_stream_generator, tiktoken and einops
RUN pip install torch==${TORCH_VERSION} transformers==${transformers_version} accelerate==${accelerate_version} peft==${peft_version} sentencepiece \
mpi4py cuda-python==${cuda_python_version} onnx polygraphy pynvml==${pynvml_verison} datasets==${datasets_version} pydantic==${pydantic_version} scipy torchprofile bitsandbytes ninja \
transformers_stream_generator einops tiktoken jinja2 graphviz blobfile colored h5py strenum pulp flax easydict && \
pip3 cache purge

# Install TensorRT and TRT-LLM Deps
RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt==${tensorrtlibs_version} janus==${janus_version} nvidia-modelopt==${modelopt_version} && \
pip install --no-deps ${trtllm_wheel} && \
# Install Python dependencies
COPY requirements-trt.txt ./requirements.txt
RUN pip3 install -r requirements.txt && \
pyver=$(echo $python_version | awk -F. '{print $1$2}') && \
pip3 cache purge
# TRT depends on transformers<=4.42.4, but we need a higher version for llama 3.1
RUN pip3 install transformers==4.44.2 --no-deps

# download dependencies
RUN pip install ${triton_toolkit_wheel} ${trtllm_toolkit_wheel} && \
mkdir -p /opt/tritonserver/lib && mkdir -p /opt/tritonserver/backends/tensorrtllm && \
curl -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/${triton_version}/libtritonserver.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm_common.so && \
curl -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libnvinfer_plugin_tensorrt_llm.so.10 && \
RUN mkdir -p /opt/tritonserver/lib && mkdir -p /opt/tritonserver/backends/tensorrtllm && \
curl -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/r24.04/libtritonserver.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm_common.so && \
curl -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/v0.12.0/libnvinfer_plugin_tensorrt_llm.so.10 && \
pip3 cache purge && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

@@ -108,13 +86,8 @@ RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version && \
useradd -m -d /home/djl djl && \
chown -R djl:djl /opt/djl && \
rm -rf scripts && \
pip3 install numpy==${numpy_version} && \
pip3 cache purge && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

LABEL maintainer="[email protected]"
LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.tensorrtllm="true"
@@ -123,7 +96,7 @@ LABEL com.amazonaws.sagemaker.capabilities.multi-models="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
LABEL djl-version=$djl_version
LABEL djl-serving-version=$djl_serving_version
LABEL trtllm-version=$trtllm_version
LABEL trtllm-version=v0.12.0
LABEL cuda-version=$cuda_version
# To use the 535 CUDA driver
LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.2
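
As the comment in the diff above notes, tensorrt-llm pins transformers<=4.42.4 while Llama 3.1 support needs a newer release, so the image force-installs transformers 4.44.2 with --no-deps after the requirements pass. A sketch of that override pattern (versions as in the diff; pip will no longer enforce the conflicting constraint, and a subsequent pip check is expected to flag it):

# install the stack with its declared pins first
RUN pip3 install -r requirements.txt
# then override one package without pulling in or re-resolving its dependencies
RUN pip3 install transformers==4.44.2 --no-deps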
