diff --git a/serving/docker/lmi.Dockerfile b/serving/docker/lmi.Dockerfile
index 38fc86bc1..feee90e5c 100644
--- a/serving/docker/lmi.Dockerfile
+++ b/serving/docker/lmi.Dockerfile
@@ -16,31 +16,8 @@ ARG djl_version
 ARG djl_serving_version
 # Base Deps
 ARG python_version=3.11
-ARG torch_version=2.5.1
-ARG torch_vision_version=0.20.1
 ARG djl_torch_version=2.4.0
 ARG onnx_version=1.19.0
-ARG pydantic_version=2.9.2
-ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl"
-# HF Deps
-ARG protobuf_version=3.20.3
-ARG transformers_version=4.45.2
-ARG accelerate_version=1.0.1
-ARG bitsandbytes_version=0.44.1
-ARG optimum_version=1.23.2
-ARG auto_gptq_version=0.7.1
-ARG datasets_version=3.0.1
-ARG autoawq_version=0.2.5
-ARG tokenizers_version=0.20.1
-# LMI-Dist Deps
-ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl"
-ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl"
-# %2B is the url escape for the '+' character
-ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl"
-ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"
-ARG peft_version=0.13.2
-
-ARG sagemaker_fast_model_loader_wheel="https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl"
 
 EXPOSE 8080
 
@@ -105,46 +82,16 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq libaio-
     && pip3 cache purge \
     && apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --index-url https://download.pytorch.org/whl/cu124
-RUN pip3 install \
-    ${seq_scheduler_wheel} \
-    peft==${peft_version} \
-    protobuf==${protobuf_version} \
-    transformers==${transformers_version} \
-    hf-transfer \
-    zstandard \
-    datasets==${datasets_version} \
-    mpi4py \
-    sentencepiece \
-    tiktoken \
-    blobfile \
-    einops \
-    accelerate==${accelerate_version} \
-    bitsandbytes==${bitsandbytes_version} \
-    auto-gptq==${auto_gptq_version} \
-    pandas \
-    pyarrow \
-    jinja2 \
-    retrying \
-    opencv-contrib-python-headless \
-    safetensors \
-    scipy \
-    onnx \
-    sentence_transformers \
-    onnxruntime \
-    autoawq==${autoawq_version} \
-    tokenizers==${tokenizers_version} \
-    pydantic==${pydantic_version} \
-    ${djl_converter_wheel} \
-    optimum==${optimum_version} \
-    ${flash_infer_wheel} \
-    ${vllm_wheel} \
-    ${lmi_dist_wheel} \
-    torch==${torch_version} \
-    torchvision==${torch_vision_version} \
-    ${sagemaker_fast_model_loader_wheel} \
-    && git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 \
-    && pip3 cache purge
+COPY requirements-lmi.txt ./requirements.txt
+RUN pip3 install -r requirements.txt
+# TODO: Migrate to llmcompressor, this repo is deprecated
+RUN git clone https://github.com/neuralmagic/AutoFP8.git && \
+    cd AutoFP8 && \
+    git reset --hard 4b2092c && \
+    pip3 install . --no-deps && \
+    cd .. && \
+    rm -rf AutoFP8 && \
+    pip3 cache purge
 
 # Add CUDA-Compat
 RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*
diff --git a/serving/docker/requirements-lmi.txt b/serving/docker/requirements-lmi.txt
new file mode 100644
index 000000000..20309ab5a
--- /dev/null
+++ b/serving/docker/requirements-lmi.txt
@@ -0,0 +1,42 @@
+torch==2.5.1
+torchvision==0.20.1
+# sequence scheduler for hf accelerate rolling batch
+https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
+peft==0.13.2
+protobuf==3.20.3
+transformers==4.45.2
+hf-transfer
+zstandard
+datasets==3.0.1
+mpi4py
+sentencepiece
+tiktoken
+blobfile
+einops
+accelerate==1.0.1
+bitsandbytes==0.44.1
+auto-gptq==0.7.1
+pandas
+pyarrow
+jinja2
+retrying
+opencv-contrib-python-headless
+safetensors
+scipy
+onnx
+sentence_transformers
+onnxruntime
+autoawq==0.2.5
+tokenizers==0.20.1
+pydantic==2.9.2
+# djl converter wheel for converting hf models to onnx/rust
+https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl
+optimum==1.23.2
+# flashinfer for vllm
+https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
+# vllm wheel using PT 2.5.1
+https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl
+# lmi-dist wheel - need to change this to the one built by ci once ready
+https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl
+# sagemaker fast model loader wheel
+https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl
diff --git a/serving/docker/requirements-neuron.txt b/serving/docker/requirements-neuron.txt
new file mode 100644
index 000000000..7e056c5cf
--- /dev/null
+++ b/serving/docker/requirements-neuron.txt
@@ -0,0 +1,24 @@
+--extra-index-url https://pip.repos.neuron.amazonaws.com
+accelerate==0.29.2
+safetensors
+torchvision==0.16.2
+neuronx-cc==2.15.128.0
+torch-neuronx==2.1.2.2.3.0
+transformers-neuronx==0.12.313
+torch_xla==2.1.4
+neuronx-cc-stubs==2.15.128.0
+huggingface-hub==0.25.2
+neuronx_distributed==0.9.0
+protobuf
+sentencepiece
+jinja2
+diffusers==0.28.2
+opencv-contrib-python-headless
+pillow
+pydantic==2.6.1
+optimum
+optimum-neuron==0.0.24
+tiktoken
+blobfile
+transformers==4.45.2
+https://publish.djl.ai/neuron_vllm/vllm-0.6.2%2Bnightly-py3-none-any.whl
\ No newline at end of file
diff --git a/serving/docker/requirements-trt.txt b/serving/docker/requirements-trt.txt
new file mode 100644
index 000000000..a4a0e761b
--- /dev/null
+++ b/serving/docker/requirements-trt.txt
@@ -0,0 +1,38 @@
+torch==2.4.0
+transformers==4.42.4
+accelerate==0.32.1
+peft==0.13.2
+sentencepiece
+mpi4py
+cuda-python==12.5
+onnx
+polygraphy
+pynvml==11.5.0
+datasets==2.19.1
+pydantic==2.6.1
+scipy
+torchprofile
+bitsandbytes
+ninja
+transformers_stream_generator
+einops
+tiktoken
+jinja2
+graphviz
+blobfile
+colored
+h5py
+strenum
+pulp
+flax
+easydict
+tensorrt==10.3.0
+janus==1.0.0
+nvidia-modelopt==0.15.0
+numpy==1.26.4
+# TRTLLM toolkit wheel
+https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-0.12.0%2Bnightly-py3-none-any.whl
+# TRTLLM wheel - contains necessary patch fix not available in OSS
+https://publish.djl.ai/tensorrt-llm/v0.12.0/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl
+# Triton toolkit wheel - pybindings for trtllm
+https://publish.djl.ai/tritonserver/r24.04/tritontoolkit-24.4-py310-none-any.whl
\ No newline at end of file
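The three requirements files above pin exact versions and direct wheel URLs, so they can be sanity-checked before an image build. A minimal sketch, assuming a Linux x86_64 host with Python 3.11 for the LMI file; the download directory is illustrative:

    # resolve the pins without installing, to confirm every version and wheel URL is reachable
    python3.11 -m pip install --dry-run -r serving/docker/requirements-lmi.txt
    # or fetch the exact artifacts for inspection, skipping transitive dependencies
    python3.11 -m pip download --no-deps -r serving/docker/requirements-lmi.txt -d /tmp/lmi-wheels

The same kind of check applies to requirements-neuron.txt and requirements-trt.txt with their respective Python versions; source-only packages such as mpi4py are fetched as sdists rather than wheels.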
diff --git a/serving/docker/tensorrt-llm.Dockerfile b/serving/docker/tensorrt-llm.Dockerfile
index ccdac202b..e312f3390 100644
--- a/serving/docker/tensorrt-llm.Dockerfile
+++ b/serving/docker/tensorrt-llm.Dockerfile
@@ -13,27 +13,8 @@ ARG version=12.5.1-devel-ubuntu22.04
 FROM nvidia/cuda:$version
 ARG cuda_version=cu125
 ARG python_version=3.10
-ARG TORCH_VERSION=2.4.0
 ARG djl_version
 ARG djl_serving_version
-ARG transformers_version=4.44.2
-ARG accelerate_version=0.32.1
-ARG tensorrtlibs_version=10.1.0
-# %2B is the url escape for the '+' character
-ARG trtllm_toolkit_version=0.12.0%2Bnightly
-ARG trtllm_version=v0.12.0
-ARG cuda_python_version=12.5
-ARG peft_version=0.10.0
-ARG triton_version=r24.04
-ARG trtllm_toolkit_wheel="https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-${trtllm_toolkit_version}-py3-none-any.whl"
-ARG trtllm_wheel="https://publish.djl.ai/tensorrt-llm/${trtllm_version}/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl"
-ARG triton_toolkit_wheel="https://publish.djl.ai/tritonserver/${triton_version}/tritontoolkit-24.4-py310-none-any.whl"
-ARG pydantic_version=2.6.1
-ARG modelopt_version=0.15.0
-ARG janus_version=1.0.0
-ARG pynvml_verison=11.5.0
-ARG numpy_version=1.26.4
-ARG datasets_version=2.19.1
 
 EXPOSE 8080
 
@@ -68,6 +49,9 @@ COPY partition /opt/djl/partition
 COPY distribution[s]/ ./
 RUN mv *.deb djl-serving_all.deb || true
 
+# Add CUDA-Compat
+RUN apt-get update && apt-get install -y cuda-compat-12-5 && apt-get clean -y && rm -rf /var/lib/apt/lists/*
+
 # Install OpenMPI and other deps
 ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y g++ wget unzip openmpi-bin libopenmpi-dev libffi-dev git-lfs rapidjson-dev graphviz && \
@@ -75,26 +59,20 @@ RUN apt-get update && apt-get install -y g++ wget unzip openmpi-bin libopenmpi-d
     pip3 cache purge && \
     apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
-# Install PyTorch
-# Qwen needs transformers_stream_generator, tiktoken and einops
-RUN pip install torch==${TORCH_VERSION} transformers==${transformers_version} accelerate==${accelerate_version} peft==${peft_version} sentencepiece \
-    mpi4py cuda-python==${cuda_python_version} onnx polygraphy pynvml==${pynvml_verison} datasets==${datasets_version} pydantic==${pydantic_version} scipy torchprofile bitsandbytes ninja \
-    transformers_stream_generator einops tiktoken jinja2 graphviz blobfile colored h5py strenum pulp flax easydict && \
-    pip3 cache purge
-
-# Install TensorRT and TRT-LLM Deps
-RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt==${tensorrtlibs_version} janus==${janus_version} nvidia-modelopt==${modelopt_version} && \
-    pip install --no-deps ${trtllm_wheel} && \
+# Install Python dependencies
+COPY requirements-trt.txt ./requirements.txt
+RUN pip3 install -r requirements.txt && \
     pyver=$(echo $python_version | awk -F. '{print $1$2}') && \
     pip3 cache purge
+# TRT depends on transformers<=4.42.4, but we need a higher version for llama 3.1
+RUN pip3 install transformers==4.44.2 --no-deps
 
 # download dependencies
-RUN pip install ${triton_toolkit_wheel} ${trtllm_toolkit_wheel} && \
-    mkdir -p /opt/tritonserver/lib && mkdir -p /opt/tritonserver/backends/tensorrtllm && \
-    curl -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/${triton_version}/libtritonserver.so && \
-    curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm.so && \
-    curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm_common.so && \
-    curl -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libnvinfer_plugin_tensorrt_llm.so.10 && \
+RUN mkdir -p /opt/tritonserver/lib && mkdir -p /opt/tritonserver/backends/tensorrtllm && \
+    curl -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/r24.04/libtritonserver.so && \
+    curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm.so && \
+    curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm_common.so && \
+    curl -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/v0.12.0/libnvinfer_plugin_tensorrt_llm.so.10 && \
     pip3 cache purge && \
     apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
@@ -108,13 +86,8 @@ RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version && \
     useradd -m -d /home/djl djl && \
     chown -R djl:djl /opt/djl && \
     rm -rf scripts && \
-    pip3 install numpy==${numpy_version} && \
-    pip3 cache purge && \
     apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
-# Add CUDA-Compat
-RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*
-
 LABEL maintainer="djl-dev@amazon.com"
 LABEL dlc_major_version="1"
 LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.tensorrtllm="true"
@@ -123,7 +96,7 @@ LABEL com.amazonaws.sagemaker.capabilities.multi-models="true"
 LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
 LABEL djl-version=$djl_version
 LABEL djl-serving-version=$djl_serving_version
-LABEL trtllm-version=$trtllm_version
+LABEL trtllm-version=v0.12.0
 LABEL cuda-version=$cuda_version
 # To use the 535 CUDA driver
 LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.2
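Both Dockerfiles now COPY their requirements file from the build context, so local builds need serving/docker as the context for the new files to be visible. A minimal sketch of a local build, assuming serving/docker as the working directory; the tag and the djl_version/djl_serving_version values are placeholders, not the values used by CI:

    # hypothetical local build of the LMI image against the new requirements-lmi.txt
    docker build -f lmi.Dockerfile \
        --build-arg djl_version=0.31.0 \
        --build-arg djl_serving_version=0.31.0 \
        -t djl-serving:lmi-dev .

The TensorRT-LLM image builds the same way with -f tensorrt-llm.Dockerfile, which picks up requirements-trt.txt from the same context.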