forked from runpod-workers/worker-vllm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile
50 lines (40 loc) · 1.63 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Base image: NVIDIA CUDA 12.1.0 "base" flavour on Ubuntu 22.04.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04

# Install pip for the system Python 3. `update` and `install` share one layer, and the apt
# package lists are removed in that same layer so they do not add to the image size.
# NOTE(review): recommended packages are deliberately still installed; confirm that no later
# pip install depends on them before adding --no-install-recommends.
RUN apt-get update -y \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Run ldconfig over the CUDA 12.1 compat directory so the shared libraries found there are
# added to the dynamic linker cache.
RUN ldconfig /usr/local/cuda-12.1/compat/
# Python dependencies of the worker.
# Only the requirements file is copied at this point, so this layer is rebuilt when
# builder/requirements.txt changes. pip's download cache lives in a BuildKit cache mount and is
# therefore reused between builds without being stored in the image.
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -U pip \
    && python3 -m pip install -U -r /requirements.txt
# Install vLLM and FlashInfer with pip.
# vLLM is installed from a pinned release rather than built from a fork: the issues that
# required the fork are fixed, and image-size optimisation matters less now that pip downloads
# are cached. The pip cache is a BuildKit cache mount (same target as the requirements step),
# so downloaded wheels are reused across builds and are not written into the image layer.
# NOTE(review): flashinfer is unpinned and is pulled from the cu121/torch2.3 wheel index;
# confirm this matches the torch version that vllm==0.6.4 installs.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install vllm==0.6.4 && \
    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
# Option 2: build the image with the model included.
# Every build argument below is optional and is supplied with --build-arg; all default to an
# empty string except BASE_PATH.
ARG BASE_PATH="/runpod-volume"
ARG MODEL_NAME=""
ARG MODEL_REVISION=""
ARG TOKENIZER_NAME=""
ARG TOKENIZER_REVISION=""
ARG QUANTIZATION=""

# Persist the build arguments as environment variables so they are visible both to later build
# steps and to the running container.
ENV BASE_PATH=${BASE_PATH} \
    MODEL_NAME=${MODEL_NAME} \
    MODEL_REVISION=${MODEL_REVISION} \
    TOKENIZER_NAME=${TOKENIZER_NAME} \
    TOKENIZER_REVISION=${TOKENIZER_REVISION} \
    QUANTIZATION=${QUANTIZATION}

# Hugging Face cache locations, all placed under BASE_PATH.
# HF_HOME and HUGGINGFACE_HUB_CACHE intentionally point at the same directory.
ENV HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
    HF_HUB_ENABLE_HF_TRANSFER=1

# Add / and /vllm-workspace to Python's module search path.
ENV PYTHONPATH="/:/vllm-workspace"
# Copy the worker source tree into the image.
COPY src /src

# Optionally download the model at build time.
# The Hugging Face token is supplied as an optional BuildKit secret (id=HF_TOKEN); when the
# secret file is present its content is exported as HF_TOKEN for this step only, so it is not
# stored in an image layer. The value is assigned with quotes and exported separately so the
# command substitution is never subject to word splitting by /bin/sh.
# The download script runs only when MODEL_NAME is non-empty.
RUN --mount=type=secret,id=HF_TOKEN,required=false \
    if [ -f /run/secrets/HF_TOKEN ]; then \
        HF_TOKEN="$(cat /run/secrets/HF_TOKEN)"; \
        export HF_TOKEN; \
    fi && \
    if [ -n "$MODEL_NAME" ]; then \
        python3 /src/download_model.py; \
    fi
# Start the handler.
# Default container command in exec (JSON-array) form: python3 runs /src/handler.py directly,
# without a wrapping shell.
CMD ["python3", "/src/handler.py"]