vllm improvements (#524)
gongy authored Dec 15, 2023
1 parent b383dbb · commit 12d6832
Showing 3 changed files with 17 additions and 25 deletions.
06_gpu_and_ml/vllm_inference.py (19 changes: 7 additions & 12 deletions)

@@ -42,6 +42,7 @@
 # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
 def download_model_to_folder():
     from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache

     os.makedirs(MODEL_DIR, exist_ok=True)

@@ -50,25 +51,19 @@ def download_model_to_folder():
         local_dir=MODEL_DIR,
         token=os.environ["HUGGINGFACE_TOKEN"],
     )
+    move_cache()


 # ### Image definition
-# We’ll start from a Dockerhub image recommended by `vLLM`, upgrade the older
-# version of `torch` (from 1.14) to a new one specifically built for CUDA 11.8.
-# Next, we install `vLLM` from source to get the latest updates. Finally, we’ll
-# use run_function to run the function defined above to ensure the weights of
+# We’ll start from a recommended Dockerhub image and install `vLLM`.
+# Then we’ll use run_function to run the function defined above to ensure the weights of
 # the model are saved within the container image.
 image = (
-    Image.from_registry("nvcr.io/nvidia/pytorch:22.12-py3")
-    .pip_install(
-        "torch==2.0.1+cu118", index_url="https://download.pytorch.org/whl/cu118"
-    )
-    # Pinned to 10/16/23
-    .pip_install(
-        "vllm @ git+https://github.com/vllm-project/vllm.git@651c614aa43e497a2e2aab473493ba295201ab20"
+    Image.from_registry(
+        "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
+    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
-    .pip_install("hf-transfer~=0.1")
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(
         download_model_to_folder,
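Taken together, the new version of this file follows the pattern below. This is a condensed sketch rather than the file verbatim: the model id, `MODEL_DIR` value, and secret name are illustrative, and the `secret=` keyword reflects Modal's late-2023 API (`Stub`, `Image.from_registry`, `Secret`).

    import os

    from modal import Image, Secret, Stub

    MODEL_DIR = "/model"  # illustrative path

    def download_model_to_folder():
        from huggingface_hub import snapshot_download
        from transformers.utils import move_cache

        os.makedirs(MODEL_DIR, exist_ok=True)
        snapshot_download(
            "mistralai/Mistral-7B-Instruct-v0.1",  # illustrative model id
            local_dir=MODEL_DIR,
            token=os.environ["HUGGINGFACE_TOKEN"],
        )
        move_cache()  # migrate any legacy HF cache entries

    image = (
        # Plain CUDA base image; add_python bakes a Python 3.10 runtime into it.
        Image.from_registry("nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10")
        # One pinned install replaces the old torch upgrade plus source build of vLLM.
        .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
        .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # fast downloads, no progress bar
        # Run the download at build time so the weights are baked into the image.
        .run_function(download_model_to_folder, secret=Secret.from_name("huggingface"))
    )

    stub = Stub("example-vllm-inference", image=image)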
06_gpu_and_ml/vllm_mixtral.py (20 changes: 8 additions & 12 deletions)

@@ -46,7 +46,7 @@ def download_model_to_folder():
     snapshot_download(
         BASE_MODEL,
         local_dir=MODEL_DIR,
-        ignore_patterns="*.safetensors",  # vLLM doesn't support Mixtral safetensors anyway.
+        ignore_patterns="*.pt",  # Using safetensors
     )
     move_cache()
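Note on the `ignore_patterns` flip: the old pattern excluded `*.safetensors` because vLLM previously could not load Mixtral's safetensors weights; with the pinned vllm==0.2.5 it can, so the download now keeps the safetensors shards and skips stray `*.pt` checkpoints instead. A small sketch of the call, with repo id and target directory illustrative (`ignore_patterns` accepts a glob string or a list of them):

    from huggingface_hub import snapshot_download

    # Fetch only what vLLM 0.2.5 needs: keep *.safetensors, skip
    # PyTorch checkpoints.
    snapshot_download(
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        local_dir="/model",
        ignore_patterns=["*.pt"],
    )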

@@ -56,23 +56,16 @@ def download_model_to_folder():
 # run_function to run the function defined above to ensure the weights of
 # the model are saved within the container image.

-VLLM_HASH = "89523c8293bc02a4dfaaa80079a5347dc3952464a33a501d5de329921eea7ec7"
-
 image = (
     Image.from_registry(
-        f"vllm/vllm-openai@sha256:{VLLM_HASH}",
-        setup_dockerfile_commands=[
-            "RUN apt-get install python-is-python3",
-            "RUN mv /workspace/* /root",
-        ],
+        "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
-    .dockerfile_commands("ENTRYPOINT []")
-    .pip_install("huggingface_hub==0.19.4", "hf-transfer==0.1.4")
+    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_model_to_folder, timeout=60 * 20)
 )

-stub = Stub("example-vllm-inference", image=image)
+stub = Stub("example-vllm-mixtral", image=image)


 # ## The model class
@@ -138,7 +131,10 @@ async def completion_stream(self, user_question):
         )
         index, num_tokens = 0, 0
         async for output in result_generator:
-            if "\ufffd" == output.outputs[0].text[-1]:
+            if (
+                output.outputs[0].text
+                and "\ufffd" == output.outputs[0].text[-1]
+            ):
                 continue
             text_delta = output.outputs[0].text[index:]
             index = len(output.outputs[0].text)
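Note on the last hunk: vLLM streams snapshots of the detokenized text, and a snapshot can end in U+FFFD (the replacement character) while a multi-byte token is still being decoded. The old check also assumed the buffer was non-empty, so `text[-1]` on an empty string raised IndexError; the new parenthesized condition guards both cases. A standalone sketch of the same delta-streaming logic, with plain illustrative names in place of vLLM's RequestOutput objects:

    def stream_deltas(snapshots):
        # Each snapshot is the full generated text so far, vLLM-style.
        index = 0
        for text in snapshots:
            # Skip snapshots that end mid-character (U+FFFD); checking
            # truthiness first avoids IndexError on an empty string.
            if text and text[-1] == "\ufffd":
                continue
            yield text[index:]  # emit only the new suffix
            index = len(text)

    # The second snapshot ends mid-character and is skipped:
    print(list(stream_deltas(["He", "Hello\ufffd", "Hello!"])))  # ['He', 'llo!']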
misc/queue_simple.py (3 changes: 2 additions & 1 deletion)

@@ -14,7 +14,8 @@
 import modal
 import modal.queue

-stub = modal.Stub("example-queue-simple", q=modal.Queue.new())
+stub = modal.Stub("example-queue-simple")
+stub.q = modal.Queue.new()


 @stub.function()
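Note on the queue change: Modal moved away from passing app objects as `Stub` constructor keyword arguments; they are now attached as attributes after construction. Registration and usage are otherwise unchanged. A sketch under that assumption, with a hypothetical function showing how the queue is addressed:

    import modal

    stub = modal.Stub("example-queue-simple")
    stub.q = modal.Queue.new()  # same registration as the old q=... kwarg

    @stub.function()
    def roundtrip() -> None:
        # Hypothetical usage: the queue is addressable as stub.q in functions.
        stub.q.put(42)
        assert stub.q.get() == 42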
