vllm improvements (#524)
gongy authored Dec 15, 2023
1 parent b383dbb · commit 12d6832
Showing 3 changed files with 17 additions and 25 deletions.
06_gpu_and_ml/vllm_inference.py (19 changes: 7 additions & 12 deletions)

@@ -42,6 +42,7 @@
 # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
 def download_model_to_folder():
     from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache

     os.makedirs(MODEL_DIR, exist_ok=True)

@@ -50,25 +51,19 @@ def download_model_to_folder():
         local_dir=MODEL_DIR,
         token=os.environ["HUGGINGFACE_TOKEN"],
     )
+    move_cache()


 # ### Image definition
-# We’ll start from a Dockerhub image recommended by `vLLM`, upgrade the older
-# version of `torch` (from 1.14) to a new one specifically built for CUDA 11.8.
-# Next, we install `vLLM` from source to get the latest updates. Finally, we’ll
-# use run_function to run the function defined above to ensure the weights of
+# We’ll start from a recommended Dockerhub image and install `vLLM`.
+# Then we’ll use run_function to run the function defined above to ensure the weights of
 # the model are saved within the container image.
 image = (
-    Image.from_registry("nvcr.io/nvidia/pytorch:22.12-py3")
-    .pip_install(
-        "torch==2.0.1+cu118", index_url="https://download.pytorch.org/whl/cu118"
-    )
-    # Pinned to 10/16/23
-    .pip_install(
-        "vllm @ git+https://github.com/vllm-project/vllm.git@651c614aa43e497a2e2aab473493ba295201ab20"
+    Image.from_registry(
+        "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
+    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
-    .pip_install("hf-transfer~=0.1")
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(
         download_model_to_folder,
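Taken together, the new version of this file follows the pattern below. This is a condensed sketch rather than the file verbatim: the model id, `MODEL_DIR` value, and secret name are illustrative, and the `secret=` keyword reflects Modal's late-2023 API (`Stub`, `Image.from_registry`, `Secret`).

    import os

    from modal import Image, Secret, Stub

    MODEL_DIR = "/model"  # illustrative path

    def download_model_to_folder():
        from huggingface_hub import snapshot_download
        from transformers.utils import move_cache

        os.makedirs(MODEL_DIR, exist_ok=True)
        snapshot_download(
            "mistralai/Mistral-7B-Instruct-v0.1",  # illustrative model id
            local_dir=MODEL_DIR,
            token=os.environ["HUGGINGFACE_TOKEN"],
        )
        move_cache()  # migrate any legacy HF cache entries

    image = (
        # Plain CUDA base image; add_python bakes a Python 3.10 runtime into it.
        Image.from_registry("nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10")
        # One pinned install replaces the old torch upgrade plus source build of vLLM.
        .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
        .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # fast downloads, no progress bar
        # Run the download at build time so the weights are baked into the image.
        .run_function(download_model_to_folder, secret=Secret.from_name("huggingface"))
    )

    stub = Stub("example-vllm-inference", image=image)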
06_gpu_and_ml/vllm_mixtral.py (20 changes: 8 additions & 12 deletions)

@@ -46,7 +46,7 @@ def download_model_to_folder():
     snapshot_download(
         BASE_MODEL,
         local_dir=MODEL_DIR,
-        ignore_patterns="*.safetensors",  # vLLM doesn't support Mixtral safetensors anyway.
+        ignore_patterns="*.pt",  # Using safetensors
     )
     move_cache()
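Note on the `ignore_patterns` flip: the old pattern excluded `*.safetensors` because vLLM previously could not load Mixtral's safetensors weights; with the pinned vllm==0.2.5 it can, so the download now keeps the safetensors shards and skips stray `*.pt` checkpoints instead. A small sketch of the call, with repo id and target directory illustrative (`ignore_patterns` accepts a glob string or a list of them):

    from huggingface_hub import snapshot_download

    # Fetch only what vLLM 0.2.5 needs: keep *.safetensors, skip
    # PyTorch checkpoints.
    snapshot_download(
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        local_dir="/model",
        ignore_patterns=["*.pt"],
    )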

@@ -56,23 +56,16 @@ def download_model_to_folder():
 # run_function to run the function defined above to ensure the weights of
 # the model are saved within the container image.

-VLLM_HASH = "89523c8293bc02a4dfaaa80079a5347dc3952464a33a501d5de329921eea7ec7"
-
 image = (
     Image.from_registry(
-        f"vllm/vllm-openai@sha256:{VLLM_HASH}",
-        setup_dockerfile_commands=[
-            "RUN apt-get install python-is-python3",
-            "RUN mv /workspace/* /root",
-        ],
+        "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
-    .dockerfile_commands("ENTRYPOINT []")
-    .pip_install("huggingface_hub==0.19.4", "hf-transfer==0.1.4")
+    .pip_install("vllm==0.2.5", "huggingface_hub==0.19.4", "hf-transfer==0.1.4")
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_model_to_folder, timeout=60 * 20)
 )

-stub = Stub("example-vllm-inference", image=image)
+stub = Stub("example-vllm-mixtral", image=image)


 # ## The model class
@@ -138,7 +131,10 @@ async def completion_stream(self, user_question):
         )
         index, num_tokens = 0, 0
         async for output in result_generator:
-            if "\ufffd" == output.outputs[0].text[-1]:
+            if (
+                output.outputs[0].text
+                and "\ufffd" == output.outputs[0].text[-1]
+            ):
                 continue
             text_delta = output.outputs[0].text[index:]
             index = len(output.outputs[0].text)
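Note on the last hunk: vLLM streams snapshots of the detokenized text, and a snapshot can end in U+FFFD (the replacement character) while a multi-byte token is still being decoded. The old check also assumed the buffer was non-empty, so `text[-1]` on an empty string raised IndexError; the new parenthesized condition guards both cases. A standalone sketch of the same delta-streaming logic, with plain illustrative names in place of vLLM's RequestOutput objects:

    def stream_deltas(snapshots):
        # Each snapshot is the full generated text so far, vLLM-style.
        index = 0
        for text in snapshots:
            # Skip snapshots that end mid-character (U+FFFD); checking
            # truthiness first avoids IndexError on an empty string.
            if text and text[-1] == "\ufffd":
                continue
            yield text[index:]  # emit only the new suffix
            index = len(text)

    # The second snapshot ends mid-character and is skipped:
    print(list(stream_deltas(["He", "Hello\ufffd", "Hello!"])))  # ['He', 'llo!']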
misc/queue_simple.py (3 changes: 2 additions & 1 deletion)

@@ -14,7 +14,8 @@
 import modal
 import modal.queue

-stub = modal.Stub("example-queue-simple", q=modal.Queue.new())
+stub = modal.Stub("example-queue-simple")
+stub.q = modal.Queue.new()


 @stub.function()
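Note on the queue change: Modal moved away from passing app objects as `Stub` constructor keyword arguments; they are now attached as attributes after construction. Registration and usage are otherwise unchanged. A sketch under that assumption, with a hypothetical function showing how the queue is addressed:

    import modal

    stub = modal.Stub("example-queue-simple")
    stub.q = modal.Queue.new()  # same registration as the old q=... kwarg

    @stub.function()
    def roundtrip() -> None:
        # Hypothetical usage: the queue is addressable as stub.q in functions.
        stub.q.put(42)
        assert stub.q.get() == 42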
