Bump vLLM Mixtral example to 0.2.6 (#568)
gongy authored and charlesfrye committed Mar 1, 2024
1 parent 67f8ac5 commit b62b8d2
Showing 1 changed file with 2 additions and 11 deletions.
06_gpu_and_ml/llm-serving/vllm_mixtral.py (13 changes: 2 additions & 11 deletions)
@@ -61,7 +61,7 @@ def download_model_to_folder():
         "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
     )
     .pip_install(
-        "vllm==0.2.5",
+        "vllm==0.2.6",
         "huggingface_hub==0.19.4",
         "hf-transfer==0.1.4",
         "torch==2.1.2",
@@ -79,9 +79,7 @@ def download_model_to_folder():
 # This enables us to load the model into memory just once every time a container starts up, and keep it cached
 # on the GPU for each subsequent invocation of the function.
 #
-# The `vLLM` library allows the code to remain quite clean. There are, however, some
-# outstanding issues and performance improvements that we patch here, such as multi-GPU setup and
-# suboptimal Ray CPU pinning.
+# The `vLLM` library allows the code to remain quite clean. We do have to patch the multi-GPU setup due to issues with Ray.
 @stub.cls(
     gpu=GPU_CONFIG,
     timeout=60 * 10,
@@ -111,13 +109,6 @@ def start_engine(self):
         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
         self.template = "<s> [INST] {user} [/INST] "
 
-        # Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
-        if GPU_CONFIG.count > 1:
-            import subprocess
-
-            RAY_CORE_PIN_OVERRIDE = "cpuid=0 ; for pid in $(ps xo '%p %c' | grep ray:: | awk '{print $1;}') ; do taskset -cp $cpuid $pid ; cpuid=$(($cpuid + 1)) ; done"
-            subprocess.call(RAY_CORE_PIN_OVERRIDE, shell=True)
-
     @method()
     async def completion_stream(self, user_question):
         from vllm import SamplingParams
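For context on the container-lifecycle comment in the diff: Modal loads the model into GPU memory once per container start and keeps it warm for every subsequent invocation. Below is a minimal sketch of that pattern, assuming Modal's stub-era API from early 2024. `GPU_CONFIG`, `start_engine`, `completion_stream`, and the prompt template appear in the diff above; the lifecycle decorator, engine arguments, and sampling settings are illustrative assumptions, not the exact contents of vllm_mixtral.py.

# Sketch of the container-lifecycle pattern described in the diff's comments.
# Assumes Modal's pre-1.0 "stub" API; decorator names, engine arguments, and
# sampling settings are illustrative, not the file's exact contents.
from modal import Stub, enter, gpu, method

GPU_CONFIG = gpu.A100(memory=80, count=2)  # assumed config, per the multi-GPU comments
stub = Stub("example-vllm-mixtral")

@stub.cls(gpu=GPU_CONFIG, timeout=60 * 10)
class Model:
    @enter()  # runs once per container start, so the model loads only once
    def start_engine(self):
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.engine.async_llm_engine import AsyncLLMEngine

        engine_args = AsyncEngineArgs(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # assumed model id
            tensor_parallel_size=GPU_CONFIG.count,  # vLLM shards across GPUs via Ray
        )
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
        self.template = "<s> [INST] {user} [/INST] "

    @method()
    async def completion_stream(self, user_question):
        from vllm import SamplingParams
        from vllm.utils import random_uuid

        sampling_params = SamplingParams(temperature=0.75, max_tokens=256)
        async for output in self.engine.generate(
            self.template.format(user=user_question), sampling_params, random_uuid()
        ):
            yield output.outputs[0].text

With this shape, the Mixtral weights load exactly once per cold start; each completion_stream call on a warm container reuses self.engine directly.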
