From baeeda4ec51d078e7f839636d7bada02ea4411ef Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Wed, 25 Sep 2024 19:13:03 -0700 Subject: [PATCH] upgrade llama and vllm (#890) --- .../openai_compatible/locustfile.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_inference.py | 30 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py b/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py index a536ee1cf..66af64258 100644 --- a/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py +++ b/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py @@ -25,7 +25,7 @@ class WebsiteUser(locust.HttpUser): @locust.task def chat_completion(self): payload = { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "Llama-3.2-3B-Instruct-quantized.w8a8", "messages": messages, } diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index b52b66893..f7eeb2b15 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -32,17 +32,18 @@ import modal vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install( - "vllm==0.5.3post1" + "vllm==0.6.2" ) # ## Download the model weights # -# We'll be running a pretrained foundation model -- Meta's LLaMA 3.1 8B -# in the Instruct variant that's trained to chat and follow instructions. +# We'll be running a pretrained foundation model -- Meta's LLaMA 3.2 3B +# in the Instruct variant that's trained to chat and follow instructions, +# quantized to 8-bit by [Neural Magic](https://neuralmagic.com/) and uploaded to Hugging Face. MODELS_DIR = "/llamas" -MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct" -MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16" +MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8" +MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89" # We need to make the weights of that model available to our Modal Functions. # @@ -85,10 +86,10 @@ @app.function( image=vllm_image, - gpu=modal.gpu.A100(count=N_GPU, size="40GB"), + gpu=modal.gpu.L4(count=N_GPU), container_idle_timeout=5 * MINUTES, timeout=24 * HOURS, - allow_concurrent_inputs=100, + allow_concurrent_inputs=500, volumes={MODELS_DIR: volume}, ) @modal.asgi_app() @@ -102,6 +103,7 @@ def serve(): from vllm.entrypoints.openai.serving_completion import ( OpenAIServingCompletion, ) + from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.usage.usage_lib import UsageContext volume.reload() # ensure we have the latest version of the weights @@ -109,7 +111,7 @@ def serve(): # create a fastAPI app that uses vLLM's OpenAI-compatible router web_app = fastapi.FastAPI( title=f"OpenAI-compatible {MODEL_NAME} server", - description="Run an OpenAI-compatible LLM server with vLLM on modal.com", + description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀", version="0.0.1", docs_url="/docs", ) @@ -159,20 +161,24 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): request_logger = RequestLogger(max_log_len=2048) - api_server.openai_serving_chat = OpenAIServingChat( + base_model_paths = [ + BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME) + ] + + api_server.chat = lambda s: OpenAIServingChat( engine, model_config=model_config, - served_model_names=[MODEL_NAME], + base_model_paths=base_model_paths, chat_template=None, response_role="assistant", lora_modules=[], prompt_adapters=[], request_logger=request_logger, ) - api_server.openai_serving_completion = OpenAIServingCompletion( + api_server.completion = lambda s: OpenAIServingCompletion( engine, model_config=model_config, - served_model_names=[MODEL_NAME], + base_model_paths=base_model_paths, lora_modules=[], prompt_adapters=[], request_logger=request_logger,