upgrade llama and vllm (#890)
charlesfrye authored Sep 26, 2024
1 parent 837bdb4 commit baeeda4
Showing 2 changed files with 19 additions and 13 deletions.
06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py (2 changes: 1 addition & 1 deletion)
@@ -25,7 +25,7 @@ class WebsiteUser(locust.HttpUser):
     @locust.task
     def chat_completion(self):
         payload = {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "Llama-3.2-3B-Instruct-quantized.w8a8",
             "messages": messages,
         }

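The locustfile drives the server's OpenAI-compatible chat completions route, so its "model" field must match the new served name. A hand-rolled smoke test of the same payload might look like the sketch below; the endpoint URL and API key are placeholders, not values from this commit.

    # hypothetical smoke test -- URL and key are placeholders
    import requests

    BASE_URL = "https://your-workspace--example-vllm-openai-compatible-serve.modal.run"  # placeholder
    API_KEY = "super-secret-key"  # placeholder bearer token

    messages = [{"role": "user", "content": "Say hello in one sentence."}]

    response = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "model": "Llama-3.2-3B-Instruct-quantized.w8a8",  # the new served name
            "messages": messages,
        },
        timeout=60,
    )
    response.raise_for_status()
    print(response.json()["choices"][0]["message"]["content"])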
06_gpu_and_ml/llm-serving/vllm_inference.py (30 changes: 18 additions & 12 deletions)
@@ -32,17 +32,18 @@
 import modal

 vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
-    "vllm==0.5.3post1"
+    "vllm==0.6.2"
 )

 # ## Download the model weights
 #
-# We'll be running a pretrained foundation model -- Meta's LLaMA 3.1 8B
-# in the Instruct variant that's trained to chat and follow instructions.
+# We'll be running a pretrained foundation model -- Meta's LLaMA 3.2 3B
+# in the Instruct variant that's trained to chat and follow instructions,
+# quantized to 8-bit by [Neural Magic](https://neuralmagic.com/) and uploaded to Hugging Face.

 MODELS_DIR = "/llamas"
-MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"
+MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
+MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89"

 # We need to make the weights of that model available to our Modal Functions.
 #
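The commit pins a specific Hugging Face revision of the quantized checkpoint. The download step itself isn't part of this diff, but a minimal sketch of fetching that pinned revision into the volume-backed directory, assuming huggingface_hub is installed, could look like:

    # hypothetical download sketch -- not the repo's actual download script
    from huggingface_hub import snapshot_download

    MODELS_DIR = "/llamas"
    MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
    MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89"

    snapshot_download(
        repo_id=MODEL_NAME,
        revision=MODEL_REVISION,  # pin to the exact commit referenced above
        local_dir=f"{MODELS_DIR}/{MODEL_NAME}",
        ignore_patterns=["*.pt", "*.bin"],  # prefer safetensors weights if both formats exist
    )

Pinning MODEL_REVISION keeps the serving code reproducible even if the upstream repository is updated later.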
@@ -85,10 +86,10 @@

 @app.function(
     image=vllm_image,
-    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
+    gpu=modal.gpu.L4(count=N_GPU),
     container_idle_timeout=5 * MINUTES,
     timeout=24 * HOURS,
-    allow_concurrent_inputs=100,
+    allow_concurrent_inputs=500,
     volumes={MODELS_DIR: volume},
 )
 @modal.asgi_app()
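The smaller 8-bit 3B model fits on an L4 instead of a 40 GB A100, and each replica now accepts up to 500 in-flight requests. A rough client-side sketch of exercising that concurrency, assuming the openai Python package and placeholder endpoint credentials:

    # hypothetical concurrency probe -- not part of the repo
    import asyncio

    from openai import AsyncOpenAI

    client = AsyncOpenAI(
        base_url="https://your-workspace--example-vllm-openai-compatible-serve.modal.run/v1",  # placeholder
        api_key="super-secret-key",  # placeholder
    )

    async def one_request(i: int) -> str:
        completion = await client.chat.completions.create(
            model="Llama-3.2-3B-Instruct-quantized.w8a8",
            messages=[{"role": "user", "content": f"Give me fact number {i} about GPUs."}],
        )
        return completion.choices[0].message.content

    async def main() -> None:
        # 100 simultaneous requests stays well under the 500-input ceiling per container
        answers = await asyncio.gather(*(one_request(i) for i in range(100)))
        print(f"received {len(answers)} responses")

    asyncio.run(main())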
@@ -102,14 +103,15 @@ def serve():
     from vllm.entrypoints.openai.serving_completion import (
         OpenAIServingCompletion,
     )
+    from vllm.entrypoints.openai.serving_engine import BaseModelPath
     from vllm.usage.usage_lib import UsageContext

     volume.reload()  # ensure we have the latest version of the weights

     # create a fastAPI app that uses vLLM's OpenAI-compatible router
     web_app = fastapi.FastAPI(
         title=f"OpenAI-compatible {MODEL_NAME} server",
-        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
+        description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
         version="0.0.1",
         docs_url="/docs",
     )
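With the OpenAI-compatible router in place, the served model can be listed through the standard discovery endpoint, which is a quick way to confirm what model id clients should send after the rename. A hypothetical check, again with placeholder URL and key:

    # hypothetical: confirm what model id the server advertises
    import requests

    BASE_URL = "https://your-workspace--example-vllm-openai-compatible-serve.modal.run"  # placeholder
    API_KEY = "super-secret-key"  # placeholder

    resp = requests.get(f"{BASE_URL}/v1/models", headers={"Authorization": f"Bearer {API_KEY}"})
    resp.raise_for_status()
    print([m["id"] for m in resp.json()["data"]])  # should include the short Llama-3.2 name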
@@ -159,20 +161,24 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):

     request_logger = RequestLogger(max_log_len=2048)

-    api_server.openai_serving_chat = OpenAIServingChat(
+    base_model_paths = [
+        BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
+    ]
+
+    api_server.chat = lambda s: OpenAIServingChat(
         engine,
         model_config=model_config,
-        served_model_names=[MODEL_NAME],
+        base_model_paths=base_model_paths,
         chat_template=None,
         response_role="assistant",
         lora_modules=[],
         prompt_adapters=[],
         request_logger=request_logger,
     )
-    api_server.openai_serving_completion = OpenAIServingCompletion(
+    api_server.completion = lambda s: OpenAIServingCompletion(
         engine,
         model_config=model_config,
-        served_model_names=[MODEL_NAME],
+        base_model_paths=base_model_paths,
         lora_modules=[],
         prompt_adapters=[],
         request_logger=request_logger,
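This last hunk also explains the locustfile change: the model is now registered under a BaseModelPath whose name is MODEL_NAME with the organization prefix stripped, so clients must pass the short name. A one-liner illustrating the derivation used above:

    MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
    served_name = MODEL_NAME.split("/")[1]  # same expression BaseModelPath uses above
    print(served_name)  # Llama-3.2-3B-Instruct-quantized.w8a8 -- matches the locustfile payload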
