vllm-rayserve-gpu upgrade
bbgu1 committed Nov 4, 2024
1 parent d0b0106 commit 395ece5
Showing 11 changed files with 254 additions and 40 deletions.
5 changes: 5 additions & 0 deletions gen-ai/inference/vllm-rayserve-gpu/base/kustomization.yaml
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ray-service-vllm.yaml
gen-ai/inference/vllm-rayserve-gpu/base/ray-service-vllm.yaml
@@ -1,16 +1,3 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: rayserve-vllm
----
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token
-  namespace: rayserve-vllm
-data:
-  hf-token: $HUGGING_FACE_HUB_TOKEN
----
apiVersion: ray.io/v1
kind: RayService
metadata:
@@ -23,38 +10,67 @@ spec:
    applications:
      - name: mistral
        import_path: "vllm_serve:deployment"
        route_prefix: "/mistral"
        runtime_env:
          env_vars:
            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            GPU_MEMORY_UTILIZATION: "0.9"
            MAX_MODEL_LEN: "8192"
            MAX_NUM_SEQ: "4"
            MAX_NUM_BATCHED_TOKENS: "32768"
            VLLM_ATTENTION_BACKEND: "XFORMERS"
        deployments:
-         - name: mistral-deployment
+         - name: VLLMDeployment
            autoscaling_config:
              metrics_interval_s: 0.2
-             min_replicas: 1
+             min_replicas: 2
              max_replicas: 4
              look_back_period_s: 2
              downscale_delay_s: 600
              upscale_delay_s: 30
              target_num_ongoing_requests_per_replica: 20
            max_replica_per_node: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
-             num_cpus: 1
+             num_cpus: 4
              num_gpus: 1
+     - name: llama3
+       import_path: "vllm_openai_serve:deployment"
+       route_prefix: "/llama3"
+       runtime_env:
+         env_vars:
+           MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
+           GPU_MEMORY_UTILIZATION: "0.9"
+           MAX_MODEL_LEN: "4096"
+           MAX_NUM_SEQ: "4"
+           MAX_NUM_BATCHED_TOKENS: "32768"
+           NUM_OF_GPU: "2"
+           VLLM_ATTENTION_BACKEND: "XFORMERS"
+       deployments:
+         - name: VLLMDeployment
+           autoscaling_config:
+             metrics_interval_s: 0.2
+             min_replicas: 1
+             max_replicas: 4
+             look_back_period_s: 2
+             downscale_delay_s: 600
+             upscale_delay_s: 30
+             target_num_ongoing_requests_per_replica: 20
+           graceful_shutdown_timeout_s: 5
+           max_concurrent_queries: 100
+           ray_actor_options:
+             num_cpus: 4
+             num_gpus: 2
  rayClusterConfig:
-   rayVersion: '2.24.0' # Should match the Ray version in the image of the containers
+   rayVersion: '2.32.0' # Should match the Ray version in the image of the containers
    enableInTreeAutoscaling: true
    ######################headGroupSpecs#################################
    # Ray head pod template.
    headGroupSpec:
      headService:
        metadata:
-         name: vllm
+         name: llmserve
          namespace: rayserve-vllm
      rayStartParams:
        dashboard-host: '0.0.0.0'
@@ -64,7 +80,7 @@ spec:
        spec:
          containers:
            - name: ray-head
-             image: public.ecr.aws/data-on-eks/ray2.24.0-py310-vllm-gpu:v1
+             image: 301444719761.dkr.ecr.us-west-2.amazonaws.com/ray2.32.2-py311-vllm-gpu:v9
              imagePullPolicy: IfNotPresent
              lifecycle:
                preStop:
@@ -114,31 +130,31 @@ spec:
              emptyDir: {}
    workerGroupSpecs:
      # The pod replicas in this group typed worker
-     - replicas: 1
+     - groupName: gpu-group
+       replicas: 1
        minReplicas: 1
        maxReplicas: 4
-       groupName: gpu-group
        rayStartParams: {}
        # Pod template
        template:
          spec:
            containers:
              - name: ray-worker
-               image: public.ecr.aws/data-on-eks/ray2.24.0-py310-vllm-gpu:v1
+               image: 301444719761.dkr.ecr.us-west-2.amazonaws.com/ray2.32.2-py311-vllm-gpu:v9
                imagePullPolicy: IfNotPresent
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
-                   cpu: 10
+                   cpu: 6
                    memory: "60G"
-                   nvidia.com/gpu: 1
+                   nvidia.com/gpu: 2
                  requests:
-                   cpu: 10
+                   cpu: 6
                    memory: "60G"
-                   nvidia.com/gpu: 1
+                   nvidia.com/gpu: 2
                env:
                  # Ensure to set VLLM_PORT to avoid conflict with Ray serve port 8000
                  - name: VLLM_PORT
gen-ai/inference/vllm-rayserve-gpu/image/Dockerfile
@@ -1,20 +1,19 @@
# Use the base image
-FROM rayproject/ray:2.24.0-py310-cu118
+FROM rayproject/ray:2.32.0-py311

# Maintainer label
LABEL maintainer="DoEKS"

# Set environment variables to non-interactive (this prevents some prompts)
-ENV DEBIAN_FRONTEND=non-interactive
-
-# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
-ENV LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+ENV DEBIAN_FRONTEND=non-interactive \
+    LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH

# Set the working directory
WORKDIR /app

# Install vLLM and other dependencies in a single RUN command to reduce layers
-RUN pip install vllm==0.4.3 huggingface_hub==0.23.4
+RUN pip install vllm==0.6.3 huggingface_hub==0.26.2

# Copy the serving script into the container
-COPY vllm_serve.py /app/vllm_serve.py
+COPY vllm_serve.py /app/vllm_serve.py
+COPY vllm_openai_serve.py /app/vllm_openai_serve.py
109 changes: 109 additions & 0 deletions gen-ai/inference/vllm-rayserve-gpu/image/vllm_openai_serve.py
@@ -0,0 +1,109 @@
from typing import Dict, Optional, List

from fastapi import FastAPI

from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.entrypoints.openai.serving_engine import BaseModelPath

import json
from typing import AsyncGenerator
from fastapi import BackgroundTasks
from starlette.requests import Request
from starlette.responses import StreamingResponse, Response, JSONResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid


from ray import serve
import os
import logging

from huggingface_hub import login

# Environment and configuration setup
logger = logging.getLogger("ray.serve")

app = FastAPI()

@serve.deployment
@serve.ingress(app)
class VLLMDeployment:
    def __init__(self, **kwargs):
        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        if not hf_token:
            raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is not set")
        login(token=hf_token)
        logger.info("Successfully logged in to Hugging Face Hub")

        args = AsyncEngineArgs(
            model=os.getenv("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2"), # Model identifier from Hugging Face Hub or local path.
            dtype="auto", # Automatically determine the data type (e.g., float16 or float32) for model weights and computations.
            gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", "0.8")), # Percentage of GPU memory to utilize, reserving some for overhead.
            max_model_len=int(os.getenv("MAX_MODEL_LEN", "4096")), # Maximum sequence length (in tokens) the model can handle, including both input and output tokens.
            max_num_seqs=int(os.getenv("MAX_NUM_SEQ", "512")), # Maximum number of sequences (requests) to process in parallel.
            max_num_batched_tokens=int(os.getenv("MAX_NUM_BATCHED_TOKENS", "32768")), # Maximum number of tokens processed in a single batch across all sequences (max_model_len * max_num_seqs).
            trust_remote_code=True, # Allow execution of untrusted code from the model repository (use with caution).
            enable_chunked_prefill=False, # Disable chunked prefill to avoid compatibility issues with prefix caching.
            tokenizer_pool_size=4, # Number of tokenizer instances to handle concurrent requests efficiently.
            tokenizer_pool_type="ray", # Pool type for tokenizers; 'ray' uses Ray for distributed processing.
            # max_parallel_loading_workers=2, # Number of parallel workers to load the model concurrently.
            pipeline_parallel_size=int(os.getenv("NUM_OF_NODES", "1")), # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
            tensor_parallel_size=int(os.getenv("NUM_OF_GPU", "1")), # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
            enable_prefix_caching=True, # Enable prefix caching to improve performance for similar prompt prefixes.
            enforce_eager=True,
            disable_log_requests=True
        )

        self.response_role = os.getenv("RESPONSE_ROLE", "assistant")
        self.engine_args = args
        self.engine = AsyncLLMEngine.from_engine_args(args)
        self.max_model_len = args.max_model_len
        self.openai_serving_chat = None
        logger.info(f"VLLM Engine initialized with max_model_len: {self.max_model_len}")

    @app.post("/v1/chat/completions")
    async def create_chat_completion(
        self, request: ChatCompletionRequest, raw_request: Request
    ):
        """OpenAI-compatible HTTP endpoint.
        API reference:
        - https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
        """
        if not self.openai_serving_chat:
            model_config = await self.engine.get_model_config()
            # Determine the name of the served model for the OpenAI client.
            served_model_names = [BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)]
            self.openai_serving_chat = OpenAIServingChat(
                self.engine,
                model_config,
                served_model_names,
                self.response_role,
                lora_modules=None,
                prompt_adapters=None,
                request_logger=None,
                chat_template=None,
            )
        logger.info(f"Request: {request}")
        generator = await self.openai_serving_chat.create_chat_completion(
            request, raw_request
        )
        if isinstance(generator, ErrorResponse):
            return JSONResponse(
                content=generator.model_dump(), status_code=generator.code
            )
        if request.stream:
            return StreamingResponse(content=generator, media_type="text/event-stream")
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())

deployment = VLLMDeployment.bind()
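For context (not part of this commit): a minimal Python client sketch against the OpenAI-compatible route defined above. It assumes the Serve HTTP port has been forwarded to localhost:8000 (for example, kubectl -n rayserve-vllm port-forward svc/vllm-serve-svc 8000:8000 — the exact service name depends on what KubeRay generates for the RayService) and that the llama3 application is live; the URL combines the /llama3 route_prefix from serveConfigV2 with the /v1/chat/completions route registered on the FastAPI app.

import requests

# Hypothetical endpoint: route_prefix "/llama3" + "/v1/chat/completions", reached via a local port-forward.
url = "http://localhost:8000/llama3/v1/chat/completions"

payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # must match the MODEL_ID served by the application
    "messages": [{"role": "user", "content": "Summarize what Ray Serve does in one sentence."}],
    "max_tokens": 128,
    "stream": False,
}

# Non-streaming request; the response follows the OpenAI chat completions schema.
resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])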
gen-ai/inference/vllm-rayserve-gpu/image/vllm_serve.py
@@ -16,10 +16,7 @@
# Environment and configuration setup
logger = logging.getLogger("ray.serve")

-@serve.deployment(name="mistral-deployment", route_prefix="/vllm",
-                  ray_actor_options={"num_gpus": 1},
-                  autoscaling_config={"min_replicas": 1, "max_replicas": 2},
-                  )
+@serve.deployment
class VLLMDeployment:
    def __init__(self, **kwargs):
        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
@@ -41,8 +38,8 @@ def __init__(self, **kwargs):
            tokenizer_pool_size=4, # Number of tokenizer instances to handle concurrent requests efficiently.
            tokenizer_pool_type="ray", # Pool type for tokenizers; 'ray' uses Ray for distributed processing.
            max_parallel_loading_workers=2, # Number of parallel workers to load the model concurrently.
-           pipeline_parallel_size=1, # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
-           tensor_parallel_size=1, # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
+           pipeline_parallel_size=int(os.getenv("NUM_OF_NODES", "1")), # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
+           tensor_parallel_size=int(os.getenv("NUM_OF_GPU", "1")), # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
            enable_prefix_caching=True, # Enable prefix caching to improve performance for similar prompt prefixes.
            enforce_eager=True,
            disable_log_requests=True,
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml
@@ -0,0 +1,37 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm
  namespace: rayserve-vllm
spec:
  serveConfigV2: |
    applications:
      - name: llama3
        import_path: "vllm_openai_serve:deployment"
        route_prefix: "/llama3"
        runtime_env:
          env_vars:
            MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
            GPU_MEMORY_UTILIZATION: "0.9"
            MAX_MODEL_LEN: "4096"
            MAX_NUM_SEQ: "4"
            MAX_NUM_BATCHED_TOKENS: "32768"
            NUM_OF_GPU: "2"
            VLLM_ATTENTION_BACKEND: "XFORMERS"
        deployments:
          - name: VLLMDeployment
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 1
              max_replicas: 4
              look_back_period_s: 2
              downscale_delay_s: 600
              upscale_delay_s: 30
              target_num_ongoing_requests_per_replica: 20
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 4
              num_gpus: 2
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml