forked from awslabs/data-on-eks
Commit
Showing 11 changed files with 254 additions and 40 deletions.
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ray-service-vllm.yaml
File renamed without changes.
File renamed without changes.
13 changes: 6 additions & 7 deletions
...ai/inference/vllm-rayserve-gpu/Dockerfile → ...erence/vllm-rayserve-gpu/image/Dockerfile
@@ -1,20 +1,19 @@
 # Use the base image
-FROM rayproject/ray:2.24.0-py310-cu118
+FROM rayproject/ray:2.32.0-py311

 # Maintainer label
 LABEL maintainer="DoEKS"

 # Set environment variables to non-interactive (this prevents some prompts)
-ENV DEBIAN_FRONTEND=non-interactive
-
-# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
-ENV LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+ENV DEBIAN_FRONTEND=non-interactive \
+    LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH

 # Set the working directory
 WORKDIR /app

 # Install vLLM and other dependencies in a single RUN command to reduce layers
-RUN pip install vllm==0.4.3 huggingface_hub==0.23.4
+RUN pip install vllm==0.6.3 huggingface_hub==0.26.2

 # Copy the serving script into the container
 COPY vllm_serve.py /app/vllm_serve.py
+COPY vllm_openai_serve.py /app/vllm_openai_serve.py
109 changes: 109 additions & 0 deletions
gen-ai/inference/vllm-rayserve-gpu/image/vllm_openai_serve.py
@@ -0,0 +1,109 @@
from typing import Dict, Optional, List

from fastapi import FastAPI

from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.entrypoints.openai.serving_engine import BaseModelPath

import json
from typing import AsyncGenerator
from fastapi import BackgroundTasks
from starlette.requests import Request
from starlette.responses import StreamingResponse, Response, JSONResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid


from ray import serve
import os
import logging

from huggingface_hub import login

# Environment and configuration setup
logger = logging.getLogger("ray.serve")

app = FastAPI()

@serve.deployment
@serve.ingress(app)
class VLLMDeployment:
    def __init__(self, **kwargs):
        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        if not hf_token:
            raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is not set")
        login(token=hf_token)
        logger.info("Successfully logged in to Hugging Face Hub")

        args = AsyncEngineArgs(
            model=os.getenv("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2"),  # Model identifier from Hugging Face Hub or local path.
            dtype="auto",  # Automatically determine the data type (e.g., float16 or float32) for model weights and computations.
            gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", "0.8")),  # Percentage of GPU memory to utilize, reserving some for overhead.
            max_model_len=int(os.getenv("MAX_MODEL_LEN", "4096")),  # Maximum sequence length (in tokens) the model can handle, including both input and output tokens.
            max_num_seqs=int(os.getenv("MAX_NUM_SEQ", "512")),  # Maximum number of sequences (requests) to process in parallel.
            max_num_batched_tokens=int(os.getenv("MAX_NUM_BATCHED_TOKENS", "32768")),  # Maximum number of tokens processed in a single batch across all sequences (max_model_len * max_num_seqs).
            trust_remote_code=True,  # Allow execution of untrusted code from the model repository (use with caution).
            enable_chunked_prefill=False,  # Disable chunked prefill to avoid compatibility issues with prefix caching.
            tokenizer_pool_size=4,  # Number of tokenizer instances to handle concurrent requests efficiently.
            tokenizer_pool_type="ray",  # Pool type for tokenizers; 'ray' uses Ray for distributed processing.
            # max_parallel_loading_workers=2,  # Number of parallel workers to load the model concurrently.
            pipeline_parallel_size=int(os.getenv("NUM_OF_NODES", "1")),  # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
            tensor_parallel_size=int(os.getenv("NUM_OF_GPU", "1")),  # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
            enable_prefix_caching=True,  # Enable prefix caching to improve performance for similar prompt prefixes.
            enforce_eager=True,
            disable_log_requests=True
        )

        self.response_role = os.getenv("RESPONSE_ROLE", "assistant")
        self.engine_args = args
        self.engine = AsyncLLMEngine.from_engine_args(args)
        self.max_model_len = args.max_model_len
        self.openai_serving_chat = None
        logger.info(f"VLLM Engine initialized with max_model_len: {self.max_model_len}")

    @app.post("/v1/chat/completions")
    async def create_chat_completion(
        self, request: ChatCompletionRequest, raw_request: Request
    ):
        """OpenAI-compatible HTTP endpoint.
        API reference:
        - https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
        """
        if not self.openai_serving_chat:
            model_config = await self.engine.get_model_config()
            # Determine the name of the served model for the OpenAI client.
            served_model_names = [BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)]
            self.openai_serving_chat = OpenAIServingChat(
                self.engine,
                model_config,
                served_model_names,
                self.response_role,
                lora_modules=None,
                prompt_adapters=None,
                request_logger=None,
                chat_template=None,
            )
        logger.info(f"Request: {request}")
        generator = await self.openai_serving_chat.create_chat_completion(
            request, raw_request
        )
        if isinstance(generator, ErrorResponse):
            return JSONResponse(
                content=generator.model_dump(), status_code=generator.code
            )
        if request.stream:
            return StreamingResponse(content=generator, media_type="text/event-stream")
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())

deployment = VLLMDeployment.bind()
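The RayService manifests under models/ load this module through the import path "vllm_openai_serve:deployment", which resolves to the bound deployment above. As a rough local smoke test (a sketch, not part of this commit), the same object can be served directly with Ray Serve on a GPU machine that has the image's Python dependencies installed and a HUGGING_FACE_HUB_TOKEN exported:

# Hypothetical local smoke test: serve the bound deployment directly.
# Assumes this file is importable as vllm_openai_serve and a GPU is available.
import os
from ray import serve
from vllm_openai_serve import deployment

assert os.getenv("HUGGING_FACE_HUB_TOKEN"), "export HUGGING_FACE_HUB_TOKEN first"
serve.run(deployment)  # default route prefix "/", so the endpoint is /v1/chat/completions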
7 changes: 7 additions & 0 deletions
gen-ai/inference/vllm-rayserve-gpu/models/llama-3-8B-Instruct/kustomization.yaml
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml
37 changes: 37 additions & 0 deletions
gen-ai/inference/vllm-rayserve-gpu/models/llama-3-8B-Instruct/model-config.yaml
@@ -0,0 +1,37 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm
  namespace: rayserve-vllm
spec:
  serveConfigV2: |
    applications:
    - name: llama3
      import_path: "vllm_openai_serve:deployment"
      route_prefix: "/llama3"
      runtime_env:
        env_vars:
          MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
          GPU_MEMORY_UTILIZATION: "0.9"
          MAX_MODEL_LEN: "4096"
          MAX_NUM_SEQ: "4"
          MAX_NUM_BATCHED_TOKENS: "32768"
          NUM_OF_GPU: "2"
          VLLM_ATTENTION_BACKEND: "XFORMERS"
      deployments:
      - name: VLLMDeployment
        autoscaling_config:
          metrics_interval_s: 0.2
          min_replicas: 1
          max_replicas: 4
          look_back_period_s: 2
          downscale_delay_s: 600
          upscale_delay_s: 30
          target_num_ongoing_requests_per_replica: 20
        graceful_shutdown_timeout_s: 5
        max_concurrent_queries: 100
        ray_actor_options:
          num_cpus: 4
          num_gpus: 2
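With this RayService applied, the llama3 application is exposed under the "/llama3" route prefix and speaks the OpenAI chat-completions protocol. A hedged client sketch follows; the host and port assume the Ray Serve proxy is reachable on localhost:8000 (for example via kubectl port-forward) and are not part of this commit:

# Assumed endpoint: Ray Serve proxy on localhost:8000; route prefix and
# model id come from the RayService config above.
import requests

resp = requests.post(
    "http://localhost:8000/llama3/v1/chat/completions",
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "messages": [{"role": "user", "content": "Explain what Ray Serve autoscaling does."}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])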
7 changes: 7 additions & 0 deletions
gen-ai/inference/vllm-rayserve-gpu/models/mistral-7B-Instruct/kustomization.yaml
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml