vllm-rayserve-gpu upgrade
bbgu1 committed Nov 4, 2024
1 parent d0b0106 commit 395ece5
Showing 11 changed files with 254 additions and 40 deletions.
5 changes: 5 additions & 0 deletions gen-ai/inference/vllm-rayserve-gpu/base/kustomization.yaml
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ray-service-vllm.yaml
gen-ai/inference/vllm-rayserve-gpu/base/ray-service-vllm.yaml
@@ -1,16 +1,3 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: rayserve-vllm
----
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token
-  namespace: rayserve-vllm
-data:
-  hf-token: $HUGGING_FACE_HUB_TOKEN
----
apiVersion: ray.io/v1
kind: RayService
metadata:
@@ -23,38 +10,67 @@ spec:
    applications:
      - name: mistral
        import_path: "vllm_serve:deployment"
        route_prefix: "/mistral"
        runtime_env:
          env_vars:
            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            GPU_MEMORY_UTILIZATION: "0.9"
            MAX_MODEL_LEN: "8192"
            MAX_NUM_SEQ: "4"
            MAX_NUM_BATCHED_TOKENS: "32768"
            VLLM_ATTENTION_BACKEND: "XFORMERS"
        deployments:
-         - name: mistral-deployment
+         - name: VLLMDeployment
            autoscaling_config:
              metrics_interval_s: 0.2
-             min_replicas: 1
+             min_replicas: 2
              max_replicas: 4
              look_back_period_s: 2
              downscale_delay_s: 600
              upscale_delay_s: 30
              target_num_ongoing_requests_per_replica: 20
            max_replica_per_node: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
-             num_cpus: 1
+             num_cpus: 4
              num_gpus: 1
+     - name: llama3
+       import_path: "vllm_openai_serve:deployment"
+       route_prefix: "/llama3"
+       runtime_env:
+         env_vars:
+           MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
+           GPU_MEMORY_UTILIZATION: "0.9"
+           MAX_MODEL_LEN: "4096"
+           MAX_NUM_SEQ: "4"
+           MAX_NUM_BATCHED_TOKENS: "32768"
+           NUM_OF_GPU: "2"
+           VLLM_ATTENTION_BACKEND: "XFORMERS"
+       deployments:
+         - name: VLLMDeployment
+           autoscaling_config:
+             metrics_interval_s: 0.2
+             min_replicas: 1
+             max_replicas: 4
+             look_back_period_s: 2
+             downscale_delay_s: 600
+             upscale_delay_s: 30
+             target_num_ongoing_requests_per_replica: 20
+           graceful_shutdown_timeout_s: 5
+           max_concurrent_queries: 100
+           ray_actor_options:
+             num_cpus: 4
+             num_gpus: 2
  rayClusterConfig:
-   rayVersion: '2.24.0' # Should match the Ray version in the image of the containers
+   rayVersion: '2.32.0' # Should match the Ray version in the image of the containers
    enableInTreeAutoscaling: true
    ######################headGroupSpecs#################################
    # Ray head pod template.
    headGroupSpec:
      headService:
        metadata:
-         name: vllm
+         name: llmserve
          namespace: rayserve-vllm
      rayStartParams:
        dashboard-host: '0.0.0.0'
@@ -64,7 +80,7 @@ spec:
        spec:
          containers:
            - name: ray-head
-             image: public.ecr.aws/data-on-eks/ray2.24.0-py310-vllm-gpu:v1
+             image: 301444719761.dkr.ecr.us-west-2.amazonaws.com/ray2.32.2-py311-vllm-gpu:v9
              imagePullPolicy: IfNotPresent
              lifecycle:
                preStop:
@@ -114,31 +130,31 @@ spec:
              emptyDir: {}
    workerGroupSpecs:
      # The pod replicas in this group typed worker
-     - replicas: 1
+     - groupName: gpu-group
+       replicas: 1
        minReplicas: 1
        maxReplicas: 4
-       groupName: gpu-group
        rayStartParams: {}
        # Pod template
        template:
          spec:
            containers:
              - name: ray-worker
-               image: public.ecr.aws/data-on-eks/ray2.24.0-py310-vllm-gpu:v1
+               image: 301444719761.dkr.ecr.us-west-2.amazonaws.com/ray2.32.2-py311-vllm-gpu:v9
                imagePullPolicy: IfNotPresent
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
-                   cpu: 10
+                   cpu: 6
                    memory: "60G"
-                   nvidia.com/gpu: 1
+                   nvidia.com/gpu: 2
                  requests:
-                   cpu: 10
+                   cpu: 6
                    memory: "60G"
-                   nvidia.com/gpu: 1
+                   nvidia.com/gpu: 2
                env:
                  # Ensure to set VLLM_PORT to avoid conflict with Ray serve port 8000
                  - name: VLLM_PORT
gen-ai/inference/vllm-rayserve-gpu/image/Dockerfile
@@ -1,20 +1,19 @@
# Use the base image
-FROM rayproject/ray:2.24.0-py310-cu118
+FROM rayproject/ray:2.32.0-py311

# Maintainer label
LABEL maintainer="DoEKS"

# Set environment variables to non-interactive (this prevents some prompts)
-ENV DEBIAN_FRONTEND=non-interactive
-
-# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
-ENV LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+ENV DEBIAN_FRONTEND=non-interactive \
+    LD_LIBRARY_PATH=/home/ray/anaconda3/lib:$LD_LIBRARY_PATH

# Set the working directory
WORKDIR /app

# Install vLLM and other dependencies in a single RUN command to reduce layers
-RUN pip install vllm==0.4.3 huggingface_hub==0.23.4
+RUN pip install vllm==0.6.3 huggingface_hub==0.26.2

# Copy the serving script into the container
-COPY vllm_serve.py /app/vllm_serve.py
+COPY vllm_serve.py /app/vllm_serve.py
+COPY vllm_openai_serve.py /app/vllm_openai_serve.py
109 changes: 109 additions & 0 deletions gen-ai/inference/vllm-rayserve-gpu/image/vllm_openai_serve.py
@@ -0,0 +1,109 @@
from typing import Dict, Optional, List

from fastapi import FastAPI

from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.entrypoints.openai.serving_engine import BaseModelPath

import json
from typing import AsyncGenerator
from fastapi import BackgroundTasks
from starlette.requests import Request
from starlette.responses import StreamingResponse, Response, JSONResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid


from ray import serve
import os
import logging

from huggingface_hub import login

# Environment and configuration setup
logger = logging.getLogger("ray.serve")

app = FastAPI()

@serve.deployment
@serve.ingress(app)
class VLLMDeployment:
    def __init__(self, **kwargs):
        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        if not hf_token:
            raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is not set")
        login(token=hf_token)
        logger.info("Successfully logged in to Hugging Face Hub")

        args = AsyncEngineArgs(
            model=os.getenv("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2"), # Model identifier from Hugging Face Hub or local path.
            dtype="auto", # Automatically determine the data type (e.g., float16 or float32) for model weights and computations.
            gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", "0.8")), # Percentage of GPU memory to utilize, reserving some for overhead.
            max_model_len=int(os.getenv("MAX_MODEL_LEN", "4096")), # Maximum sequence length (in tokens) the model can handle, including both input and output tokens.
            max_num_seqs=int(os.getenv("MAX_NUM_SEQ", "512")), # Maximum number of sequences (requests) to process in parallel.
            max_num_batched_tokens=int(os.getenv("MAX_NUM_BATCHED_TOKENS", "32768")), # Maximum number of tokens processed in a single batch across all sequences (max_model_len * max_num_seqs).
            trust_remote_code=True, # Allow execution of untrusted code from the model repository (use with caution).
            enable_chunked_prefill=False, # Disable chunked prefill to avoid compatibility issues with prefix caching.
            tokenizer_pool_size=4, # Number of tokenizer instances to handle concurrent requests efficiently.
            tokenizer_pool_type="ray", # Pool type for tokenizers; 'ray' uses Ray for distributed processing.
            # max_parallel_loading_workers=2, # Number of parallel workers to load the model concurrently.
            pipeline_parallel_size=int(os.getenv("NUM_OF_NODES", "1")), # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
            tensor_parallel_size=int(os.getenv("NUM_OF_GPU", "1")), # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
            enable_prefix_caching=True, # Enable prefix caching to improve performance for similar prompt prefixes.
            enforce_eager=True,
            disable_log_requests=True
        )

        self.response_role = os.getenv("RESPONSE_ROLE", "assistant")
        self.engine_args = args
        self.engine = AsyncLLMEngine.from_engine_args(args)
        self.max_model_len = args.max_model_len
        self.openai_serving_chat = None
        logger.info(f"VLLM Engine initialized with max_model_len: {self.max_model_len}")

    @app.post("/v1/chat/completions")
    async def create_chat_completion(
        self, request: ChatCompletionRequest, raw_request: Request
    ):
        """OpenAI-compatible HTTP endpoint.
        API reference:
        - https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
        """
        if not self.openai_serving_chat:
            model_config = await self.engine.get_model_config()
            # Determine the name of the served model for the OpenAI client.
            served_model_names = [BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)]
            self.openai_serving_chat = OpenAIServingChat(
                self.engine,
                model_config,
                served_model_names,
                self.response_role,
                lora_modules=None,
                prompt_adapters=None,
                request_logger=None,
                chat_template=None,
            )
        logger.info(f"Request: {request}")
        generator = await self.openai_serving_chat.create_chat_completion(
            request, raw_request
        )
        if isinstance(generator, ErrorResponse):
            return JSONResponse(
                content=generator.model_dump(), status_code=generator.code
            )
        if request.stream:
            return StreamingResponse(content=generator, media_type="text/event-stream")
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())

deployment = VLLMDeployment.bind()
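For context (not part of this commit): a minimal Python client sketch against the OpenAI-compatible route defined above. It assumes the Serve HTTP port has been forwarded to localhost:8000 (for example, kubectl -n rayserve-vllm port-forward svc/vllm-serve-svc 8000:8000 — the exact service name depends on what KubeRay generates for the RayService) and that the llama3 application is live; the URL combines the /llama3 route_prefix from serveConfigV2 with the /v1/chat/completions route registered on the FastAPI app.

import requests

# Hypothetical endpoint: route_prefix "/llama3" + "/v1/chat/completions", reached via a local port-forward.
url = "http://localhost:8000/llama3/v1/chat/completions"

payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # must match the MODEL_ID served by the application
    "messages": [{"role": "user", "content": "Summarize what Ray Serve does in one sentence."}],
    "max_tokens": 128,
    "stream": False,
}

# Non-streaming request; the response follows the OpenAI chat completions schema.
resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])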
gen-ai/inference/vllm-rayserve-gpu/image/vllm_serve.py
@@ -16,10 +16,7 @@
# Environment and configuration setup
logger = logging.getLogger("ray.serve")

-@serve.deployment(name="mistral-deployment", route_prefix="/vllm",
-                  ray_actor_options={"num_gpus": 1},
-                  autoscaling_config={"min_replicas": 1, "max_replicas": 2},
-                  )
+@serve.deployment
class VLLMDeployment:
    def __init__(self, **kwargs):
        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
@@ -41,8 +38,8 @@ def __init__(self, **kwargs):
            tokenizer_pool_size=4, # Number of tokenizer instances to handle concurrent requests efficiently.
            tokenizer_pool_type="ray", # Pool type for tokenizers; 'ray' uses Ray for distributed processing.
            max_parallel_loading_workers=2, # Number of parallel workers to load the model concurrently.
-           pipeline_parallel_size=1, # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
-           tensor_parallel_size=1, # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
+           pipeline_parallel_size=int(os.getenv("NUM_OF_NODES", "1")), # Number of pipeline parallelism stages; typically set to 1 unless using model parallelism.
+           tensor_parallel_size=int(os.getenv("NUM_OF_GPU", "1")), # Number of tensor parallelism stages; typically set to 1 unless using model parallelism.
            enable_prefix_caching=True, # Enable prefix caching to improve performance for similar prompt prefixes.
            enforce_eager=True,
            disable_log_requests=True,
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml
@@ -0,0 +1,37 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm
  namespace: rayserve-vllm
spec:
  serveConfigV2: |
    applications:
      - name: llama3
        import_path: "vllm_openai_serve:deployment"
        route_prefix: "/llama3"
        runtime_env:
          env_vars:
            MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
            GPU_MEMORY_UTILIZATION: "0.9"
            MAX_MODEL_LEN: "4096"
            MAX_NUM_SEQ: "4"
            MAX_NUM_BATCHED_TOKENS: "32768"
            NUM_OF_GPU: "2"
            VLLM_ATTENTION_BACKEND: "XFORMERS"
        deployments:
          - name: VLLMDeployment
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 1
              max_replicas: 4
              look_back_period_s: 2
              downscale_delay_s: 600
              upscale_delay_s: 30
              target_num_ongoing_requests_per_replica: 20
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 4
              num_gpus: 2
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rayserve-vllm
resources:
- ../../base
patches:
- path: model-config.yaml