From 706a4ba365ec44b4f15e2c49a51a571441a3eae5 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:37:05 -0700 Subject: [PATCH 1/4] update vllm deployment and add a local testing script --- gen-ai/inference/vllm-rayserve-gpu/Dockerfile | 13 +++--- .../vllm-rayserve-gpu/ray-service-vllm.yaml | 23 +++++++---- gen-ai/inference/vllm-rayserve-gpu/serve.py | 40 +++++++++++++++++++ .../inference/vllm-rayserve-gpu/vllm_serve.py | 5 ++- 4 files changed, 64 insertions(+), 17 deletions(-) create mode 100644 gen-ai/inference/vllm-rayserve-gpu/serve.py diff --git a/gen-ai/inference/vllm-rayserve-gpu/Dockerfile b/gen-ai/inference/vllm-rayserve-gpu/Dockerfile index 68647ff06..a99a63119 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/Dockerfile +++ b/gen-ai/inference/vllm-rayserve-gpu/Dockerfile @@ -6,16 +6,13 @@ LABEL maintainer="DoEKS" # Set environment variables to non-interactive (this prevents some prompts) ENV DEBIAN_FRONTEND=non-interactive - -# Install vLLM and other dependencies -RUN pip install vllm==0.4.3 - -RUN pip install huggingface_hub +# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0 +ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH WORKDIR /app +# Install vLLM and other dependencies +RUN pip install vllm==0.4.3 huggingface_hub + # Copy the serving script into the container COPY vllm_serve.py /app/vllm_serve.py - -# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0 -ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH diff --git a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml index d8e56c5c4..b4051f3b5 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml +++ b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml @@ -78,12 +78,19 @@ spec: name: ray-logs resources: limits: - cpu: 16 - memory: "32G" + cpu: 1 + memory: "6G" requests: - cpu: 16 - memory: "32G" + cpu: 1 + memory: "6G" env: + - name: RAY_GRAFANA_HOST + value: FILLIN + - name: RAY_PROMETHEUS_HOST + value: >- + FILLIN + - name: RAY_GRAFANA_IFRAME_HOST + value: FILLIN - name: VLLM_PORT value: "8000" - name: LD_LIBRARY_PATH @@ -131,9 +138,11 @@ spec: value: "8000" - name: LD_LIBRARY_PATH value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH" - nodeSelector: - NodeGroupType: g5-gpu-karpenter - type: karpenter + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: hf-token # Please add the following taints to the GPU node. tolerations: - key: "nvidia.com/gpu" diff --git a/gen-ai/inference/vllm-rayserve-gpu/serve.py b/gen-ai/inference/vllm-rayserve-gpu/serve.py new file mode 100644 index 000000000..7684ad88a --- /dev/null +++ b/gen-ai/inference/vllm-rayserve-gpu/serve.py @@ -0,0 +1,40 @@ +import requests +import os + + +# Constants for model endpoint and service name +model_endpoint = os.environ.get("MODEL_ENDPOINT", "vllm") +service_name = os.environ.get("SERVICE_NAME", "http://localhost:56449/") + +# Function to generate text +def generate_text(message, history): + prompt = message + + # Create the URL for the inference + url = f"{service_name}{model_endpoint}" + + try: + # Send the request to the model service + response = requests.post(url, json={"prompt": prompt}, timeout=180) + # print(response.text) + response.raise_for_status() # Raise an exception for HTTP errors + prompt_to_replace = "[INST]" + prompt + "[/INST]" + + # Removing the original prompt with instruction set from the output + text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n') + # remove '' strikethrough markdown + if text.startswith(""): + text = text.replace("", "", 1) + + text = text.replace("", "", 1) + + answer_only = text + + # Safety filter to remove harmful or inappropriate content + # answer_only = filter_harmful_content(answer_only) + return answer_only + except requests.exceptions.RequestException as e: + # Handle any request exceptions (e.g., connection errors) + return f"AI: Error: {str(e)}" + +print(generate_text("[INST] What is your favourite condiment? [/INST]", [])) \ No newline at end of file diff --git a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py index 064a20729..c9b1b2eb5 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py +++ b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py @@ -31,10 +31,11 @@ ) class VLLMDeployment: def __init__(self, **kwargs): - hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN") login(token=hf_token) logger.info(f"login to HF done") args = AsyncEngineArgs(**kwargs) + args.max_model_len=16992 self.engine = AsyncLLMEngine.from_engine_args(args) async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]: @@ -66,7 +67,7 @@ async def __call__(self, request: Request) -> Response: results_generator = self.engine.generate(prompt, sampling_params, request_id) if stream: background_tasks = BackgroundTasks() - # Using background_taks to abort the the request + # Using background_tasks to abort the request # if the client disconnects. background_tasks.add_task(self.may_abort_request, request_id) return StreamingResponse( From 7073accc514002131e94fc219aebc88f0cb639dd Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:55:05 -0700 Subject: [PATCH 2/4] newline --- gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py index c9b1b2eb5..3bf29a27b 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py +++ b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py @@ -93,4 +93,4 @@ async def __call__(self, request: Request) -> Response: deployment = VLLMDeployment.bind(model="mistralai/Mistral-7B-Instruct-v0.2", dtype="bfloat16", trust_remote_code=True, - ) \ No newline at end of file + ) From d04510932b5346b9e35a111a5f7241364bad244e Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:13:16 -0700 Subject: [PATCH 3/4] undo nodeSelector removal --- gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml index b4051f3b5..068f7f869 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml +++ b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml @@ -143,6 +143,9 @@ spec: secretKeyRef: name: hf-token key: hf-token + nodeSelector: + NodeGroupType: g5-gpu-karpenter + type: karpenter # Please add the following taints to the GPU node. tolerations: - key: "nvidia.com/gpu" From 2a16138e0b0f3e1e37534bd35b94e22a3b945fa4 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:14:16 -0700 Subject: [PATCH 4/4] newline --- gen-ai/inference/vllm-rayserve-gpu/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen-ai/inference/vllm-rayserve-gpu/serve.py b/gen-ai/inference/vllm-rayserve-gpu/serve.py index 7684ad88a..6f610ff80 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/serve.py +++ b/gen-ai/inference/vllm-rayserve-gpu/serve.py @@ -37,4 +37,4 @@ def generate_text(message, history): # Handle any request exceptions (e.g., connection errors) return f"AI: Error: {str(e)}" -print(generate_text("[INST] What is your favourite condiment? [/INST]", [])) \ No newline at end of file +print(generate_text("[INST] What is your favourite condiment? [/INST]", []))