diff --git a/gen-ai/inference/vllm-rayserve-gpu/Dockerfile b/gen-ai/inference/vllm-rayserve-gpu/Dockerfile
index 68647ff06..a99a63119 100644
--- a/gen-ai/inference/vllm-rayserve-gpu/Dockerfile
+++ b/gen-ai/inference/vllm-rayserve-gpu/Dockerfile
@@ -6,16 +6,13 @@ LABEL maintainer="DoEKS"
 
 # Set environment variables to non-interactive (this prevents some prompts)
 ENV DEBIAN_FRONTEND=non-interactive
-
-# Install vLLM and other dependencies
-RUN pip install vllm==0.4.3
-
-RUN pip install huggingface_hub
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
 
 WORKDIR /app
 
+# Install vLLM and other dependencies
+RUN pip install vllm==0.4.3 huggingface_hub
+
 # Copy the serving script into the container
 COPY vllm_serve.py /app/vllm_serve.py
-
-# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
-ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
diff --git a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml
index d8e56c5c4..068f7f869 100644
--- a/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml
+++ b/gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml
@@ -78,12 +78,19 @@ spec:
               name: ray-logs
           resources:
             limits:
-              cpu: 16
-              memory: "32G"
+              cpu: 1
+              memory: "6G"
             requests:
-              cpu: 16
-              memory: "32G"
+              cpu: 1
+              memory: "6G"
           env:
+            - name: RAY_GRAFANA_HOST
+              value: FILLIN
+            - name: RAY_PROMETHEUS_HOST
+              value: >-
+                FILLIN
+            - name: RAY_GRAFANA_IFRAME_HOST
+              value: FILLIN
             - name: VLLM_PORT
               value: "8000"
             - name: LD_LIBRARY_PATH
@@ -131,6 +138,11 @@ spec:
               value: "8000"
             - name: LD_LIBRARY_PATH
               value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token
+                  key: hf-token
         nodeSelector:
           NodeGroupType: g5-gpu-karpenter
           type: karpenter
diff --git a/gen-ai/inference/vllm-rayserve-gpu/serve.py b/gen-ai/inference/vllm-rayserve-gpu/serve.py
new file mode 100644
index 000000000..6f610ff80
--- /dev/null
+++ b/gen-ai/inference/vllm-rayserve-gpu/serve.py
@@ -0,0 +1,40 @@
+import requests
+import os
+
+
+# Constants for model endpoint and service name
+model_endpoint = os.environ.get("MODEL_ENDPOINT", "vllm")
+service_name = os.environ.get("SERVICE_NAME", "http://localhost:56449/")
+
+# Function to generate text
+def generate_text(message, history):
+    prompt = message
+
+    # Create the URL for the inference
+    url = f"{service_name}{model_endpoint}"
+
+    try:
+        # Send the request to the model service
+        response = requests.post(url, json={"prompt": prompt}, timeout=180)
+        # print(response.text)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        prompt_to_replace = "[INST]" + prompt + "[/INST]"
+
+        # Removing the original prompt with instruction set from the output
+        text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
+        # remove '<s>' strikethrough markdown
+        if text.startswith("<s>"):
+            text = text.replace("<s>", "", 1)
+
+        text = text.replace("</s>", "", 1)
+
+        answer_only = text
+
+        # Safety filter to remove harmful or inappropriate content
+        # answer_only = filter_harmful_content(answer_only)
+        return answer_only
+    except requests.exceptions.RequestException as e:
+        # Handle any request exceptions (e.g., connection errors)
+        return f"AI: Error: {str(e)}"
+
+print(generate_text("[INST] What is your favourite condiment? [/INST]", []))
[/INST]", [])) diff --git a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py index 064a20729..3bf29a27b 100644 --- a/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py +++ b/gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py @@ -31,10 +31,11 @@ ) class VLLMDeployment: def __init__(self, **kwargs): - hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN") login(token=hf_token) logger.info(f"login to HF done") args = AsyncEngineArgs(**kwargs) + args.max_model_len=16992 self.engine = AsyncLLMEngine.from_engine_args(args) async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]: @@ -66,7 +67,7 @@ async def __call__(self, request: Request) -> Response: results_generator = self.engine.generate(prompt, sampling_params, request_id) if stream: background_tasks = BackgroundTasks() - # Using background_taks to abort the the request + # Using background_tasks to abort the request # if the client disconnects. background_tasks.add_task(self.may_abort_request, request_id) return StreamingResponse( @@ -92,4 +93,4 @@ async def __call__(self, request: Request) -> Response: deployment = VLLMDeployment.bind(model="mistralai/Mistral-7B-Instruct-v0.2", dtype="bfloat16", trust_remote_code=True, - ) \ No newline at end of file + )