update vllm deployment and add a local testing script #1

Open · wants to merge 4 commits into base: ray-vllm-gpu

Changes from 2 commits
13 changes: 5 additions & 8 deletions gen-ai/inference/vllm-rayserve-gpu/Dockerfile
@@ -6,16 +6,13 @@ LABEL maintainer="DoEKS"

# Set environment variables to non-interactive (this prevents some prompts)
ENV DEBIAN_FRONTEND=non-interactive

-# Install vLLM and other dependencies
-RUN pip install vllm==0.4.3
-
-RUN pip install huggingface_hub
-# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
-ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
-
WORKDIR /app

+# Install vLLM and other dependencies
+RUN pip install vllm==0.4.3 huggingface_hub
+
# Copy the serving script into the container
COPY vllm_serve.py /app/vllm_serve.py
-
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
23 changes: 16 additions & 7 deletions gen-ai/inference/vllm-rayserve-gpu/ray-service-vllm.yaml
@@ -78,12 +78,19 @@ spec:
          name: ray-logs
        resources:
          limits:
-           cpu: 16
-           memory: "32G"
+           cpu: 1
+           memory: "6G"
          requests:
-           cpu: 16
-           memory: "32G"
+           cpu: 1
+           memory: "6G"
        env:
+       - name: RAY_GRAFANA_HOST
+         value: FILLIN
+       - name: RAY_PROMETHEUS_HOST
+         value: >-
+           FILLIN
+       - name: RAY_GRAFANA_IFRAME_HOST
+         value: FILLIN

Comment on lines +87 to +93

Reviewer: This blueprint also deploys the Kube Prometheus stack. Can we update these to localhost services with the port?

Author: I don't know what those are for your deployment. I'll let @ratnopamc add them

        - name: VLLM_PORT
          value: "8000"
        - name: LD_LIBRARY_PATH
@@ -131,9 +138,11 @@ spec:
value: "8000"
- name: LD_LIBRARY_PATH
value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
nodeSelector:
NodeGroupType: g5-gpu-karpenter
type: karpenter
omrishiv marked this conversation as resolved.
Show resolved Hide resolved
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: hf-token
# Please add the following taints to the GPU node.
tolerations:
- key: "nvidia.com/gpu"
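The new HUGGING_FACE_HUB_TOKEN env var reads the token from a Kubernetes Secret named hf-token with key hf-token, so that Secret must exist in the cluster before the RayService pods start. Below is a minimal sketch of creating it with the Kubernetes Python client; the "default" namespace and reading the token from the local environment are assumptions, not part of this PR (kubectl create secret generic works just as well).

# Hypothetical helper: create the hf-token Secret that the secretKeyRef above expects.
# Assumes kubeconfig access to the cluster and that the RayService runs in "default".
import os

from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() when running inside the cluster

secret = client.V1Secret(
    metadata=client.V1ObjectMeta(name="hf-token"),
    type="Opaque",
    # string_data lets the API server handle base64 encoding for us
    string_data={"hf-token": os.environ["HUGGING_FACE_HUB_TOKEN"]},
)
client.CoreV1Api().create_namespaced_secret(namespace="default", body=secret)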
40 changes: 40 additions & 0 deletions gen-ai/inference/vllm-rayserve-gpu/serve.py
@@ -0,0 +1,40 @@
import requests
import os


# Constants for model endpoint and service name
model_endpoint = os.environ.get("MODEL_ENDPOINT", "vllm")
service_name = os.environ.get("SERVICE_NAME", "http://localhost:56449/")

# Function to generate text
def generate_text(message, history):
prompt = message

# Create the URL for the inference
url = f"{service_name}{model_endpoint}"

try:
# Send the request to the model service
response = requests.post(url, json={"prompt": prompt}, timeout=180)
# print(response.text)
response.raise_for_status() # Raise an exception for HTTP errors
prompt_to_replace = "[INST]" + prompt + "[/INST]"

# Removing the original prompt with instruction set from the output
text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
# remove '<s>' strikethrough markdown
if text.startswith("<s>"):
text = text.replace("<s>", "", 1)

text = text.replace("</s>", "", 1)

answer_only = text

# Safety filter to remove harmful or inappropriate content
# answer_only = filter_harmful_content(answer_only)
return answer_only
except requests.exceptions.RequestException as e:
# Handle any request exceptions (e.g., connection errors)
return f"AI: Error: {str(e)}"

print(generate_text("<s>[INST] What is your favourite condiment? [/INST]", []))
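serve.py only exercises the non-streaming path. As a rough companion sketch, the request below reuses the same SERVICE_NAME/MODEL_ENDPOINT defaults but asks for a streamed response; the "stream" key in the JSON body is an assumption based on the `if stream:` branch in vllm_serve.py, not something serve.py itself sends.

# Hedged sketch: stream tokens from the same endpoint serve.py targets.
# Assumes the deployment honours a "stream" flag in the request body
# (suggested by the `if stream:` branch in vllm_serve.py).
import os

import requests

service_name = os.environ.get("SERVICE_NAME", "http://localhost:56449/")
model_endpoint = os.environ.get("MODEL_ENDPOINT", "vllm")

with requests.post(
    f"{service_name}{model_endpoint}",
    json={"prompt": "<s>[INST] What is your favourite condiment? [/INST]", "stream": True},
    stream=True,   # let requests hand us chunks as the StreamingResponse emits them
    timeout=180,
) as response:
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)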
7 changes: 4 additions & 3 deletions gen-ai/inference/vllm-rayserve-gpu/vllm_serve.py
@@ -31,10 +31,11 @@
)
class VLLMDeployment:
    def __init__(self, **kwargs):
-        hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+        hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        login(token=hf_token)
        logger.info(f"login to HF done")
        args = AsyncEngineArgs(**kwargs)
+        args.max_model_len=16992
        self.engine = AsyncLLMEngine.from_engine_args(args)

    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
@@ -66,7 +67,7 @@ async def __call__(self, request: Request) -> Response:
        results_generator = self.engine.generate(prompt, sampling_params, request_id)
        if stream:
            background_tasks = BackgroundTasks()
-            # Using background_taks to abort the the request
+            # Using background_tasks to abort the request
            # if the client disconnects.
            background_tasks.add_task(self.may_abort_request, request_id)
            return StreamingResponse(
@@ -92,4 +93,4 @@ async def __call__(self, request: Request) -> Response:
deployment = VLLMDeployment.bind(model="mistralai/Mistral-7B-Instruct-v0.2",

Comment: we should be able to send all the vLLM config for each model

                                 dtype="bfloat16",
                                 trust_remote_code=True,
-)
+)
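Since VLLMDeployment.__init__ forwards **kwargs straight into AsyncEngineArgs, the per-model vLLM configuration the comment asks for could be passed at bind time instead of being hard-coded (as max_model_len currently is in __init__). A minimal sketch, with illustrative values rather than settings from this PR:

# Illustrative only: any field AsyncEngineArgs accepts can be forwarded through bind().
deployment = VLLMDeployment.bind(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    dtype="bfloat16",
    trust_remote_code=True,
    max_model_len=16992,          # would replace the hard-coded override in __init__
    gpu_memory_utilization=0.9,   # assumed example of another engine argument
)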