feat: New Gen-AI Blueprint for Serving Inference Using Mistral-7b-Instruct-V0.2 Model on AWS Inferentia2 (#512)

Signed-off-by: Vara Bonthu <[email protected]>
Co-authored-by: Vara Bonthu <[email protected]>
1 parent 3506940 · commit 3c6f7ae · Showing 15 changed files with 678 additions and 1 deletion.
@@ -0,0 +1,15 @@
# Dockerfile to build a container image for the Gradio app for the Mistral-7b model

# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Copy the Python script into the container
COPY gradio-app-mistral.py /app/gradio-app-mistral.py

# Install the app's runtime dependencies
RUN pip install --no-cache-dir gradio requests

# Command to run the Python script
ENTRYPOINT ["python", "gradio-app-mistral.py"]
@@ -0,0 +1,65 @@
import gradio as gr
import requests
import os


# Constants for model endpoint and service name
model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")


# Function to generate text
def generate_text(message, history):
    prompt = message

    # Create the URL for the inference request
    url = f"{service_name}{model_endpoint}"

    try:
        # Send the request to the model service
        response = requests.get(url, params={"sentence": prompt}, timeout=180)
        response.raise_for_status()  # Raise an exception for HTTP errors
        prompt_to_replace = "[INST]" + prompt + "[/INST]"

        # Remove the original prompt (wrapped in instruction tags) from the output
        text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
        # Strip the '<s>' BOS token, which Markdown would render as strikethrough
        if text.startswith("<s>"):
            text = text.replace("<s>", "", 1)

        text = text.replace("</s>", "", 1)

        answer_only = text

        # Safety filter to remove harmful or inappropriate content
        answer_only = filter_harmful_content(answer_only)
        return answer_only
    except requests.exceptions.RequestException as e:
        # Handle any request exceptions (e.g., connection errors)
        return f"AI: Error: {str(e)}"


# Define the safety filter function (implement as needed)
def filter_harmful_content(text):
    # TODO: Implement a safety filter to remove harmful or inappropriate content
    # For now, simply return the text as-is
    return text


# Define the Gradio ChatInterface
chat_interface = gr.ChatInterface(
    generate_text,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Mistral AI Chat",
    description="Ask me any question",
    theme="soft",
    examples=["How big is the observable universe", "How to kill a Linux process"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# Launch the ChatInterface
chat_interface.launch()
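
The chat UI above is a thin client over a single GET route. For a quick smoke test without the UI, the same route can be called directly; a minimal sketch, assuming the RayServe service has been forwarded to localhost:8000 (matching the SERVICE_NAME and MODEL_ENDPOINT defaults above):

import requests

# Assumes the RayServe service is reachable locally, e.g. via a port-forward
# to port 8000; "/infer" and the "sentence" parameter mirror the app above.
response = requests.get(
    "http://localhost:8000/infer",
    params={"sentence": "What is AWS Inferentia2?"},
    timeout=180,
)
response.raise_for_status()
print(response.text)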
@@ -0,0 +1,41 @@
# https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
FROM rayproject/ray:2.11.0-py310

# Maintainer label
LABEL maintainer="DoEKS"

# Set the Debian frontend to non-interactive (this prevents some prompts)
ENV DEBIAN_FRONTEND=noninteractive

# Switch to root to add the Neuron repo and install necessary packages
USER root

# Set up the Neuron repository and install Neuron packages
RUN . /etc/os-release && \
    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
    apt-get update -y && \
    apt-get install git -y && \
    apt-get install aws-neuronx-dkms=2.* -y && \
    apt-get install aws-neuronx-collectives=2.* -y && \
    apt-get install aws-neuronx-runtime-lib=2.* -y && \
    apt-get install aws-neuronx-tools=2.* -y

# Switch back to a non-root user for the subsequent commands
USER $USER

# Point pip at the Neuron repository and install the required Python packages
# huggingface_hub is needed to log in to the Hugging Face repo for model access
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
    pip install starlette==0.34.0 && \
    pip install huggingface_hub

# Add Neuron path to PATH
ENV PATH /opt/aws/neuron/bin:$PATH

WORKDIR /serve_app

COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
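
The final COPY pulls in ray_serve_mistral.py, which is not included in this excerpt. For orientation only, here is a hypothetical skeleton of a RayServe entrypoint consistent with the manifest below (import path ray_serve_mistral:entrypoint, deployment mistral-7b, a GET /infer route with a sentence parameter); the model loading is stubbed out, and this is not the blueprint's actual code:

# Hypothetical sketch only -- the real ray_serve_mistral.py is not shown in
# this diff. Names below mirror the RayService manifest in this commit.
import os

from fastapi import FastAPI
from ray import serve

app = FastAPI()


@serve.deployment(name="mistral-7b")
@serve.ingress(app)
class MistralDeployment:
    def __init__(self):
        # The actual blueprint compiles/loads the model onto Neuron cores here,
        # using MODEL_ID and HUGGING_FACE_HUB_TOKEN from the runtime_env.
        self.model_id = os.environ.get("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")

    @app.get("/infer")
    def infer(self, sentence: str) -> str:
        # Placeholder: the real implementation wraps the prompt in [INST] tags
        # and runs generation on the compiled Neuron model.
        return f"[INST]{sentence}[/INST] (model output placeholder)"


entrypoint = MistralDeployment.bind()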
gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml (58 additions, 0 deletions)
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gradio
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
      - name: gradio
        # Update this image to the Gradio app image you want to deploy
        image: public.ecr.aws/data-on-eks/gradio-app:mistral-7b
        imagePullPolicy: Always
        ports:
        - containerPort: 7860
        resources:
          requests:
            cpu: "512m"
            memory: "2048Mi"
          limits:
            cpu: "1"
            memory: "4096Mi"
        env:
        - name: MODEL_ENDPOINT
          value: "/infer"
        # Please note that the service name is currently hardcoded to match the Mistral service for this blueprint.
        # If there are any updates or changes to the actual RayServe deployment, you'll need to update the service name in this code accordingly.
        - name: SERVICE_NAME
          value: "http://mistral-service.mistral.svc.cluster.local:8000"
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio
spec:
  selector:
    app: gradio
  ports:
  - name: http
    protocol: TCP
    port: 7860
    targetPort: 7860
  type: ClusterIP
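
These two env vars are the only wiring between the UI and the model service; a tiny sketch of how the app composes them into the request URL (values copied from the Deployment above):

import os

# Values from the env block above; SERVICE_NAME is the cross-namespace
# DNS name of the RayServe head service in the "mistral" namespace.
os.environ["MODEL_ENDPOINT"] = "/infer"
os.environ["SERVICE_NAME"] = "http://mistral-service.mistral.svc.cluster.local:8000"

url = os.environ["SERVICE_NAME"] + os.environ["MODEL_ENDPOINT"]
print(url)  # http://mistral-service.mistral.svc.cluster.local:8000/infer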
gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml (148 additions, 0 deletions)
@@ -0,0 +1,148 @@
#----------------------------------------------------------------------
# NOTE: For deployment instructions, refer to the DoEKS website.
#----------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: mistral

---
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: mistral-service
  namespace: mistral
spec:
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: mistral-deployment
        import_path: "ray_serve_mistral:entrypoint"
        route_prefix: "/"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            NEURON_CC_FLAGS: "-O1"
            HUGGING_FACE_HUB_TOKEN: $HUGGING_FACE_HUB_TOKEN
        deployments:
          - name: mistral-7b
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 2
              max_replicas: 12
              look_back_period_s: 2
              downscale_delay_s: 30
              upscale_delay_s: 2
              target_num_ongoing_requests_per_replica: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 10
              resources: {"neuron_cores": 2}
  rayClusterConfig:
    rayVersion: '2.11.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      serviceType: NodePort
      headService:
        metadata:
          name: mistral-service
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
          - name: ray-head
            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            ports:
            - containerPort: 6379
              name: gcs-server
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
            - containerPort: 8000
              name: serve
            volumeMounts:
            - mountPath: /tmp/ray
              name: ray-logs
            resources:
              limits:
                cpu: "2"
                memory: "20G"
              requests:
                cpu: "2"
                memory: "20G"
          nodeSelector:
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
          - name: ray-logs
            emptyDir: {}

    workerGroupSpecs:
    - groupName: inf2-worker-group
      replicas: 1
      minReplicas: 1
      maxReplicas: 5
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-worker
            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            # Each HTTP request pins 2 Neuron cores; with 12 NeuronCores (6 devices)
            # on an inf2.24xlarge, one worker handles up to 6 concurrent requests
            resources:
              limits:
                cpu: "90" # All vCPUs of inf2.24xlarge, minus 6 vCPUs for daemonset overhead
                memory: "360G" # All memory of inf2.24xlarge, minus 24G for daemonset overhead
                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 cores) of inf2.24xlarge
              requests:
                cpu: "90" # All vCPUs of inf2.24xlarge, minus 6 vCPUs for daemonset overhead
                memory: "360G" # All memory of inf2.24xlarge, minus 24G for daemonset overhead
                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 cores) of inf2.24xlarge
          nodeSelector:
            instanceType: inferentia-inf2
            provisionerType: Karpenter
          tolerations:
          - key: "aws.amazon.com/neuron"
            operator: "Exists"
            effect: "NoSchedule"
          - key: "hub.jupyter.org/dedicated"
            operator: "Equal"
            value: "user"
            effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mistral-ingress
  namespace: mistral
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
spec:
  ingressClassName: nginx
  rules:
  - http:
      paths:
      # Ray Dashboard
      - path: /dashboard/(.*)
        pathType: ImplementationSpecific
        backend:
          service:
            name: mistral-service
            port:
              number: 8265
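
A back-of-the-envelope check of the worker sizing comments above (my arithmetic, not part of the blueprint):

# Concurrency check for one inf2.24xlarge worker.
NEURON_DEVICES = 6       # aws.amazon.com/neuron resource in the manifest
CORES_PER_DEVICE = 2     # each Inferentia2 device exposes 2 NeuronCores
CORES_PER_REQUEST = 2    # ray_actor_options resources: {"neuron_cores": 2}

total_cores = NEURON_DEVICES * CORES_PER_DEVICE          # 12
concurrent_requests = total_cores // CORES_PER_REQUEST   # 6 per worker
print(total_cores, concurrent_requests)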