
Commit

feat: New Gen-AI Blueprint for Serving Inference Using Mistral-7b-Instruct-V0.2 Model on AWS Inferentia2 (#512)

Signed-off-by: Vara Bonthu <[email protected]>
Co-authored-by: Vara Bonthu <[email protected]>
ratnopamc and vara-bonthu authored Apr 27, 2024
1 parent 3506940 commit 3c6f7ae
Showing 15 changed files with 678 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
@@ -74,4 +74,4 @@ This library is licensed under the Apache 2.0 License.
## 🙌 Community
We welcome all individuals who are enthusiastic about data on Kubernetes to become a part of this open source community. Your contributions and participation are invaluable to the success of this project.

-Built with ❤️ at AWS.
+Built with ❤️ at AWS.
15 changes: 15 additions & 0 deletions gen-ai/inference/gradio-ui/Dockerfile-gradio-app-mistral
@@ -0,0 +1,15 @@
# Dockerfile to build a container image for the Gradio app for Mistral-7b model

# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Copy the Python script into the container
COPY gradio-app-mistral.py /app/gradio-app-mistral.py

# Install the app's runtime dependencies
RUN pip install --no-cache-dir gradio requests

# Command to run the Python script
ENTRYPOINT ["python", "gradio-app-mistral.py"]
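
To try the UI locally before deploying, a minimal sketch of building and running this image (the tag matches the image referenced in the deployment manifest below; the RayServe URL is illustrative and should point at a reachable endpoint):

docker build -f Dockerfile-gradio-app-mistral -t gradio-app:mistral-7b .
docker run -p 7860:7860 \
  -e MODEL_ENDPOINT="/infer" \
  -e SERVICE_NAME="http://localhost:8000" \
  gradio-app:mistral-7b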
65 changes: 65 additions & 0 deletions gen-ai/inference/gradio-ui/gradio-app-mistral.py
@@ -0,0 +1,65 @@
import gradio as gr
import requests
import os


# Constants for the model endpoint and service name
model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")


# Function to generate text by querying the model service
def generate_text(message, history):
    prompt = message

    # Build the URL for the inference request
    url = f"{service_name}{model_endpoint}"

    try:
        # Send the request to the model service
        response = requests.get(url, params={"sentence": prompt}, timeout=180)
        response.raise_for_status()  # Raise an exception for HTTP errors
        prompt_to_replace = "[INST]" + prompt + "[/INST]"

        # Remove the original instruction-wrapped prompt from the output
        text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')

        # Strip the '<s>' token, which Gradio would render as strikethrough markdown
        if text.startswith("<s>"):
            text = text.replace("<s>", "", 1)

        text = text.replace("</s>", "", 1)

        answer_only = text

        # Safety filter to remove harmful or inappropriate content
        answer_only = filter_harmful_content(answer_only)
        return answer_only
    except requests.exceptions.RequestException as e:
        # Handle any request exceptions (e.g., connection errors)
        return f"AI: Error: {str(e)}"


# Define the safety filter function (implement as needed)
def filter_harmful_content(text):
    # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text

    # For now, simply return the text as-is
    return text


# Define the Gradio ChatInterface
chat_interface = gr.ChatInterface(
    generate_text,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Mistral AI Chat",
    description="Ask me any question",
    theme="soft",
    examples=["How big is the observable universe?", "How to kill a Linux process"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# Launch the ChatInterface
chat_interface.launch()
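
Note that the app sends the prompt as a plain GET request with a sentence query parameter, so the backend can be exercised directly without the UI; a sketch, assuming the RayServe endpoint has been port-forwarded to localhost:8000:

curl -G "http://localhost:8000/infer" --data-urlencode "sentence=What is Inferentia2?"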
41 changes: 41 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
@@ -0,0 +1,41 @@
# https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
FROM rayproject/ray:2.11.0-py310

# Maintainer label
LABEL maintainer="DoEKS"

# Set the Debian frontend to noninteractive (prevents apt prompts during the build)
ENV DEBIAN_FRONTEND=noninteractive

# Switch to root to add the Neuron repo and install necessary packages
USER root

# Set up the Neuron repository and install Neuron packages
# (running as root, so sudo is unnecessary here)
RUN . /etc/os-release && \
    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
    apt-get update -y && \
    apt-get install -y git \
        aws-neuronx-dkms=2.* \
        aws-neuronx-collectives=2.* \
        aws-neuronx-runtime-lib=2.* \
        aws-neuronx-tools=2.*

# Switch back to the non-root user for the subsequent commands
USER $USER

# Point pip at the Neuron repository and install the required Python packages.
# huggingface_hub is needed to log in to the Hugging Face Hub for model access.
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
    pip install starlette==0.34.0 && \
    pip install huggingface_hub

# Add the Neuron tools to PATH
ENV PATH=/opt/aws/neuron/bin:$PATH

WORKDIR /serve_app

COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
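
A sketch of building and publishing this serving image so the manifests below can pull it (the ECR account, region, and repository are placeholders; the blueprint itself references a prebuilt image under public.ecr.aws/data-on-eks):

docker build --platform linux/amd64 -t ray2.11.0-py310-mistral7b-neuron:latest .
docker tag ray2.11.0-py310-mistral7b-neuron:latest <account-id>.dkr.ecr.<region>.amazonaws.com/ray2.11.0-py310-mistral7b-neuron:latest
docker push <account-id>.dkr.ecr.<region>.amazonaws.com/ray2.11.0-py310-mistral7b-neuron:latest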
58 changes: 58 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gradio
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
        - name: gradio
          # Update this image to the Gradio app image you want to deploy
          image: public.ecr.aws/data-on-eks/gradio-app:mistral-7b
          imagePullPolicy: Always
          ports:
            - containerPort: 7860
          resources:
            requests:
              cpu: "512m"
              memory: "2048Mi"
            limits:
              cpu: "1"
              memory: "4096Mi"
          env:
            - name: MODEL_ENDPOINT
              value: "/infer"
            # NOTE: the service name is hardcoded to match the Mistral RayServe service
            # for this blueprint. If the RayServe deployment changes, update it here accordingly.
            - name: SERVICE_NAME
              value: "http://mistral-service.mistral.svc.cluster.local:8000"
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio
spec:
  selector:
    app: gradio
  ports:
    - name: http
      protocol: TCP
      port: 7860
      targetPort: 7860
  type: ClusterIP
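
Because gradio-service is ClusterIP, it is not reachable from outside the cluster by default; a minimal sketch of deploying the manifest and reaching the UI through a port-forward:

kubectl apply -f gradio-deploy.yaml
kubectl port-forward -n gradio svc/gradio-service 7860:7860
# The chat UI is then available at http://localhost:7860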
148 changes: 148 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -0,0 +1,148 @@
#----------------------------------------------------------------------
# NOTE: For deployment instructions, refer to the DoEKS website.
#----------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: mistral

---
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: mistral-service
  namespace: mistral
spec:
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: mistral-deployment
        import_path: "ray_serve_mistral:entrypoint"
        route_prefix: "/"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            NEURON_CC_FLAGS: "-O1"
            # Substituted with your Hugging Face token before the manifest is applied
            HUGGING_FACE_HUB_TOKEN: $HUGGING_FACE_HUB_TOKEN
        deployments:
          - name: mistral-7b
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 2
              max_replicas: 12
              look_back_period_s: 2
              downscale_delay_s: 30
              upscale_delay_s: 2
              target_num_ongoing_requests_per_replica: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 10
              resources: {"neuron_cores": 2}
  rayClusterConfig:
    rayVersion: '2.11.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      serviceType: NodePort
      headService:
        metadata:
          name: mistral-service
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
            - name: ray-head
              image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
              imagePullPolicy: Always # Ensure the image is always pulled when updated
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: "2"
                  memory: "20G"
                requests:
                  cpu: "2"
                  memory: "20G"
          nodeSelector:
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
            - name: ray-logs
              emptyDir: {}

    workerGroupSpecs:
      - groupName: inf2-worker-group
        replicas: 1
        minReplicas: 1
        maxReplicas: 5
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
                imagePullPolicy: Always # Ensure the image is always pulled when updated
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                # Each replica uses 2 Neuron cores, so one inf2.24xlarge worker
                # (12 NeuronCores) can serve up to 6 concurrent requests
                resources:
                  limits:
                    cpu: "90" # 90 of the 96 vCPUs on inf2.24xlarge; 6 reserved for daemonsets
                    memory: "360G" # Most of the 384 GiB on inf2.24xlarge; ~24 GiB reserved for daemonsets
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
                  requests:
                    cpu: "90" # 90 of the 96 vCPUs on inf2.24xlarge; 6 reserved for daemonsets
                    memory: "360G" # Most of the 384 GiB on inf2.24xlarge; ~24 GiB reserved for daemonsets
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
            tolerations:
              - key: "aws.amazon.com/neuron"
                operator: "Exists"
                effect: "NoSchedule"
              - key: "hub.jupyter.org/dedicated"
                operator: "Equal"
                value: "user"
                effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mistral-ingress
  namespace: mistral
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
spec:
  ingressClassName: nginx
  rules:
    - http:
        paths:
          # Ray Dashboard
          - path: /dashboard/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: mistral-service
                port:
                  number: 8265
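
The manifest references $HUGGING_FACE_HUB_TOKEN rather than embedding a token, so the variable must be substituted before the file is applied; a sketch using envsubst (refer to the DoEKS website for the blueprint's actual deployment steps):

export HUGGING_FACE_HUB_TOKEN=<your-hugging-face-token>
envsubst < ray-service-mistral.yaml | kubectl apply -f -

# Once the RayService reports healthy, test inference through a port-forward:
kubectl port-forward -n mistral svc/mistral-service 8000:8000
curl -G "http://localhost:8000/infer" --data-urlencode "sentence=What is Mistral-7B?"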
