feat: New Gen-AI Blueprint for Serving Inference Using Mistral-7b-Instruct-V0.2 Model on AWS Inferentia2 (#512)

Signed-off-by: Vara Bonthu <[email protected]>
Co-authored-by: Vara Bonthu <[email protected]>
1 parent 3506940 · commit 3c6f7ae · Showing 15 changed files with 678 additions and 1 deletion.
@@ -0,0 +1,15 @@
# Dockerfile to build a container image for the Gradio app for the Mistral-7b model

# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Copy the Python script into the container
COPY gradio-app-mistral.py /app/gradio-app-mistral.py

# Install the app's runtime dependencies
RUN pip install --no-cache-dir gradio requests

# Command to run the Python script
ENTRYPOINT ["python", "gradio-app-mistral.py"]
@@ -0,0 +1,65 @@
import gradio as gr
import requests
import os


# Constants for model endpoint and service name
model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")


# Function to generate text
def generate_text(message, history):
    prompt = message

    # Create the URL for the inference request
    url = f"{service_name}{model_endpoint}"

    try:
        # Send the request to the model service
        response = requests.get(url, params={"sentence": prompt}, timeout=180)
        response.raise_for_status()  # Raise an exception for HTTP errors
        prompt_to_replace = "[INST]" + prompt + "[/INST]"

        # Remove the original prompt (wrapped in instruction tags) from the output
        text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
        # Strip the '<s>' BOS token, which Markdown would render as strikethrough
        if text.startswith("<s>"):
            text = text.replace("<s>", "", 1)

        text = text.replace("</s>", "", 1)

        answer_only = text

        # Safety filter to remove harmful or inappropriate content
        answer_only = filter_harmful_content(answer_only)
        return answer_only
    except requests.exceptions.RequestException as e:
        # Handle any request exceptions (e.g., connection errors)
        return f"AI: Error: {str(e)}"


# Define the safety filter function (implement as needed)
def filter_harmful_content(text):
    # TODO: Implement a safety filter to remove harmful or inappropriate content
    # For now, simply return the text as-is
    return text


# Define the Gradio ChatInterface
chat_interface = gr.ChatInterface(
    generate_text,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Mistral AI Chat",
    description="Ask me any question",
    theme="soft",
    examples=["How big is the observable universe", "How to kill a Linux process"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# Launch the ChatInterface
chat_interface.launch()
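
The chat UI above is a thin client over a single GET route. For a quick smoke test without the UI, the same route can be called directly; a minimal sketch, assuming the RayServe service has been forwarded to localhost:8000 (matching the SERVICE_NAME and MODEL_ENDPOINT defaults above):

import requests

# Assumes the RayServe service is reachable locally, e.g. via a port-forward
# to port 8000; "/infer" and the "sentence" parameter mirror the app above.
response = requests.get(
    "http://localhost:8000/infer",
    params={"sentence": "What is AWS Inferentia2?"},
    timeout=180,
)
response.raise_for_status()
print(response.text)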
@@ -0,0 +1,41 @@
# https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
FROM rayproject/ray:2.11.0-py310

# Maintainer label
LABEL maintainer="DoEKS"

# Set the Debian frontend to non-interactive (this prevents some prompts)
ENV DEBIAN_FRONTEND=noninteractive

# Switch to root to add the Neuron repo and install necessary packages
USER root

# Set up the Neuron repository and install Neuron packages
RUN . /etc/os-release && \
    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
    apt-get update -y && \
    apt-get install git -y && \
    apt-get install aws-neuronx-dkms=2.* -y && \
    apt-get install aws-neuronx-collectives=2.* -y && \
    apt-get install aws-neuronx-runtime-lib=2.* -y && \
    apt-get install aws-neuronx-tools=2.* -y

# Switch back to a non-root user for the subsequent commands
USER $USER

# Point pip at the Neuron repository and install the required Python packages
# huggingface_hub is needed to log in to the Hugging Face repo for model access
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
    pip install starlette==0.34.0 && \
    pip install huggingface_hub

# Add Neuron path to PATH
ENV PATH /opt/aws/neuron/bin:$PATH

WORKDIR /serve_app

COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
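
The final COPY pulls in ray_serve_mistral.py, which is not included in this excerpt. For orientation only, here is a hypothetical skeleton of a RayServe entrypoint consistent with the manifest below (import path ray_serve_mistral:entrypoint, deployment mistral-7b, a GET /infer route with a sentence parameter); the model loading is stubbed out, and this is not the blueprint's actual code:

# Hypothetical sketch only -- the real ray_serve_mistral.py is not shown in
# this diff. Names below mirror the RayService manifest in this commit.
import os

from fastapi import FastAPI
from ray import serve

app = FastAPI()


@serve.deployment(name="mistral-7b")
@serve.ingress(app)
class MistralDeployment:
    def __init__(self):
        # The actual blueprint compiles/loads the model onto Neuron cores here,
        # using MODEL_ID and HUGGING_FACE_HUB_TOKEN from the runtime_env.
        self.model_id = os.environ.get("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")

    @app.get("/infer")
    def infer(self, sentence: str) -> str:
        # Placeholder: the real implementation wraps the prompt in [INST] tags
        # and runs generation on the compiled Neuron model.
        return f"[INST]{sentence}[/INST] (model output placeholder)"


entrypoint = MistralDeployment.bind()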
gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml (58 additions, 0 deletions)
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gradio
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
      - name: gradio
        # Update this image to the Gradio app image you want to deploy
        image: public.ecr.aws/data-on-eks/gradio-app:mistral-7b
        imagePullPolicy: Always
        ports:
        - containerPort: 7860
        resources:
          requests:
            cpu: "512m"
            memory: "2048Mi"
          limits:
            cpu: "1"
            memory: "4096Mi"
        env:
        - name: MODEL_ENDPOINT
          value: "/infer"
        # Please note that the service name is currently hardcoded to match the Mistral service for this blueprint.
        # If there are any updates or changes to the actual RayServe deployment, you'll need to update the service name in this code accordingly.
        - name: SERVICE_NAME
          value: "http://mistral-service.mistral.svc.cluster.local:8000"
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio
spec:
  selector:
    app: gradio
  ports:
  - name: http
    protocol: TCP
    port: 7860
    targetPort: 7860
  type: ClusterIP
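
These two env vars are the only wiring between the UI and the model service; a tiny sketch of how the app composes them into the request URL (values copied from the Deployment above):

import os

# Values from the env block above; SERVICE_NAME is the cross-namespace
# DNS name of the RayServe head service in the "mistral" namespace.
os.environ["MODEL_ENDPOINT"] = "/infer"
os.environ["SERVICE_NAME"] = "http://mistral-service.mistral.svc.cluster.local:8000"

url = os.environ["SERVICE_NAME"] + os.environ["MODEL_ENDPOINT"]
print(url)  # http://mistral-service.mistral.svc.cluster.local:8000/infer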
gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml (148 additions, 0 deletions)
@@ -0,0 +1,148 @@
#----------------------------------------------------------------------
# NOTE: For deployment instructions, refer to the DoEKS website.
#----------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: mistral

---
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: mistral-service
  namespace: mistral
spec:
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: mistral-deployment
        import_path: "ray_serve_mistral:entrypoint"
        route_prefix: "/"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            NEURON_CC_FLAGS: "-O1"
            HUGGING_FACE_HUB_TOKEN: $HUGGING_FACE_HUB_TOKEN
        deployments:
          - name: mistral-7b
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 2
              max_replicas: 12
              look_back_period_s: 2
              downscale_delay_s: 30
              upscale_delay_s: 2
              target_num_ongoing_requests_per_replica: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 10
              resources: {"neuron_cores": 2}
  rayClusterConfig:
    rayVersion: '2.11.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      serviceType: NodePort
      headService:
        metadata:
          name: mistral-service
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
          - name: ray-head
            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            ports:
            - containerPort: 6379
              name: gcs-server
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
            - containerPort: 8000
              name: serve
            volumeMounts:
            - mountPath: /tmp/ray
              name: ray-logs
            resources:
              limits:
                cpu: "2"
                memory: "20G"
              requests:
                cpu: "2"
                memory: "20G"
          nodeSelector:
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
          - name: ray-logs
            emptyDir: {}

    workerGroupSpecs:
    - groupName: inf2-worker-group
      replicas: 1
      minReplicas: 1
      maxReplicas: 5
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-worker
            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            # Each HTTP request pins 2 Neuron cores; with 12 NeuronCores (6 devices)
            # on an inf2.24xlarge, one worker handles up to 6 concurrent requests
            resources:
              limits:
                cpu: "90" # All vCPUs of inf2.24xlarge, minus 6 vCPUs for daemonset overhead
                memory: "360G" # All memory of inf2.24xlarge, minus 24G for daemonset overhead
                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 cores) of inf2.24xlarge
              requests:
                cpu: "90" # All vCPUs of inf2.24xlarge, minus 6 vCPUs for daemonset overhead
                memory: "360G" # All memory of inf2.24xlarge, minus 24G for daemonset overhead
                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 cores) of inf2.24xlarge
          nodeSelector:
            instanceType: inferentia-inf2
            provisionerType: Karpenter
          tolerations:
          - key: "aws.amazon.com/neuron"
            operator: "Exists"
            effect: "NoSchedule"
          - key: "hub.jupyter.org/dedicated"
            operator: "Equal"
            value: "user"
            effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mistral-ingress
  namespace: mistral
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
spec:
  ingressClassName: nginx
  rules:
  - http:
      paths:
      # Ray Dashboard
      - path: /dashboard/(.*)
        pathType: ImplementationSpecific
        backend:
          service:
            name: mistral-service
            port:
              number: 8265
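
A back-of-the-envelope check of the worker sizing comments above (my arithmetic, not part of the blueprint):

# Concurrency check for one inf2.24xlarge worker.
NEURON_DEVICES = 6       # aws.amazon.com/neuron resource in the manifest
CORES_PER_DEVICE = 2     # each Inferentia2 device exposes 2 NeuronCores
CORES_PER_REQUEST = 2    # ray_actor_options resources: {"neuron_cores": 2}

total_cores = NEURON_DEVICES * CORES_PER_DEVICE          # 12
concurrent_requests = total_cores // CORES_PER_REQUEST   # 6 per worker
print(total_cores, concurrent_requests)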