diff --git a/README.md b/README.md
index 3c5449bc3..e4b06c80b 100644
--- a/README.md
+++ b/README.md
@@ -74,4 +74,4 @@ This library is licensed under the Apache 2.0 License.
 ## 🙌 Community
 We welcome all individuals who are enthusiastic about data on Kubernetes to become a part of this open source community. Your contributions and participation are invaluable to the success of this project.
 
-Built with ❤️ at AWS.
\ No newline at end of file
+Built with ❤️ at AWS.
diff --git a/gen-ai/inference/gradio-ui/Dockerfile-gradio-app-mistral b/gen-ai/inference/gradio-ui/Dockerfile-gradio-app-mistral
new file mode 100644
index 000000000..9fbb22f67
--- /dev/null
+++ b/gen-ai/inference/gradio-ui/Dockerfile-gradio-app-mistral
@@ -0,0 +1,15 @@
+# Dockerfile to build a container image for the Gradio app for the Mistral-7B model
+
+# Use Python base image
+FROM --platform=linux/amd64 python:3.9-slim
+
+# Set working directory in the container
+WORKDIR /app
+
+# Copy the Python script into the container
+COPY gradio-app-mistral.py /app/gradio-app-mistral.py
+
+RUN pip install --no-cache-dir gradio requests
+
+# Command to run the Python script
+ENTRYPOINT ["python", "gradio-app-mistral.py"]
diff --git a/gen-ai/inference/gradio-ui/gradio-app-mistral.py b/gen-ai/inference/gradio-ui/gradio-app-mistral.py
new file mode 100644
index 000000000..12ede2af4
--- /dev/null
+++ b/gen-ai/inference/gradio-ui/gradio-app-mistral.py
@@ -0,0 +1,65 @@
+import gradio as gr
+import requests
+import os
+
+
+# Constants for model endpoint and service name
+model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
+service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
+
+# Function to generate text
+def generate_text(message, history):
+    prompt = message
+
+    # Create the URL for the inference
+    url = f"{service_name}{model_endpoint}"
+
+    try:
+        # Send the request to the model service
+        response = requests.get(url, params={"sentence": prompt}, timeout=180)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        prompt_to_replace = "[INST]" + prompt + "[/INST]"
+
+        # Removing the original prompt with instruction set from the output
+        text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
+        # remove '<s>' and '</s>' tokens, which render as strikethrough markdown
+        if text.startswith("<s>"):
+            text = text.replace("<s>", "", 1)
+
+        text = text.replace("</s>", "", 1)
+
+        answer_only = text
+
+        # Safety filter to remove harmful or inappropriate content
+        answer_only = filter_harmful_content(answer_only)
+        return answer_only
+    except requests.exceptions.RequestException as e:
+        # Handle any request exceptions (e.g., connection errors)
+        return f"AI: Error: {str(e)}"
+
+
+# Define the safety filter function (you can implement this as needed)
+def filter_harmful_content(text):
+    # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
+
+    # For now, simply return the text as-is
+    return text
+
+
+# Define the Gradio ChatInterface
+chat_interface = gr.ChatInterface(
+    generate_text,
+    chatbot=gr.Chatbot(height=300),
+    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
+    title="Mistral AI Chat",
+    description="Ask me any question",
+    theme="soft",
+    examples=["How Big Is Observable Universe", "How to kill a linux process"],
+    cache_examples=False,
+    retry_btn=None,
+    undo_btn="Delete Previous",
+    clear_btn="Clear",
+)
+
+# Launch the ChatInterface
+chat_interface.launch()
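+
+# Local test sketch (assumptions: the RayServe deployment from this blueprint is
+# running and its serve service has been port-forwarded to localhost:8000, which
+# matches the SERVICE_NAME default above):
+#   kubectl -n mistral port-forward svc/mistral-service-serve-svc 8000:8000
+#   python gradio-app-mistral.py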
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
new file mode 100644
index 000000000..370180c25
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
@@ -0,0 +1,41 @@
+# https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
+FROM rayproject/ray:2.11.0-py310
+
+# Maintainer label
+LABEL maintainer="DoEKS"
+
+# Set environment variables to non-interactive (this prevents some prompts)
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Switch to root to add Neuron repo and install necessary packages
+USER root
+
+# Set up the Neuron repository and install Neuron packages
+RUN . /etc/os-release && \
+    sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
+    sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
+    sudo apt-get update -y && \
+    sudo apt-get install git -y && \
+    sudo apt-get install aws-neuronx-dkms=2.* -y && \
+    sudo apt-get install aws-neuronx-collectives=2.* -y && \
+    sudo apt-get install aws-neuronx-runtime-lib=2.* -y && \
+    sudo apt-get install aws-neuronx-tools=2.* -y
+
+
+# Switch back to a non-root user for the subsequent commands
+USER $USER
+
+# Set pip repository pointing to the Neuron repository and install required Python packages
+# huggingface_hub is needed to login to huggingface repo for the model access
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
+    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
+    pip install starlette==0.34.0 && \
+    pip install huggingface_hub
+
+
+# Add Neuron path to PATH
+ENV PATH /opt/aws/neuron/bin:$PATH
+
+WORKDIR /serve_app
+
+COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml
new file mode 100644
index 000000000..5ef753e3a
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-deploy.yaml
@@ -0,0 +1,58 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gradio
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gradio-deployment
+  namespace: gradio
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - name: gradio
+        # Update this image to the Gradio app image you want to deploy
+        image: public.ecr.aws/data-on-eks/gradio-app:mistral-7b
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 7860
+        resources:
+          requests:
+            cpu: "512m"
+            memory: "2048Mi"
+          limits:
+            cpu: "1"
+            memory: "4096Mi"
+        env:
+        - name: MODEL_ENDPOINT
+          value: "/infer"
+        # Please note that the service name is currently hardcoded to match the Mistral service for this blueprint.
+        # If there are any updates or changes to the actual RayServe deployment, you'll need to update the service name here accordingly.
+        - name: SERVICE_NAME
+          value: "http://mistral-service.mistral.svc.cluster.local:8000"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio-service
+  namespace: gradio
+spec:
+  selector:
+    app: gradio
+  ports:
+  - name: http
+    protocol: TCP
+    port: 7860
+    targetPort: 7860
+  type: ClusterIP
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
new file mode 100644
index 000000000..ecb6b52a2
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -0,0 +1,148 @@
+#----------------------------------------------------------------------
+# NOTE: For deployment instructions, refer to the DoEKS website.
+#----------------------------------------------------------------------
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: mistral
+
+---
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: mistral-service
+  namespace: mistral
+spec:
+  serviceUnhealthySecondThreshold: 900
+  deploymentUnhealthySecondThreshold: 300
+  serveConfigV2: |
+    applications:
+      - name: mistral-deployment
+        import_path: "ray_serve_mistral:entrypoint"
+        route_prefix: "/"
+        runtime_env:
+          env_vars:
+            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
+            NEURON_CC_FLAGS: "-O1"
+            HUGGING_FACE_HUB_TOKEN: $HUGGING_FACE_HUB_TOKEN
+        deployments:
+          - name: mistral-7b
+            autoscaling_config:
+              metrics_interval_s: 0.2
+              min_replicas: 2
+              max_replicas: 12
+              look_back_period_s: 2
+              downscale_delay_s: 30
+              upscale_delay_s: 2
+              target_num_ongoing_requests_per_replica: 1
+            graceful_shutdown_timeout_s: 5
+            max_concurrent_queries: 100
+            ray_actor_options:
+              num_cpus: 10
+              resources: {"neuron_cores": 2}
+  rayClusterConfig:
+    rayVersion: '2.11.0'
+    enableInTreeAutoscaling: true
+    headGroupSpec:
+      serviceType: NodePort
+      headService:
+        metadata:
+          name: mistral-service
+          namespace: mistral
+      rayStartParams:
+        dashboard-host: '0.0.0.0'
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+            imagePullPolicy: Always # Ensure the image is always pulled when updated
+            lifecycle:
+              preStop:
+                exec:
+                  command: ["/bin/sh", "-c", "ray stop"]
+            ports:
+            - containerPort: 6379
+              name: gcs-server
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000
+              name: serve
+            volumeMounts:
+            - mountPath: /tmp/ray
+              name: ray-logs
+            resources:
+              limits:
+                cpu: "2"
+                memory: "20G"
+              requests:
+                cpu: "2"
+                memory: "20G"
+          nodeSelector:
+            instanceType: mixed-x86
+            provisionerType: Karpenter
+            workload: rayhead
+          volumes:
+          - name: ray-logs
+            emptyDir: {}
+
+    workerGroupSpecs:
+    - groupName: inf2-worker-group
+      replicas: 1
+      minReplicas: 1
+      maxReplicas: 5
+      rayStartParams: {}
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+            imagePullPolicy: Always # Ensure the image is always pulled when updated
+            lifecycle:
+              preStop:
+                exec:
+                  command: ["/bin/sh", "-c", "ray stop"]
+            # Each replica uses 2 Neuron cores, so this configuration serves up to 6 concurrent requests
+            resources:
+              limits:
+                cpu: "90" # 90 of the 96 vCPUs on inf2.24xlarge; 6 vCPUs reserved for daemonset overhead
+                memory: "360G" # 360G of the 384G memory on inf2.24xlarge; 24G reserved for daemonset overhead
+                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
+              requests:
+                cpu: "90" # 90 of the 96 vCPUs on inf2.24xlarge; 6 vCPUs reserved for daemonset overhead
+                memory: "360G" # 360G of the 384G memory on inf2.24xlarge; 24G reserved for daemonset overhead
+                aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
+          nodeSelector:
+            instanceType: inferentia-inf2
+            provisionerType: Karpenter
+          tolerations:
+          - key: "aws.amazon.com/neuron"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated"
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: mistral-ingress
+  namespace: mistral
+  annotations:
+    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+spec:
+  ingressClassName: nginx
+  rules:
+  - http:
+      paths:
+      # Ray Dashboard
+      - path: /dashboard/(.*)
+        pathType: ImplementationSpecific
+        backend:
+          service:
+            name: mistral-service
+            port:
+              number: 8265
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray_serve_mistral.py b/gen-ai/inference/mistral-7b-rayserve-inf2/ray_serve_mistral.py
new file mode 100644
index 000000000..daa7b10b8
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray_serve_mistral.py
@@ -0,0 +1,83 @@
+# Import necessary libraries and modules
+from io import BytesIO
+from fastapi import FastAPI
+import os
+
+from ray import serve
+
+import torch
+
+# Initialize the FastAPI app
+app = FastAPI()
+
+# Define the number of Neuron cores to be used
+neuron_cores = 2
+
+# Deployment settings for the API ingress using Ray Serve
+@serve.deployment(name="mistral-deployment", num_replicas=1, route_prefix="/")
+@serve.ingress(app)
+class APIIngress:
+    # Constructor to initialize the API with a model handle
+    def __init__(self, mistral_model_handle) -> None:
+        self.handle = mistral_model_handle
+
+    # Define a GET endpoint for inference
+    @app.get("/infer")
+    async def infer(self, sentence: str):
+        # Asynchronously perform inference using the provided sentence and return the result
+        result = await self.handle.infer.remote(sentence)
+        return result
+
+# Deployment settings for the Mistral model using Ray Serve
+@serve.deployment(name="mistral-7b",
+    autoscaling_config={"min_replicas": 0, "max_replicas": 6},
+    ray_actor_options={
+        "resources": {"neuron_cores": neuron_cores},
+        "runtime_env": {"env_vars": {"NEURON_CC_FLAGS": "-O1"}},
+    },
+)
+class MistralModel:
+    # Constructor to initialize and load the model
+    def __init__(self):
+
+        # Import additional necessary modules
+        from transformers import AutoTokenizer
+        from transformers_neuronx import MistralForSampling, GQA, NeuronConfig
+        from huggingface_hub import login
+
+        # Retrieve environment variables for API authentication and model ID
+        hf_token = os.getenv('HUGGING_FACE_HUB_TOKEN')
+        model_id = os.getenv('MODEL_ID')
+
+        # Log in to the Hugging Face Hub
+        login(token=hf_token)
+
+        # Set the sharding strategy for the model to optimize performance
+        neuron_config = NeuronConfig(
+            group_query_attention=GQA.SHARD_OVER_HEADS
+        )
+
+        # Load and compile the Neuron model with specific configuration
+        self.neuron_model = MistralForSampling.from_pretrained(model_id, amp='bf16', neuron_config=neuron_config)
+        self.neuron_model.to_neuron()
+
+        # Initialize tokenizer for the model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    # Define the inference method to process input text
+    def infer(self, sentence: str):
+        # Prepare input text with specific format
+        text = "[INST]" + sentence + "[/INST]"
+
+        # Tokenize and encode the input text
+        encoded_input = self.tokenizer.encode(text, return_tensors='pt')
+
+        # Perform inference in a context that disables gradient calculation
+        with torch.inference_mode():
+            generated_sequence = self.neuron_model.sample(encoded_input, sequence_length=512, start_ids=None)
+
+        # Decode the generated sequences into human-readable text and return
+        return [self.tokenizer.decode(seq) for seq in generated_sequence]
+
+# Bind the model to the API ingress to enable endpoint functionality
+entrypoint = APIIngress.bind(MistralModel.bind())
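+
+# Note: the RayService manifest (ray-service-mistral.yaml) loads this module via
+# `import_path: "ray_serve_mistral:entrypoint"`, so the file name and the
+# `entrypoint` variable above must stay in sync with that configuration.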
diff --git a/website/docs/gen-ai/inference/Mistral-7b-inf2.md b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
new file mode 100644
index 000000000..2aaebe2c7
--- /dev/null
+++ b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -0,0 +1,267 @@
+---
+title: Mistral-7B on Inferentia2
+sidebar_position: 2
+---
+import CollapsibleContent from '../../../src/components/CollapsibleContent';
+
+:::danger
+
+Note: Mistral-7B-Instruct-v0.2 is a gated model in the [Hugging Face](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) repository. To use this model, you need a Hugging Face token.
+To generate a token, log in to your Hugging Face account and click the `Access Tokens` menu item on the [Settings](https://huggingface.co/settings/tokens) page.
+
+:::
+
+# Deploying Mistral-7B-Instruct-v0.2 on Inferentia2 with Ray Serve and Gradio
+This pattern outlines the deployment of the [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) model on Amazon EKS, utilizing [AWS Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/) for enhanced text generation performance. [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) ensures efficient scaling of Ray worker nodes, while [Karpenter](https://karpenter.sh/) dynamically manages the provisioning of AWS Inferentia2 nodes. This setup optimizes for high-performance and cost-effective text generation applications in a scalable cloud environment.
+
+Through this pattern, you will accomplish the following:
+
+- Create an [Amazon EKS](https://aws.amazon.com/eks/) cluster with a Karpenter-managed AWS Inferentia2 node pool for dynamic provisioning of nodes.
+- Install the [KubeRay Operator](https://github.com/ray-project/kuberay) and other core EKS add-ons using the [trainium-inferentia](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/trainium-inferentia) Terraform blueprint.
+- Deploy the `Mistral-7B-Instruct-v0.2` model with RayServe for efficient scaling.
+
+### What is the Mistral-7B-Instruct-v0.2 Model?
+
+`mistralai/Mistral-7B-Instruct-v0.2` is an instruction-tuned version of the `Mistral-7B-v0.2` base model, fine-tuned using publicly available conversation datasets. It is designed to follow instructions and complete tasks, making it suitable for applications such as chatbots, virtual assistants, and task-oriented dialogue systems. The underlying `Mistral-7B-v0.2` base model has 7.3 billion parameters and employs a state-of-the-art architecture, including Grouped-Query Attention (GQA) for faster inference and a byte-fallback BPE tokenizer for improved robustness.
+
+Please refer to the [Model Card](https://replicate.com/mistralai/mistral-7b-instruct-v0.2/readme) for more details.
+
+## Deploying the Solution
+Let's get the `Mistral-7B-Instruct-v0.2` model up and running on Amazon EKS! In this section, we'll cover:
+
+- **Prerequisites**: Ensuring all necessary tools are installed before you begin.
+- **Infrastructure Setup**: Creating your EKS cluster and setting the stage for deployment.
+- **Deploying the Ray Cluster**: The core of your text generation pipeline, providing scalability and efficiency.
+- **Building the Gradio Web UI**: Creating a user-friendly interface for seamless interaction with the Mistral 7B model.
+
+<CollapsibleContent header={<h2><span>Prerequisites</span></h2>}>
+Before we begin, ensure you have all the prerequisites in place to make the deployment process smooth and hassle-free.
+Ensure that you have installed the following tools on your machine:
+
+1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
+2. [kubectl](https://Kubernetes.io/docs/tasks/tools/)
+3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)
+4. [envsubst](https://pypi.org/project/envsubst/)
+
+### Deploy
+
+Clone the repository
+
+```bash
+git clone https://github.com/awslabs/data-on-eks.git
+```
+
+Navigate into one of the example directories and run the `install.sh` script.
+
+**Important Note:** Ensure that you update the region in the `variables.tf` file before deploying the blueprint.
+Additionally, confirm that your local region setting matches the specified region to prevent any discrepancies.
+For example, set `export AWS_DEFAULT_REGION="<REGION>"` to the desired region:
+
+```bash
+cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh
+./install.sh
+```
+
+### Verify the resources
+
+Verify the Amazon EKS Cluster
+
+```bash
+aws eks --region us-west-2 describe-cluster --name trainium-inferentia
+```
+
+```bash
+# Creates k8s config file to authenticate with EKS
+aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia
+
+# Output shows the EKS Managed Node group nodes
+kubectl get nodes
+```
+
+</CollapsibleContent>
+
+## Deploying the Ray Cluster with Mistral 7B Model
+
+Once the `trainium-inferentia` EKS cluster is deployed, you can use `kubectl` to deploy `ray-service-mistral.yaml` from the `/data-on-eks/gen-ai/inference/mistral-7b-rayserve-inf2/` path.
+
+In this step, we will deploy the Ray Serve cluster, which comprises one `Head Pod` on an `x86 CPU` instance and `Ray workers` on `inf2.24xlarge` instances, both provisioned dynamically by [Karpenter](https://karpenter.sh/).
+
+Let's take a closer look at the key files used in this deployment and understand their functionality before proceeding:
+- **ray_serve_mistral.py:**
+  This script sets up a FastAPI application with two main components deployed using Ray Serve, which enables scalable model serving on AWS Neuron infrastructure (Inf2):
+  - **mistral-7b Deployment**: This class loads the Mistral 7B model and compiles it for the Neuron cores of an Inf2 node. The script leverages Transformers Neuron support for grouped-query attention (GQA) models. `Mistral-7B-Instruct-v0.2` is a chat-based model, so the script also wraps each prompt in the required `[INST]` and `[/INST]` instruction tokens (see the short example after this list).
+  - **APIIngress**: This FastAPI endpoint acts as an interface to the Mistral 7B model. It exposes a GET method on the `/infer` path that takes a text prompt and responds with the generated text.
+
+- **ray-service-mistral.yaml:**
+  This RayServe deployment pattern sets up a scalable service for hosting the Mistral-7B-Instruct-v0.2 model on Amazon EKS with AWS Inferentia2 support. It creates a dedicated namespace and configures a RayService with autoscaling capabilities to efficiently manage resource utilization based on incoming traffic. The deployment ensures that the model, served under the RayService umbrella, can automatically adjust replicas depending on demand, with each replica requiring 2 Neuron cores. This pattern makes use of custom container images designed to maximize performance and minimize startup delays by ensuring that heavy dependencies are preloaded.
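+
+For reference, the serve script wraps every prompt in Mistral's instruction tokens before tokenization, so a question such as "How big is the observable universe?" is sent to the model as `[INST]How big is the observable universe?[/INST]`, and the Gradio app strips the same wrapper from the response before displaying it.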
+### Deploy the Mistral-7B-Instruct-v0.2 Model
+
+Ensure the cluster is configured locally:
+
+```bash
+aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia
+```
+
+**Deploy the RayServe Cluster**
+
+:::info
+
+To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your Hugging Face Hub token as an environment variable. This token is required for authentication and for accessing the model. For guidance on how to create and manage your Hugging Face tokens, please visit [Hugging Face Token Management](https://huggingface.co/docs/hub/security-tokens).
+
+:::
+
+
+```bash
+# Set the Hugging Face Hub token as an environment variable. This variable will be substituted when applying the ray-service-mistral.yaml file
+
+export HUGGING_FACE_HUB_TOKEN=<your-huggingface-hub-token>
+
+cd data-on-eks/gen-ai/
+envsubst < inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml | kubectl apply -f -
+```
+
+Verify the deployment by running the following commands.
+
+:::info
+
+The deployment process may take 10 to 12 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and model deployment from Hugging Face.
+
+:::
+
+This deployment establishes a Ray head pod running on an `x86` instance and a worker pod on an `inf2.24xlarge` instance, as shown below.
+
+```bash
+kubectl get pods -n mistral
+
+NAME                                                      READY   STATUS
+service-raycluster-68tvp-worker-inf2-worker-group-2kckv   1/1     Running
+mistral-service-raycluster-68tvp-head-dmfz5               2/2     Running
+```
+
+This deployment also sets up a `mistral-service` with multiple ports configured; port `8265` is designated for the Ray dashboard and port `8000` for the Mistral model endpoint.
+
+```bash
+kubectl get svc -n mistral
+
+NAME                        TYPE       CLUSTER-IP       EXTERNAL-IP   PORT(S)
+mistral-service             NodePort   172.20.118.238   <none>        10001:30998/TCP,8000:32437/TCP,52365:31487/TCP,8080:30351/TCP,6379:30392/TCP,8265:30904/TCP
+mistral-service-head-svc    NodePort   172.20.245.131   <none>        6379:31478/TCP,8265:31393/TCP,10001:32627/TCP,8000:31251/TCP,52365:31492/TCP,8080:31471/TCP
+mistral-service-serve-svc   NodePort   172.20.109.223   <none>        8000:31679/TCP
+```
+
+For the Ray dashboard, you can port-forward the dashboard port to access the web UI locally via localhost.
+
+```bash
+kubectl -n mistral port-forward svc/mistral-service 8265:8265
+```
+
+Access the web UI via `http://localhost:8265`. This interface displays the deployment of jobs and actors within the Ray ecosystem.
+
+![RayServe Deployment In Progress](img/ray-dashboard-deploying-mistral-inf2.png)
+
+Once the deployment is complete, the Controller and Proxy status should be `HEALTHY` and the Application status should be `RUNNING`.
+
+![RayServe Deployment Completed](img/ray-dashboard-deployed-mistral-inf2.png)
+
+You can monitor the Serve deployment and the Ray Cluster, including resource utilization, using the Ray Dashboard.
+
+![RayServe Cluster](img/ray-serve-inf2-mistral-cluster.png)
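+
+You can also query the model endpoint directly, without the Gradio UI. As a quick check (a sketch based on the service names and the `/infer` GET endpoint shown above), port-forward the serve service and pass a prompt as the `sentence` query parameter:
+
+```bash
+kubectl -n mistral port-forward svc/mistral-service-serve-svc 8000:8000
+
+# In a second terminal
+curl -G "http://localhost:8000/infer" --data-urlencode "sentence=What is Amazon EKS?"
+```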
+
+## Deploying the Gradio WebUI App
+Discover how to create a user-friendly chat interface using [Gradio](https://www.gradio.app/) that integrates seamlessly with the deployed model.
+
+Let's move forward with setting up the Gradio app as a Kubernetes deployment, utilizing a Docker container. This setup enables interaction with the Mistral model, which is deployed using RayServe.
+
+:::info
+
+The Gradio UI application is containerized and the container image is stored in the [data-on-eks](https://gallery.ecr.aws/data-on-eks/gradio-app) public repository. The Gradio app container internally points to the `mistral-service` that is running on port 8000.
+
+The Dockerfile for the above image is available at the `data-on-eks/gen-ai/inference/gradio-ui/Dockerfile-gradio-app-mistral` path.
+
+This is an optional step for this deployment. You can also customize the Gradio UI app according to your design requirements.
+To build a custom Gradio app Docker image, run the commands below. Make sure to change the image `<tag>` and the custom `Dockerfile` name accordingly.
+
+```bash
+cd data-on-eks/gen-ai/inference
+docker buildx build --platform=linux/amd64 -t gradio-app:<tag> -f gradio-ui/<Dockerfile-name> gradio-ui/
+```
+
+:::
+
+### Deploy the Gradio Pod as a Deployment
+
+First, deploy the Gradio app as a Deployment on EKS using kubectl:
+
+```bash
+cd data-on-eks/gen-ai/inference/
+kubectl apply -f mistral-7b-rayserve-inf2/gradio-deploy.yaml
+
+namespace/gradio created
+deployment.apps/gradio-deployment created
+service/gradio-service created
+```
+
+This should create a Deployment and a Service in the `gradio` namespace. Check the status of the resources:
+
+```bash
+kubectl get all -n gradio
+
+NAME                                     READY   STATUS    RESTARTS   AGE
+pod/gradio-deployment-846cb4dbf6-plmgc   1/1     Running   0          61s
+
+NAME                     TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)    AGE
+service/gradio-service   ClusterIP   172.20.179.184   <none>        7860/TCP   60s
+
+NAME                                READY   UP-TO-DATE   AVAILABLE   AGE
+deployment.apps/gradio-deployment   1/1     1            1           62s
+
+NAME                                           DESIRED   CURRENT   READY   AGE
+replicaset.apps/gradio-deployment-846cb4dbf6   1         1         1       62s
+```
+
+#### Invoke the WebUI
+
+Execute a port forward to the `gradio-service` Service using kubectl:
+
+```bash
+kubectl -n gradio port-forward service/gradio-service 8080:7860
+```
+
+Open your web browser and access the Gradio WebUI by navigating to the following local URL:
+
+`http://localhost:8080`
+
+You should now be able to interact with the Gradio application from your local machine.
+
+![Gradio WebUI](img/mistral-gradio.png)
+
+#### Interaction With the Mistral Model
+
+The `Mistral-7B-Instruct-v0.2` model can be used for purposes such as chat applications (Q&A, conversation), text generation, and knowledge retrieval.
+
+The screenshots below show examples of the model's responses to different text prompts.
+
+![Gradio QA](img/mistral-sample-prompt-1.png)
+
+![Gradio Convo 1](img/mistral-conv-1.png)
+
+![Gradio Convo 2](img/mistral-conv-2.png)
+
+## Cleanup
+Finally, we'll provide instructions for cleaning up and deprovisioning the resources when they are no longer needed.
+
+**Step 1:** Delete the Ray Cluster
+
+```bash
+cd data-on-eks/gen-ai/inference/mistral-7b-rayserve-inf2
+kubectl delete -f ray-service-mistral.yaml
+```
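+
+(Optional) If you deployed the Gradio app, remove it and its `gradio` namespace as well:
+
+```bash
+cd data-on-eks/gen-ai/inference/
+kubectl delete -f mistral-7b-rayserve-inf2/gradio-deploy.yaml
+```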
+
+**Step 2:** Cleanup the EKS Cluster
+This script cleans up the environment using the `-target` option to ensure all resources are deleted in the correct order.
+
+```bash
+export AWS_DEFAULT_REGION="<DEPLOYED_EKS_CLUSTER_REGION>"
+cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x cleanup.sh
+./cleanup.sh
+```
diff --git a/website/docs/gen-ai/inference/img/mistral-conv-1.png b/website/docs/gen-ai/inference/img/mistral-conv-1.png
new file mode 100644
index 000000000..47daf4c0f
Binary files /dev/null and b/website/docs/gen-ai/inference/img/mistral-conv-1.png differ
diff --git a/website/docs/gen-ai/inference/img/mistral-conv-2.png b/website/docs/gen-ai/inference/img/mistral-conv-2.png
new file mode 100644
index 000000000..d76b0f77d
Binary files /dev/null and b/website/docs/gen-ai/inference/img/mistral-conv-2.png differ
diff --git a/website/docs/gen-ai/inference/img/mistral-gradio.png b/website/docs/gen-ai/inference/img/mistral-gradio.png
new file mode 100644
index 000000000..d0a3a9cab
Binary files /dev/null and b/website/docs/gen-ai/inference/img/mistral-gradio.png differ
diff --git a/website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png b/website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png
new file mode 100644
index 000000000..20371a08c
Binary files /dev/null and b/website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png differ
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png b/website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png
new file mode 100644
index 000000000..12e520fa1
Binary files /dev/null and b/website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png differ
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png b/website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png
new file mode 100644
index 000000000..1bee1d9a9
Binary files /dev/null and b/website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png differ
diff --git a/website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png b/website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png
new file mode 100644
index 000000000..9d0efe5b1
Binary files /dev/null and b/website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png differ