feat: Gen-ai folder updates and JARK install doc for SD #511

Merged 2 commits on Apr 26, 2024

1 change: 1 addition & 0 deletions ai-ml/jark-stack/terraform/README.md
@@ -52,6 +52,7 @@ Docs coming soon...
|------|-------------|------|---------|:--------:|
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.29"` | no |
| <a name="input_enable_aws_efa_k8s_device_plugin"></a> [enable\_aws\_efa\_k8s\_device\_plugin](#input\_enable\_aws\_efa\_k8s\_device\_plugin) | Enable AWS EFA K8s Device Plugin | `bool` | `false` | no |
| <a name="input_enable_kubecost"></a> [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost addon | `bool` | `false` | no |
| <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"jark-stack"` | no |
| <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |
11 changes: 11 additions & 0 deletions ai-ml/jark-stack/terraform/addons.tf
@@ -209,6 +209,17 @@ module "data_addons" {
values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")]
}

#---------------------------------------------------------------
# Kubecost Add-on
#---------------------------------------------------------------
enable_kubecost = var.enable_kubecost
kubecost_helm_config = {
values = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})]
version = "2.2.2"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}

#---------------------------------------------------------------
# Karpenter Resources Add-on
#---------------------------------------------------------------
2 changes: 0 additions & 2 deletions ai-ml/jark-stack/terraform/examples/training/README.md

This file was deleted.

69 changes: 69 additions & 0 deletions ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml
@@ -0,0 +1,69 @@

# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090

global:
# pricingCsv:
# enabled: false
# location:
# provider: "AWS"
# region: "us-east-1"
# URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
# csvAccessCredentials: pricing-schema-access-secret

prometheus:
enabled: true # Kubecost depends on Prometheus data, it is not optional. When enabled: false, Prometheus will not be installed and you must configure your own Prometheus to scrape kubecost as well as provide the fqdn below. -- Warning: Before changing this setting, please read to understand the risks https://docs.kubecost.com/install-and-configure/install/custom-prom
fqdn: http://cost-analyzer-prometheus-server.default.svc # example address of a prometheus to connect to. Include protocol (http:// or https://) Ignored if enabled: true

grafana:
enabled: true # If false, Grafana will not be installed
domainName: cost-analyzer-grafana.default.svc # example grafana domain Ignored if enabled: true
scheme: "http" # http or https, for the domain name above.
proxy: true # If true, the kubecost frontend will route to your grafana through its service endpoint

kubecostFrontend:
image: public.ecr.aws/kubecost/frontend
resources:
requests:
cpu: "200m"
memory: "512Mi"

kubecostMetrics:
emitPodAnnotations: true
emitNamespaceAnnotations: true

kubecostModel:
image: public.ecr.aws/kubecost/cost-model
resources:
requests:
cpu: "500m"
memory: "512Mi"

forecasting:
fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.6

networkCosts:
image:
repository: public.ecr.aws/kubecost/kubecost-network-costs

clusterController:
image:
repository: public.ecr.aws/kubecost/cluster-controller

prometheus:
server:
image:
repository: public.ecr.aws/kubecost/prometheus

configmapReload:
prometheus:
image:
repository: public.ecr.aws/kubecost/prometheus-config-reloader

reporting:
productAnalytics: false

# Define persistence volume for cost-analyzer
persistentVolume:
size: 32Gi
dbSize: 32.0Gi
enabled: true # Note that setting this to false means configurations will be wiped out on pod restart.
6 changes: 6 additions & 0 deletions ai-ml/jark-stack/terraform/variables.tf
@@ -44,3 +44,9 @@ variable "enable_aws_efa_k8s_device_plugin" {
type = bool
default = false
}

variable "enable_kubecost" {
description = "Enable Kubecost addon"
type = bool
default = false
}
@@ -14,7 +14,7 @@ ENV DEBIAN_FRONTEND=non-interactive
USER $USER

# Install Ray Serve and other Python packages with specific versions
RUN pip install --no-cache-dir requests torch "diffusers==0.12.1" "transformers=4.25.1"
RUN pip install --no-cache-dir requests torch "diffusers==0.12.1" "transformers==4.25.1"

# Set a working directory
WORKDIR /serve_app
@@ -45,7 +45,7 @@ spec:
num_cpus: 3
num_gpus: 1
rayClusterConfig:
rayVersion: '2.10.0'
rayVersion: '2.11.0'
enableInTreeAutoscaling: true
headGroupSpec:
serviceType: NodePort
@@ -64,7 +64,7 @@ spec:
# For faster inference scaling, consider building a custom image with only your workload's essential dependencies.
# Smaller images lead to faster scaling, especially across multiple nodes.
# Notice that we are using the same image for both the head and worker nodes. You might hit ModuleNotFoundError if you use a different image for head and worker nodes.
image: public.ecr.aws/data-on-eks/ray2.10.0-py310-gpu-stablediffusion:latest
image: public.ecr.aws/data-on-eks/ray2.11.0-py310-gpu-stablediffusion:latest
imagePullPolicy: IfNotPresent # Pull the image only if it is not already cached on the node
lifecycle:
preStop:
@@ -110,7 +110,7 @@ spec:
# For faster inference scaling, consider building a custom image with only your workload's essential dependencies.
# Smaller images lead to faster scaling, especially across multiple nodes.
# Notice that we are using the same image for both the head and worker nodes. You might hit ModuleNotFoundError if you use a different image for head and worker nodes.
image: public.ecr.aws/data-on-eks/ray2.10.0-py310-gpu-stablediffusion:latest
image: public.ecr.aws/data-on-eks/ray2.11.0-py310-gpu-stablediffusion:latest
imagePullPolicy: IfNotPresent # Pull the image only if it is not already cached on the node
lifecycle:
preStop:
3 binary files added (images; not displayed)
241 changes: 241 additions & 0 deletions website/docs/gen-ai/inference/stablediffusion-gpus.md
@@ -0,0 +1,241 @@
---
title: Stable Diffusion on GPU
sidebar_position: 2
---
import CollapsibleContent from '../../../src/components/CollapsibleContent';

:::info

We are actively enhancing this blueprint to incorporate improvements in observability and logging.

:::

# Deploying Stable Diffusion v2 with GPUs, Ray Serve and Gradio
This pattern demonstrates how to deploy the [Stable Diffusion V2](https://huggingface.co/stabilityai/stable-diffusion-2-1) model on Amazon EKS, using [GPUs](https://aws.amazon.com/ec2/instance-types/g4/) for accelerated image generation. [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) provides efficient scaling across multiple GPU nodes, while [Karpenter](https://karpenter.sh/) dynamically manages node provisioning.

Through this pattern, you will accomplish the following:

- Create an Amazon EKS cluster with a Karpenter-managed GPU NodePool for dynamic scaling of nodes.
- Install KubeRay Operator and other core EKS add-ons using the [jark-stack](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/jark-stack/terraform) Terraform blueprint.
- Deploy the Stable Diffusion model using RayServe for efficient scaling across your GPU resources.

### What is Stable Diffusion?
Stable Diffusion is a cutting-edge text-to-image model that generates stunning, detailed images from text descriptions. It's a powerful tool for artists, designers, and anyone who wants to unleash their imagination through image generation. This model stands out by offering a high degree of creative control and flexibility in the image generation process.

## Deploying the Solution
Let's get Stable Diffusion v2-1 up and running on Amazon EKS! In this section, we'll cover:

- **Prerequisites**: Ensuring you have everything in place.
- **Infrastructure Setup**: Creating your EKS cluster and preparing it for deployment.
- **Deploying the Ray Cluster**: The core of your image generation pipeline, providing scalability and efficiency.
- **Building the Gradio Web UI**: A user-friendly interface for interacting with Stable Diffusion.

<CollapsibleContent header={<h2><span>Prerequisites</span></h2>}>
Before we begin, make sure the following tools are installed on your machine so the deployment process is smooth and hassle-free.

1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
2. [kubectl](https://Kubernetes.io/docs/tasks/tools/)
3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)

### Deploy

Clone the repository

```bash
git clone https://github.com/awslabs/data-on-eks.git
```

Navigate into the blueprint directory and run the `install.sh` script.

**Important Note:** Ensure that you update the region in the `variables.tf` file before deploying the blueprint.
Additionally, confirm that your local region setting matches the specified region to prevent any discrepancies.
For example, set `export AWS_DEFAULT_REGION="<REGION>"` to the desired region:

```bash
cd data-on-eks/ai-ml/jark-stack/ && chmod +x install.sh
./install.sh
```

### Verify the resources

Verify the Amazon EKS Cluster

```bash
aws eks --region us-west-2 describe-cluster --name jark-stack
```

```bash
# Creates k8s config file to authenticate with EKS
aws eks --region us-west-2 update-kubeconfig --name jark-stack

# Output shows the EKS Managed Node group nodes
kubectl get nodes
```

</CollapsibleContent>

## Deploying the Ray Cluster with Stable Diffusion Model

Once the `jark-stack` cluster is deployed, you can use `kubectl` to deploy `ray-service-stablediffusion.yaml` from the `/data-on-eks/gen-ai/inference/stable-diffusion-rayserve-gpu/` path.

In this step, we will deploy the Ray Serve cluster, which comprises one `Head Pod` on an `x86 CPU` instance and `Ray workers` on `g5.2xlarge` GPU instances, both provisioned and autoscaled by [Karpenter](https://karpenter.sh/).

Let's take a closer look at the key files used in this deployment and understand their functionalities before proceeding with the deployment:
- **ray_serve_sd.py:**
This script sets up a FastAPI application with two main components deployed using Ray Serve, which enables scalable model serving on GPU-equipped infrastructure (a simplified sketch follows this list):
- **StableDiffusionV2 Deployment**: This class initializes the Stable Diffusion V2 model using a scheduler and moves it to a GPU for processing. It includes functionality to generate images based on textual prompts, with the image size customizable via the input parameter.
- **APIIngress**: This FastAPI endpoint acts as an interface to the Stable Diffusion model. It exposes a GET method on the `/imagine` path that takes a text prompt and an optional image size. It generates an image using the Stable Diffusion model and returns it as a PNG file.

- **ray-service-stablediffusion.yaml:**
This RayServe deployment pattern sets up a scalable service for hosting the Stable Diffusion model on Amazon EKS with GPU support. It creates a dedicated namespace and configures a RayService with autoscaling capabilities to efficiently manage resource utilization based on incoming traffic. The deployment ensures that the model, served under the RayService umbrella, can automatically adjust between 1 and 4 replicas, depending on demand, with each replica requiring a GPU. This pattern makes use of custom container images designed to maximize performance and minimize startup delays by ensuring that heavy dependencies are preloaded.
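
For orientation, a simplified, illustrative sketch of what such a serving script can look like is shown below. It condenses the pattern described above; the actual `ray_serve_sd.py` in the repository may differ in details such as parameter names, scheduler choice, and model options.

```python
# Illustrative sketch of a Ray Serve + FastAPI Stable Diffusion service (not the repo's exact code).
from io import BytesIO

import torch
from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
from fastapi import FastAPI
from fastapi.responses import Response
from ray import serve

app = FastAPI()


@serve.deployment(num_replicas=1)
@serve.ingress(app)
class APIIngress:
    def __init__(self, diffusion_model_handle) -> None:
        self.handle = diffusion_model_handle

    @app.get("/imagine", responses={200: {"content": {"image/png": {}}}}, response_class=Response)
    async def generate(self, prompt: str, img_size: int = 512):
        # Forward the prompt to the GPU-backed deployment and return the image as PNG bytes.
        image = await self.handle.generate.remote(prompt, img_size=img_size)
        buffer = BytesIO()
        image.save(buffer, "PNG")
        return Response(content=buffer.getvalue(), media_type="image/png")


@serve.deployment(ray_actor_options={"num_gpus": 1})
class StableDiffusionV2:
    def __init__(self):
        model_id = "stabilityai/stable-diffusion-2-1"
        scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(
            model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16
        ).to("cuda")

    def generate(self, prompt: str, img_size: int = 512):
        # Run the diffusion pipeline on the GPU and return the first generated image.
        with torch.autocast("cuda"):
            return self.pipe(prompt, height=img_size, width=img_size).images[0]


# Compose the ingress with a handle to the model deployment; Ray Serve deploys this graph.
entrypoint = APIIngress.bind(StableDiffusionV2.bind())
```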

### Deploy the Stable Diffusion V2 Model

Ensure the cluster is configured locally

```bash
aws eks --region us-west-2 update-kubeconfig --name jark-stack
```

**Deploy RayServe Cluster**

```bash
cd data-on-eks/gen-ai/inference/stable-diffusion-rayserve-gpu
kubectl apply -f ray-service-stablediffusion.yaml
```

Verify the deployment by running the following commands

:::info

The deployment process may take up to 10 to 12 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and model deployment from Hugging Face.

:::

This deployment establishes a Ray head pod running on an x86 instance and a worker pod on a G5 GPU instance, as shown below.

```bash
kubectl get pods -n stablediffusion

NAME READY STATUS
rservice-raycluster-hb4l4-worker-gpu-worker-group-z8gdw 1/1 Running
stablediffusion-service-raycluster-hb4l4-head-4kfzz 2/2 Running
```

This deployment also sets up services in the `stablediffusion` namespace with multiple ports configured; port `8265` is designated for the Ray dashboard and port `8000` for the Stable Diffusion model endpoint, which you can test directly as shown after the service listing below.

```bash
kubectl get svc -n stablediffusion
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S)
stablediffusion-service NodePort 172.20.223.142 <none> 8080:30213/TCP,6379:30386/TCP,8265:30857/TCP,10001:30666/TCP,8000:31194/TCP
stablediffusion-service-head-svc NodePort 172.20.215.100 <none> 8265:30170/TCP,10001:31246/TCP,8000:30376/TCP,8080:32646/TCP,6379:31846/TCP
stablediffusion-service-serve-svc NodePort 172.20.153.125 <none> 8000:31459/TCP
```
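
With the services in place, you can exercise the model endpoint directly. Below is a minimal sketch that assumes a local port-forward of the serve service to port 8000 (for example, `kubectl -n stablediffusion port-forward svc/stablediffusion-service-serve-svc 8000:8000`) and uses the `/imagine` query parameters from the illustrative sketch earlier; the exact parameter names may differ in the actual script.

```python
# Minimal sketch: call the Stable Diffusion /imagine endpoint over a local port-forward.
import requests

prompt = "a photograph of an astronaut riding a horse"
resp = requests.get(
    "http://localhost:8000/imagine",
    params={"prompt": prompt, "img_size": 512},  # parameter names assumed from the sketch above
    timeout=300,
)
resp.raise_for_status()

# The endpoint returns the generated image as a PNG payload.
with open("astronaut.png", "wb") as f:
    f.write(resp.content)
print("Saved astronaut.png")
```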

For the Ray dashboard, you can port-forward its port to access the web UI locally on localhost.

```bash
kubectl port-forward svc/stablediffusion-service 8266:8265 -n stablediffusion
```

Access the web UI via `http://localhost:8266`. This interface displays the deployment of jobs and actors within the Ray ecosystem.

![RayServe Deployment](img/ray-serve-gpu-sd.png)

The screenshots below show the Serve deployment and the Ray Cluster deployment, offering a visual overview of the setup and operational status.

![RayServe Cluster](img/ray-serve-gpu-sd-cluster.png)

## Deploying the Gradio WebUI App
Discover how to create a user-friendly web interface using [Gradio](https://www.gradio.app/) that integrates seamlessly with the deployed model.

Let's move forward with setting up the Gradio app as a Kubernetes deployment, utilizing a Docker container. This setup will enable interaction with the Stable Diffusion model, which is deployed using RayServe.

:::info

The Gradio UI application is containerized and the container image is stored in the [data-on-eks](https://gallery.ecr.aws/data-on-eks/gradio-app) public ECR repository. The Gradio app container internally points to the `stablediffusion-service` that's running on port 8000.

:::
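
For reference, a minimal Gradio app along these lines could look like the sketch below. This is illustrative only; the packaged container image in the repository may differ, and the in-cluster service URL and `/imagine` parameter name are assumptions based on the services and endpoint described earlier.

```python
# Illustrative Gradio front end for the Stable Diffusion RayServe endpoint.
from io import BytesIO

import gradio as gr
import requests
from PIL import Image

# Assumed in-cluster endpoint of the Stable Diffusion service (port 8000);
# use a local port-forward URL instead when running outside the cluster.
SERVICE_URL = "http://stablediffusion-service.stablediffusion.svc.cluster.local:8000"


def generate_image(prompt: str) -> Image.Image:
    # Call the /imagine endpoint and decode the PNG response into a PIL image.
    resp = requests.get(f"{SERVICE_URL}/imagine", params={"prompt": prompt}, timeout=300)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content))


demo = gr.Interface(
    fn=generate_image,
    inputs=gr.Textbox(label="Prompt"),
    outputs=gr.Image(label="Generated image"),
    title="Stable Diffusion v2 on EKS",
)

if __name__ == "__main__":
    # Listen on all interfaces so the Kubernetes Service can route traffic to the pod.
    demo.launch(server_name="0.0.0.0", server_port=7860)
```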

### Deploy the Gradio Pod as Deployment

First, deploy the Gradio app as a Deployment on EKS using kubectl:

```bash
cd data-on-eks/gen-ai/inference/gradio-ui
kubectl apply -f gradio-deploy.yaml

namespace/gradio created
deployment.apps/gradio-deployment created
service/gradio-service created
```

This should create a Deployment and a Service in namespace `gradio`. Check the status of the resources.

```bash
kubectl -n gradio get all
NAME READY STATUS RESTARTS AGE
pod/gradio-deployment-668cf5dc7c-h22bl 1/1 Running 0 52s

NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/gradio-service ClusterIP 172.20.130.85 <none> 7860/TCP 52s

NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/gradio-deployment 1/1 1 1 52s

NAME DESIRED CURRENT READY AGE
replicaset.apps/gradio-deployment-668cf5dc7c 1 1 1 52s
```

#### Invoke the WebUI

Execute a port forward to the `gradio-service` Service using kubectl:

```bash
kubectl -n gradio port-forward service/gradio-service 8080:7860
```

Open your web browser and access the Gradio WebUI at `http://localhost:8080`.

You should now be able to interact with the Gradio application from your local machine.

![Gradio WebUI](img/gradio-app-gpu.png)

### Ray Autoscaling
The Ray autoscaling configuration detailed in the `ray-service-stablediffusion.yaml` file leverages the capabilities of Ray on Kubernetes to dynamically scale applications based on computational needs. The flow works as follows (a configuration sketch follows the list):

1. **Incoming Traffic**: Incoming requests to your stable-diffusion deployment trigger Ray Serve to monitor the load on existing replicas.
2. **Metrics-Based Scaling**: Ray Serve tracks the average number of ongoing requests per replica. This configuration has `target_num_ongoing_requests_per_replica` set to 1. If this metric exceeds the threshold, it signals the need for more replicas.
3. **Replica Creation (Within Node)**: If a node has sufficient GPU capacity, Ray Serve will attempt to add a new replica within the existing node. Your deployment requests 1 GPU per replica (`ray_actor_options: num_gpus: 1`).
4. **Node Scaling (Karpenter)**: If a node cannot accommodate an additional replica (e.g., only one GPU per node), Ray will signal to Kubernetes that it needs more resources. Karpenter observes pending pod requests from Kubernetes and provisions a new g5 GPU node to fulfill the resource need.
5. **Replica Creation (Across Nodes)**: Once the new node is ready, Ray Serve schedules an additional replica on the newly provisioned node.
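
Expressed on the Ray Serve deployment itself, the knobs referenced above look roughly like the sketch below. This is only a Python-side illustration; the blueprint sets the equivalent values in the `serveConfigV2` section of `ray-service-stablediffusion.yaml`, and the exact numbers there may differ.

```python
# Sketch of the Ray Serve autoscaling knobs referenced above (illustrative only).
from ray import serve


@serve.deployment(
    ray_actor_options={"num_gpus": 1},  # one GPU per replica, so one replica per g5.2xlarge node
    autoscaling_config={
        "min_replicas": 1,   # keep at least one replica warm
        "max_replicas": 4,   # cap GPU spend
        "target_num_ongoing_requests_per_replica": 1,  # scale out once a replica has more than 1 in-flight request
    },
)
class StableDiffusionV2:
    ...
```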

**To simulate autoscaling:**
1. **Generate Load**: Create a script or use a load-testing tool to send a burst of image generation requests to your Stable Diffusion service (a sample script follows this list).
2. **Observe (Ray Dashboard)**: Access the Ray Dashboard (via port-forwarding or public NLB if configured) at http://your-cluster/dashboard. Observe how these metrics change:
   - The number of replicas for your deployment.
   - The number of nodes in your Ray cluster.
3. **Observe (Kubernetes)**: Use `kubectl get pods -n stablediffusion` to see the creation of new pods. Use `kubectl get nodes` to observe new nodes provisioned by Karpenter.
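
A minimal load-generation sketch along the lines of step 1 is shown below. It is illustrative only and assumes a local port-forward of the serve service to port 8000 (for example, `kubectl -n stablediffusion port-forward svc/stablediffusion-service-serve-svc 8000:8000`) as well as the `/imagine` parameters from the earlier sketch.

```python
# Fire a burst of concurrent image generation requests so you can watch replicas and nodes scale.
from concurrent.futures import ThreadPoolExecutor

import requests

ENDPOINT = "http://localhost:8000/imagine"  # assumes a local port-forward of the serve service
PROMPT = "a photograph of an astronaut riding a horse"


def fire(i: int) -> int:
    # Each request is an independent generation; vary the prompt slightly to avoid caching effects.
    resp = requests.get(ENDPOINT, params={"prompt": f"{PROMPT}, variation {i}"}, timeout=600)
    return resp.status_code


if __name__ == "__main__":
    # 16 concurrent requests against target_num_ongoing_requests_per_replica=1 should push
    # Ray Serve to request more replicas and Karpenter to provision additional GPU nodes.
    with ThreadPoolExecutor(max_workers=16) as pool:
        for status in pool.map(fire, range(16)):
            print("HTTP", status)
```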


## Cleanup
Finally, we'll provide instructions for cleaning up and deprovisioning the resources when they are no longer needed.

**Step 1:** Delete the Ray Cluster

```bash
cd data-on-eks/gen-ai/inference/stable-diffusion-rayserve-gpu
kubectl delete -f ray-service-stablediffusion.yaml
```

**Step 2:** Clean up the EKS Cluster

This script will clean up the environment using the `-target` option to ensure all resources are deleted in the correct order.

```bash
export AWS_DEFAULT_REGION="<DEPLOYED_EKS_CLUSTER_REGION>"
cd data-on-eks/ai-ml/jark-stack/ && chmod +x cleanup.sh
./cleanup.sh
```