awslabs · vara-bonthu · Apr 5, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 5, 2024
diff --git a/ai-ml/jark-stack/terraform/README.md b/ai-ml/jark-stack/terraform/README.md
@@ -20,13 +20,14 @@ Docs coming soon...
 | Name | Version |
 |------|---------|
 | <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.72 |
+| <a name="provider_aws.ecr"></a> [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
 | <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 2.10 |
 
 ## Modules
 
 | Name | Source | Version |
 |------|--------|---------|
-| <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.1 |
+| <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.31.4 |
 | <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
 | <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
 | <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
@@ -41,13 +42,16 @@ Docs coming soon...
 | [kubernetes_namespace_v1.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
 | [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
 | [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
+| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
+| [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source |
 | [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source |
 
 ## Inputs
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
-| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no |
+| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.29"` | no |
+| <a name="input_enable_aws_efa_k8s_device_plugin"></a> [enable\_aws\_efa\_k8s\_device\_plugin](#input\_enable\_aws\_efa\_k8s\_device\_plugin) | Enable AWS EFA K8s Device Plugin | `bool` | `false` | no |
 | <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
 | <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"jark-stack"` | no |
 | <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |

diff --git a/ai-ml/jark-stack/terraform/addons.tf b/ai-ml/jark-stack/terraform/addons.tf
@@ -105,28 +105,49 @@ module "eks_blueprints_addons" {
     values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
   }
 
-  helm_releases = {
-    #---------------------------------------
-    # NVIDIA Device Plugin Add-on
-    #---------------------------------------
-    nvidia-device-plugin = {
-      description      = "A Helm chart for NVIDIA Device Plugin"
-      namespace        = "nvidia-device-plugin"
-      create_namespace = true
-      chart            = "nvidia-device-plugin"
-      chart_version    = "0.14.0"
-      repository       = "https://nvidia.github.io/k8s-device-plugin"
-      values           = [file("${path.module}/helm-values/nvidia-values.yaml")]
+  #---------------------------------------
+  # Karpenter Autoscaler for EKS Cluster
+  #---------------------------------------
+  enable_karpenter                  = true
+  karpenter_enable_spot_termination = true
+  karpenter_node = {
+    iam_role_additional_policies = {
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
     }
   }
+  karpenter = {
+    chart_version       = "v0.34.0"
+    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+    repository_password = data.aws_ecrpublic_authorization_token.token.password
+  }
+
+  #---------------------------------------
+  # Argo Workflows & Argo Events
+  #---------------------------------------
+  enable_argo_workflows = true
+  argo_workflows = {
+    name       = "argo-workflows"
+    namespace  = "argo-workflows"
+    repository = "https://argoproj.github.io/argo-helm"
+    values     = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})]
+  }
+
+  enable_argo_events = true
+  argo_events = {
+    name       = "argo-events"
+    namespace  = "argo-events"
+    repository = "https://argoproj.github.io/argo-helm"
+    values     = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})]
+  }
+
 }
 
 #---------------------------------------------------------------
 # Data on EKS Kubernetes Addons
 #---------------------------------------------------------------
 module "data_addons" {
   source  = "aws-ia/eks-data-addons/aws"
-  version = "~> 1.1" # ensure to update this to the latest/desired version
+  version = "~> 1.31.4" # ensure to update this to the latest/desired version
 
   oidc_provider_arn = module.eks.oidc_provider_arn
 
@@ -140,19 +161,147 @@ module "data_addons" {
     values           = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
   }
 
+  enable_volcano = true
+  #---------------------------------------
+  # Kuberay Operator
+  #---------------------------------------
+  enable_kuberay_operator = true
+  kuberay_operator_helm_config = {
+    version = "1.1.0"
+    # Enabling Volcano as Batch scheduler for KubeRay Operator
+    values = [
+      <<-EOT
+      batchScheduler:
+        enabled: true
+    EOT
+    ]
+  }
+
   #---------------------------------------------------------------
-  # KubeRay Operator Add-on
+  # NVIDIA Device Plugin Add-on
   #---------------------------------------------------------------
-  enable_kuberay_operator = true
+  enable_nvidia_device_plugin = true
+  nvidia_device_plugin_helm_config = {
+    version = "v0.14.5"
+    name    = "nvidia-device-plugin"
+    values = [
+      <<-EOT
+        gfd:
+          enabled: true
+        nfd:
+          worker:
+            tolerations:
+              - key: nvidia.com/gpu
+                operator: Exists
+                effect: NoSchedule
+              - operator: "Exists"
+      EOT
+    ]
+  }
 
   #---------------------------------------
   # EFA Device Plugin Add-on
   #---------------------------------------
-  enable_aws_efa_k8s_device_plugin = true
+  # IMPORTANT: Enable EFA only on nodes with EFA devices attached.
+  # Otherwise, you'll encounter the "No devices found..." error. Restart the pod after attaching an EFA device, or use a node selector to prevent incompatible scheduling.
+  enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin
   aws_efa_k8s_device_plugin_helm_config = {
     values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")]
   }
 
+  #---------------------------------------------------------------
+  # Karpenter Resources Add-on
+  #---------------------------------------------------------------
+  enable_karpenter_resources = true
+  karpenter_resources_helm_config = {
+    g5-gpu-karpenter = {
+      values = [
+        <<-EOT
+      name: g5-gpu-karpenter
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[2]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        instanceStorePolicy: RAID0
+
+      nodePool:
+        labels:
+          - type: karpenter
+          - NodeGroupType: g5-gpu-karpenter
+        taints:
+          - key: nvidia.com/gpu
+            value: "Exists"
+            effect: "NoSchedule"
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["g5"]
+          - key: "karpenter.k8s.aws/instance-size"
+            operator: In
+            values: [ "2xlarge", "4xlarge", "8xlarge"]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: ["spot", "on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 180s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+    x86-cpu-karpenter = {
+      values = [
+        <<-EOT
+      name: x86-cpu-karpenter
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[3]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        instanceStorePolicy: RAID0
+
+      nodePool:
+        labels:
+          - type: karpenter
+          - NodeGroupType: x86-cpu-karpenter
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["m5"]
+          - key: "karpenter.k8s.aws/instance-size"
+            operator: In
+            values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: ["spot", "on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 180s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+  }
+
   depends_on = [
     kubernetes_secret_v1.huggingface_token,
     kubernetes_config_map_v1.notebook

diff --git a/ai-ml/jark-stack/terraform/examples/inference/stablediffusion-rayserve/Dockerfile b/ai-ml/jark-stack/terraform/examples/inference/stablediffusion-rayserve/Dockerfile
@@ -0,0 +1,26 @@
+# https://hub.docker.com/layers/rayproject/ray-ml/2.10.0-py310-gpu/images/sha256-4181ed53b0b25a758b155312ca6ab29a65cb78cd57296d42cfbe4806a2b77df4?context=explore
+# docker buildx build --platform=linux/amd64 -t ray2.10.0-py310-gpu-stablediffusion:v1.0 -f Dockerfile .
+
+# Use Ray base image
+FROM rayproject/ray-ml:2.10.0-py310-gpu
+
+# Maintainer label
+LABEL maintainer="DoEKS"
+
+# Suppress interactive apt/dpkg prompts; debconf only recognizes the value "noninteractive" (no hyphen)
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run the remaining steps as $USER. NOTE(review): $USER is not defined in this file - confirm the base image sets it
+USER $USER
+
+# Install additional Python packages; diffusers and transformers are pinned with pip's "==" operator
+RUN pip install --no-cache-dir requests torch "diffusers==0.12.1" "transformers==4.25.1"
+
+# Set a working directory
+WORKDIR /serve_app
+
+# Copy your Ray Serve script into the container
+COPY ray_serve_sd.py /serve_app/ray_serve_sd.py
+
+# Prepend the app directory to PYTHONPATH so the copied script can be imported as a module
+ENV PYTHONPATH=/serve_app:$PYTHONPATH
diff --git a/ai-ml/jark-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/Dockerfile b/ai-ml/jark-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/Dockerfile
@@ -0,0 +1,13 @@
+# Use Python base image
+FROM --platform=linux/amd64 python:3.9-slim
+
+# Set working directory in the container
+WORKDIR /app
+
+# Install dependencies before copying the script so this layer is reused when only the script changes
+RUN pip install --no-cache-dir gradio requests Pillow
+
+# Copy the Python script into the container
+COPY gradio-app.py /app/gradio-app.py
+# Command to run the Python script
+ENTRYPOINT ["python", "gradio-app.py"]
diff --git a/.../jark-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/gradio-app.py b/.../jark-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/gradio-app.py
@@ -0,0 +1,32 @@
+import gradio as gr
+import requests
+import os
+from PIL import Image
+from io import BytesIO
+
+# Route and base URL of the model service; each can be overridden through an environment variable
+model_endpoint = os.environ.get("MODEL_ENDPOINT", "/imagine")
+service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")  # a full base URL (scheme://host:port), despite the name
+
+# Function to generate image based on prompt
+def generate_image(prompt):
+
+    # Create the URL for the inference
+    url = f"{service_name}{model_endpoint}"
+
+    try:
+        # Send the request to the model service
+        response = requests.get(url, params={"prompt": prompt}, timeout=180)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        i = Image.open(BytesIO(response.content))
+        return i
+
+    except requests.exceptions.RequestException as e:
+        # Handle any request exceptions (e.g., connection errors)
+        # return f"AI: Error: {str(e)}"
+        return Image.new('RGB', (100, 100), color='red')
+
+# Define the Gradio PromptInterface
+demo = gr.Interface(fn=generate_image,
+                    inputs = [gr.Textbox(label="Enter the Prompt")],
+                    outputs = gr.Image(type='pil')).launch(server_name="0.0.0.0")
diff --git a/...-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/gradio-deploy.yaml b/...-stack/terraform/examples/inference/stablediffusion-rayserve/gradio-ui/gradio-deploy.yaml
@@ -0,0 +1,58 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gradio  # namespace referenced by the Deployment and Service in this manifest
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gradio-deployment
+  namespace: gradio
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio  # matches spec.selector.matchLabels above and the gradio-service selector
+    spec:
+      containers:
+      - name: gradio
+        # Update this image to the Gradio app image you want to deploy
+        image: public.ecr.aws/data-on-eks/gradio-app:sd-v1.0
+        imagePullPolicy: IfNotPresent
+        ports:
+        - containerPort: 7860  # same port as the gradio-service targetPort
+        resources:
+          requests:
+            cpu: "512m"
+            memory: "2048Mi"
+          limits:
+            cpu: "1"
+            memory: "4096Mi"
+        env:
+        - name: MODEL_ENDPOINT  # gradio-app.py appends this path to SERVICE_NAME to build the request URL
+          value: "/imagine"
+        # NOTE: this URL is hardcoded to match the Stable Diffusion service created by this blueprint.
+        # If the RayServe deployment's service name, namespace, or port changes, update this value to match.
+        - name: SERVICE_NAME
+          value: "http://stablediffusion-service.stablediffusion.svc.cluster.local:8000"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio-service
+  namespace: gradio
+spec:
+  selector:
+    app: gradio  # selects pods labelled app: gradio (the gradio-deployment pod template)
+  ports:
+  - name: http
+    protocol: TCP
+    port: 7860
+    targetPort: 7860  # the gradio container's containerPort
+  type: ClusterIP  # reachable only from inside the cluster; this manifest defines no external exposure