From b16c4c576b5436818ea29d86f4ddce3705c3d165 Mon Sep 17 00:00:00 2001 From: Ratnopam Charabarti Date: Tue, 20 Feb 2024 12:20:13 -0600 Subject: [PATCH] refactor: Upgrade ray version for stable diffusion inference (#438) --- ai-ml/trainium-inferentia/addons.tf | 2 +- .../stable-diffusion-inf2/Dockerfile | 3 +- .../ray-service-stablediffusion.yaml | 52 +++++++++++++------ .../ray_serve_stablediffusion.py | 12 ++--- .../troubleshooting/troubleshooting.md | 16 ++++++ 5 files changed, 58 insertions(+), 27 deletions(-) diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 6bf894f84..c523bb1ed 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -268,7 +268,7 @@ module "eks_data_addons" { #--------------------------------------- enable_kuberay_operator = true kuberay_operator_helm_config = { - version = "1.0.0-rc.0" + version = "1.0.0" # Enabling Volcano as Batch scheduler for KubeRay Operator values = [ <<-EOT diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile index cf51d2fb1..50351405a 100644 --- a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile @@ -1,4 +1,5 @@ -FROM rayproject/ray:2.7.1-py310 +# https://hub.docker.com/layers/rayproject/ray/2.9.0-py310/images/sha256-846cda01841c6c11610292aba8f190d49cc54844d1d578b307678cab5076ef98?context=explore +FROM rayproject/ray:2.9.0-py310 # Maintainer label LABEL maintainer="DoEKS" diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml index 42d1e2f9a..ebde03cb8 100644 --- 
a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml @@ -4,7 +4,7 @@ metadata: name: stablediffusion --- -apiVersion: ray.io/v1alpha1 +apiVersion: ray.io/v1 kind: RayService metadata: name: stablediffusion-service @@ -12,13 +12,33 @@ metadata: spec: serviceUnhealthySecondThreshold: 900 deploymentUnhealthySecondThreshold: 300 - serveConfig: - importPath: ray_serve_stablediffusion:entrypoint # Specify the correct path to your Python script - runtimeEnv: | - env_vars: {"MODEL_ID": "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024"} # Replace with the appropriate model ID - + serveConfigV2: | + applications: + - name: stable-diffusion-deployment + import_path: "ray_serve_stablediffusion:entrypoint" + route_prefix: "/" + runtime_env: + env_vars: + MODEL_ID: "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024" + NEURON_CC_FLAGS: "-O1" + deployments: + - name: stable-diffusion-v2 + autoscaling_config: + metrics_interval_s: 0.2 + min_replicas: 8 + max_replicas: 12 + look_back_period_s: 2 + downscale_delay_s: 30 + upscale_delay_s: 2 + target_num_ongoing_requests_per_replica: 1 + graceful_shutdown_timeout_s: 5 + max_concurrent_queries: 100 + ray_actor_options: + num_cpus: 10 + resources: {"neuron_cores": 2} rayClusterConfig: - rayVersion: '2.7.1' + rayVersion: '2.9.0' + enableInTreeAutoscaling: true headGroupSpec: serviceType: NodePort headService: @@ -31,7 +51,7 @@ spec: spec: containers: - name: ray-head - image: public.ecr.aws/data-on-eks/ray2.7.1-py310-stablediffusion-neuron:latest # Image created using the Dockerfile attached in the folder + image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:v1.0 imagePullPolicy: Always # Ensure the image is always pulled when updated lifecycle: preStop: @@ -39,7 +59,7 @@ spec: command: ["/bin/sh", "-c", "ray stop"] ports: - containerPort: 6379 - name: gcs + name: 
gcs-server - containerPort: 8265 name: dashboard - containerPort: 10001 @@ -66,15 +86,15 @@ spec: workerGroupSpecs: - groupName: inf2-worker-group - replicas: 1 - minReplicas: 1 - maxReplicas: 1 + # replicas: 1 + minReplicas: 2 + maxReplicas: 8 rayStartParams: {} template: spec: containers: - name: ray-worker - image: public.ecr.aws/data-on-eks/ray2.7.1-py310-stablediffusion-neuron:latest # Image created using the Dockerfile attached in the folder + image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:v1.0 imagePullPolicy: Always # Ensure the image is always pulled when updated lifecycle: preStop: @@ -87,10 +107,8 @@ spec: memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge requests: - cpu: "45" # Half of vCPUs for reservation for inf2.24xlarge; leaving 3 vCPUs for daemonset overhead - memory: "180G" # Half of memory for reservation for inf2.24xlarge; leaving 12G for daemonset overhead - # Set maximum neuron core available to the instance to acocomodate multiple requests to leverage all the neuron cores - # You cannot run multiple pods of the model on the same instance unless you shard the model + cpu: "90" # All vCPUs of inf2.24xlarge; 6vCPU daemonset overhead + memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge nodeSelector: instanceType: inferentia-inf2 diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py index c312d92c7..acb8d07e1 100644 --- a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py @@ -10,7 +10,7 @@ neuron_cores = 2 -@serve.deployment(num_replicas=1, route_prefix="/") 
+@serve.deployment(name="stable-diffusion-api", num_replicas=1, route_prefix="/") @serve.ingress(app) class APIIngress: def __init__(self, diffusion_model_handle) -> None: @@ -22,20 +22,17 @@ def __init__(self, diffusion_model_handle) -> None: response_class=Response, ) async def generate(self, prompt: str): - - image_ref = await self.handle.generate.remote(prompt) - image = await image_ref + image = await self.handle.generate.remote(prompt) file_stream = BytesIO() image.save(file_stream, "PNG") return Response(content=file_stream.getvalue(), media_type="image/png") - -@serve.deployment( +@serve.deployment(name="stable-diffusion-v2", + autoscaling_config={"min_replicas": 0, "max_replicas": 6}, ray_actor_options={ "resources": {"neuron_cores": neuron_cores}, "runtime_env": {"env_vars": {"NEURON_CC_FLAGS": "-O1"}}, }, - autoscaling_config={"min_replicas": 1, "max_replicas": 1}, ) class StableDiffusionV2: def __init__(self): @@ -47,7 +44,6 @@ def __init__(self): self.pipe = NeuronStableDiffusionXLPipeline.from_pretrained(compiled_model_id, device_ids=[0, 1]) async def generate(self, prompt: str): - assert len(prompt), "prompt parameter cannot be empty" image = self.pipe(prompt).images[0] return image diff --git a/website/docs/blueprints/troubleshooting/troubleshooting.md b/website/docs/blueprints/troubleshooting/troubleshooting.md index 33865e5d3..12310f93f 100644 --- a/website/docs/blueprints/troubleshooting/troubleshooting.md +++ b/website/docs/blueprints/troubleshooting/troubleshooting.md @@ -220,3 +220,19 @@ Delete the existing log group by updating log group name and the region. ```sh aws logs delete-log-group --log-group-name --region ``` + +## Karpenter Error - Missing Service Linked Role + +Karpenter throws below error while trying to create new instances. 
+
+```
+"error":"launching nodeclaim, creating instance, with fleet error(s), AuthFailure.ServiceLinkedRoleCreationNotPermitted: The provided credentials do not have permission to create the service-linked role for EC2 Spot Instances."}
+```
+
+**Solution:**
+
+You will need to create the service-linked role in the AWS account you're using to avoid the `ServiceLinkedRoleCreationNotPermitted` error.
+
+```sh
+aws iam create-service-linked-role --aws-service-name spot.amazonaws.com
+```