From b16c4c576b5436818ea29d86f4ddce3705c3d165 Mon Sep 17 00:00:00 2001 From: Ratnopam Charabarti Date: Tue, 20 Feb 2024 12:20:13 -0600 Subject: [PATCH] refactor: Upgrade ray version for stable diffusion inference (#438) --- ai-ml/trainium-inferentia/addons.tf | 2 +- .../stable-diffusion-inf2/Dockerfile | 3 +- .../ray-service-stablediffusion.yaml | 52 +++++++++++++------ .../ray_serve_stablediffusion.py | 12 ++--- .../troubleshooting/troubleshooting.md | 16 ++++++ 5 files changed, 58 insertions(+), 27 deletions(-) diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 6bf894f84..c523bb1ed 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -268,7 +268,7 @@ module "eks_data_addons" { #--------------------------------------- enable_kuberay_operator = true kuberay_operator_helm_config = { - version = "1.0.0-rc.0" + version = "1.0.0" # Enabling Volcano as Batch scheduler for KubeRay Operator values = [ <<-EOT diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile index cf51d2fb1..50351405a 100644 --- a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/Dockerfile @@ -1,4 +1,5 @@ -FROM rayproject/ray:2.7.1-py310 +# https://hub.docker.com/layers/rayproject/ray/2.9.0-py310/images/sha256-846cda01841c6c11610292aba8f190d49cc54844d1d578b307678cab5076ef98?context=explore +FROM rayproject/ray:2.9.0-py310 # Maintainer label LABEL maintainer="DoEKS" diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml index 42d1e2f9a..ebde03cb8 100644 --- 
a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray-service-stablediffusion.yaml @@ -4,7 +4,7 @@ metadata: name: stablediffusion --- -apiVersion: ray.io/v1alpha1 +apiVersion: ray.io/v1 kind: RayService metadata: name: stablediffusion-service @@ -12,13 +12,33 @@ metadata: spec: serviceUnhealthySecondThreshold: 900 deploymentUnhealthySecondThreshold: 300 - serveConfig: - importPath: ray_serve_stablediffusion:entrypoint # Specify the correct path to your Python script - runtimeEnv: | - env_vars: {"MODEL_ID": "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024"} # Replace with the appropriate model ID - + serveConfigV2: | + applications: + - name: stable-diffusion-deployment + import_path: "ray_serve_stablediffusion:entrypoint" + route_prefix: "/" + runtime_env: + env_vars: + MODEL_ID: "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024" + NEURON_CC_FLAGS: "-O1" + deployments: + - name: stable-diffusion-v2 + autoscaling_config: + metrics_interval_s: 0.2 + min_replicas: 8 + max_replicas: 12 + look_back_period_s: 2 + downscale_delay_s: 30 + upscale_delay_s: 2 + target_num_ongoing_requests_per_replica: 1 + graceful_shutdown_timeout_s: 5 + max_concurrent_queries: 100 + ray_actor_options: + num_cpus: 10 + resources: {"neuron_cores": 2} rayClusterConfig: - rayVersion: '2.7.1' + rayVersion: '2.9.0' + enableInTreeAutoscaling: true headGroupSpec: serviceType: NodePort headService: @@ -31,7 +51,7 @@ spec: spec: containers: - name: ray-head - image: public.ecr.aws/data-on-eks/ray2.7.1-py310-stablediffusion-neuron:latest # Image created using the Dockerfile attached in the folder + image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:v1.0 imagePullPolicy: Always # Ensure the image is always pulled when updated lifecycle: preStop: @@ -39,7 +59,7 @@ spec: command: ["/bin/sh", "-c", "ray stop"] ports: - containerPort: 6379 - name: gcs + name: 
gcs-server - containerPort: 8265 name: dashboard - containerPort: 10001 @@ -66,15 +86,15 @@ spec: workerGroupSpecs: - groupName: inf2-worker-group - replicas: 1 - minReplicas: 1 - maxReplicas: 1 + # replicas: 1 + minReplicas: 2 + maxReplicas: 8 rayStartParams: {} template: spec: containers: - name: ray-worker - image: public.ecr.aws/data-on-eks/ray2.7.1-py310-stablediffusion-neuron:latest # Image created using the Dockerfile attached in the folder + image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:v1.0 imagePullPolicy: Always # Ensure the image is always pulled when updated lifecycle: preStop: @@ -87,10 +107,8 @@ spec: memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge requests: - cpu: "45" # Half of vCPUs for reservation for inf2.24xlarge; leaving 3 vCPUs for daemonset overhead - memory: "180G" # Half of memory for reservation for inf2.24xlarge; leaving 12G for daemonset overhead - # Set maximum neuron core available to the instance to acocomodate multiple requests to leverage all the neuron cores - # You cannot run multiple pods of the model on the same instance unless you shard the model + cpu: "90" # All vCPUs of inf2.24xlarge; 6vCPU daemonset overhead + memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge nodeSelector: instanceType: inferentia-inf2 diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py index c312d92c7..acb8d07e1 100644 --- a/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py +++ b/ai-ml/trainium-inferentia/examples/ray-serve/stable-diffusion-inf2/ray_serve_stablediffusion.py @@ -10,7 +10,7 @@ neuron_cores = 2 -@serve.deployment(num_replicas=1, route_prefix="/") 
+@serve.deployment(name="stable-diffusion-api", num_replicas=1, route_prefix="/") @serve.ingress(app) class APIIngress: def __init__(self, diffusion_model_handle) -> None: @@ -22,20 +22,17 @@ def __init__(self, diffusion_model_handle) -> None: response_class=Response, ) async def generate(self, prompt: str): - - image_ref = await self.handle.generate.remote(prompt) - image = await image_ref + image = await self.handle.generate.remote(prompt) file_stream = BytesIO() image.save(file_stream, "PNG") return Response(content=file_stream.getvalue(), media_type="image/png") - -@serve.deployment( +@serve.deployment(name="stable-diffusion-v2", + autoscaling_config={"min_replicas": 0, "max_replicas": 6}, ray_actor_options={ "resources": {"neuron_cores": neuron_cores}, "runtime_env": {"env_vars": {"NEURON_CC_FLAGS": "-O1"}}, }, - autoscaling_config={"min_replicas": 1, "max_replicas": 1}, ) class StableDiffusionV2: def __init__(self): @@ -47,7 +44,6 @@ def __init__(self): self.pipe = NeuronStableDiffusionXLPipeline.from_pretrained(compiled_model_id, device_ids=[0, 1]) async def generate(self, prompt: str): - assert len(prompt), "prompt parameter cannot be empty" image = self.pipe(prompt).images[0] return image diff --git a/website/docs/blueprints/troubleshooting/troubleshooting.md b/website/docs/blueprints/troubleshooting/troubleshooting.md index 33865e5d3..12310f93f 100644 --- a/website/docs/blueprints/troubleshooting/troubleshooting.md +++ b/website/docs/blueprints/troubleshooting/troubleshooting.md @@ -220,3 +220,19 @@ Delete the existing log group by updating log group name and the region. ```sh aws logs delete-log-group --log-group-name --region ``` + +## Karpenter Error - Missing Service Linked Role + +Karpenter throws below error while trying to create new instances. 
+
+```
+"error":"launching nodeclaim, creating instance, with fleet error(s), AuthFailure.ServiceLinkedRoleCreationNotPermitted: The provided credentials do not have permission to create the service-linked role for EC2 Spot Instances."}
+```
+
+**Solution:**
+
+You will need to create the service-linked role in the AWS account you're using to avoid the `ServiceLinkedRoleCreationNotPermitted` error.
+
+```sh
+aws iam create-service-linked-role --aws-service-name spot.amazonaws.com
+```