Commit
Fix iam issues in AI/ML chatbot PR and update to Karpenter v1 (#1110)
svennam92 authored Sep 26, 2024
1 parent 47b9896 commit 59fffc4
Showing 10 changed files with 117 additions and 87 deletions.
1 change: 1 addition & 0 deletions lab/iam/policies/labs2.yaml
@@ -34,6 +34,7 @@ Statement:
Resource:
- !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/${Env}*
- !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/eks-workshop*
- !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter*
- Effect: Allow
Action:
- vpc-lattice:List*
35 changes: 11 additions & 24 deletions manifests/modules/aiml/chatbot/.workshop/cleanup.sh
@@ -2,43 +2,30 @@

set -e

logmessage "Deleting AIML resources..."

logmessage "Deleting Gradio-UI Components..."

kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/gradio --ignore-not-found=true
kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/gradio --ignore-not-found

logmessage "Deleting Llama2 pods..."

kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin-rbac.yml --ignore-not-found
kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin.yml --ignore-not-found
kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/ray-service-llama2-chatbot --ignore-not-found

logmessage "Deleting Neuron Device Plugin..."

kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/neuron-device-plugin --ignore-not-found=true
kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin-rbac.yml --ignore-not-found
kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin.yml --ignore-not-found

logmessage "Un-installing kuberay operator..."

helm uninstall kuberay-operator

kubectl delete namespace llama2 --ignore-not-found

kubectl delete namespace gradio-llama2-inf2 --ignore-not-found
helm uninstall kuberay-operator --ignore-not-found

logmessage "Deleting Karpenter NodePool and EC2NodeClass..."
logmessage "Deleting Karpenter resources..."

delete-all-if-crd-exists nodepools.karpenter.sh
delete-all-if-crd-exists ec2nodeclasses.karpenter.k8s.aws
kubectl kustomize ~/environment/eks-workshop/modules/aiml/chatbot/nodepool \
| envsubst | kubectl delete -f-

logmessage "Waiting for Karpenter nodes to be removed..."
logmessage "Deleting llama2 and gradio-llama2-inf2 namespaces..."

EXIT_CODE=0

timeout --foreground -s TERM 30 bash -c \
'while [[ $(kubectl get nodes --selector=type=karpenter -o json | jq -r ".items | length") -gt 0 ]];\
do sleep 5;\
done' || EXIT_CODE=$?
kubectl delete namespace llama2 --ignore-not-found

if [ $EXIT_CODE -ne 0 ]; then
logmessage "Warning: Karpenter nodes did not clean up"
fi
kubectl delete namespace gradio-llama2-inf2 --ignore-not-found
91 changes: 78 additions & 13 deletions manifests/modules/aiml/chatbot/.workshop/terraform/main.tf
@@ -1,3 +1,7 @@
locals {
namespace = "kube-system"
}

terraform {
required_providers {
kubectl = {
@@ -16,29 +16,22 @@ data "aws_ecrpublic_authorization_token" "token" {
provider = aws.virginia
}

# Addons for ALB Controller

module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
version = "1.16.3"

enable_aws_load_balancer_controller = true
# turn off the mutating webhook for services because we are using
# retrieved from Data on EKS
aws_load_balancer_controller = {
wait = true
role_name = "${var.addon_context.eks_cluster_id}-alb-controller"
policy_name = "${var.addon_context.eks_cluster_id}-alb-controller"
# turn off the mutating webhook
set = [{
name = "enableServiceMutatorWebhook"
value = "false"
}]
wait = true
}

enable_karpenter = true

karpenter_enable_spot_termination = true
karpenter_enable_instance_profile_creation = true
karpenter = {
chart_version = var.karpenter_version
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}

cluster_name = var.addon_context.eks_cluster_id
@@ -58,3 +55,71 @@ data "aws_subnets" "private" {
values = ["*Private*"]
}
}

# Pod identity for Karpenter

resource "aws_eks_addon" "pod_identity" {
cluster_name = var.addon_context.eks_cluster_id
addon_name = "eks-pod-identity-agent"
resolve_conflicts_on_create = "OVERWRITE"
preserve = false
}

# Karpenter controller & Node IAM roles, SQS Queue, Eventbridge Rules

module "karpenter" {
source = "terraform-aws-modules/eks/aws//modules/karpenter"
version = "~> 20.24"

cluster_name = var.addon_context.eks_cluster_id
enable_v1_permissions = true
namespace = local.namespace

iam_role_name = "${var.addon_context.eks_cluster_id}-karpenter-controller"
iam_role_use_name_prefix = false
iam_policy_name = "${var.addon_context.eks_cluster_id}-karpenter-controller"
iam_policy_use_name_prefix = false
node_iam_role_name = "${var.addon_context.eks_cluster_id}-karpenter-node"
node_iam_role_use_name_prefix = false
queue_name = "${var.addon_context.eks_cluster_id}-karpenter"
rule_name_prefix = "eks-workshop"
create_pod_identity_association = true

tags = {
created-by = "eks-workshop-v2"
env = var.addon_context.eks_cluster_id
}
}

# Helm chart

resource "helm_release" "karpenter" {
name = "karpenter"
namespace = local.namespace
create_namespace = true
repository = "oci://public.ecr.aws/karpenter"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
chart = "karpenter"
# renovate: datasource=github-releases depName=aws/karpenter-provider-aws
version = "1.0.2"
wait = true

values = [
<<-EOT
settings:
clusterName: ${var.addon_context.eks_cluster_id}
clusterEndpoint: ${var.addon_context.aws_eks_cluster_endpoint}
interruptionQueue: ${module.karpenter.queue_name}
tolerations:
- key: CriticalAddonsOnly
operator: Exists
EOT
]

lifecycle {
ignore_changes = [
repository_password
]
}
}
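
After `terraform apply`, this change can be sanity-checked by confirming that Karpenter now runs as a deployment in `kube-system` and that the controller's EKS Pod Identity association exists. A minimal sketch; the deployment and service account names follow the chart defaults and are assumptions here, as is the `$EKS_CLUSTER_NAME` variable:

```bash
# Karpenter is now installed into kube-system rather than its own namespace
kubectl get deployment karpenter -n kube-system

# The karpenter module creates an EKS Pod Identity association for the controller
aws eks list-pod-identity-associations \
  --cluster-name $EKS_CLUSTER_NAME \
  --namespace kube-system \
  --service-account karpenter
```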
4 changes: 2 additions & 2 deletions manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf
@@ -2,7 +2,7 @@ output "environment_variables" {
description = "Environment variables to be added to the IDE shell"
value = {
AIML_SUBNETS = "${data.aws_subnets.private.ids[0]},${data.aws_subnets.private.ids[1]},${data.aws_subnets.private.ids[2]}"
KARPENTER_NODE_ROLE = module.eks_blueprints_addons.karpenter.node_iam_role_name
KARPENTER_ARN = module.eks_blueprints_addons.karpenter.node_iam_role_arn
KARPENTER_NODE_ROLE = module.karpenter.node_iam_role_name
KARPENTER_ARN = module.karpenter.node_iam_role_arn
}
}
7 changes: 0 additions & 7 deletions manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf
@@ -33,10 +33,3 @@ variable "resources_precreated" {
description = "Have expensive resources been created already"
type = bool
}

variable "karpenter_version" {
description = "The version of Karpenter chart to use"
type = string
# renovate: datasource=github-releases depName=aws/karpenter-provider-aws
default = "0.37.2"
}
21 changes: 13 additions & 8 deletions manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml
@@ -1,4 +1,4 @@
apiVersion: karpenter.sh/v1beta1
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
name: inferentia-inf2
@@ -9,6 +9,10 @@ spec:
instanceType: inferentia-inf2
provisionerType: Karpenter
spec:
taints:
- key: aws.amazon.com/neuron
value: "true"
effect: "NoSchedule"
requirements:
- key: "karpenter.k8s.aws/instance-family"
operator: In
@@ -19,26 +23,27 @@ spec:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["on-demand", "spot"]
expireAfter: 720h
terminationGracePeriod: 24h
nodeClassRef:
group: karpenter.k8s.aws
kind: EC2NodeClass
name: inferentia-inf2
taints:
- key: aws.amazon.com/neuron
value: "true"
effect: "NoSchedule"
limits:
cpu: "512"
disruption:
consolidateAfter: 300s
consolidationPolicy: WhenEmpty
expireAfter: 720h
consolidationPolicy: WhenEmptyOrUnderutilized

---
apiVersion: karpenter.k8s.aws/v1beta1
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
name: inferentia-inf2
spec:
amiFamily: AL2
amiSelectorTerms:
- alias: al2@latest
blockDeviceMappings:
- deviceName: /dev/xvda
ebs:
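
Once the updated manifests are applied, the migrated resources can be inspected through the v1 API groups. A sketch; the CRD names match those referenced elsewhere in this commit, and the jsonpath query is illustrative:

```bash
# NodePool and EC2NodeClass are now served by karpenter.sh/v1 and karpenter.k8s.aws/v1
kubectl get nodepools.karpenter.sh
kubectl get ec2nodeclasses.karpenter.k8s.aws

# The Neuron taint added above means inf2 workloads need a matching toleration
kubectl get nodepool inferentia-inf2 -o jsonpath='{.spec.template.spec.taints}'
```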
13 changes: 9 additions & 4 deletions manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
@@ -1,4 +1,4 @@
apiVersion: karpenter.sh/v1beta1
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
name: x86-cpu-karpenter
@@ -21,22 +21,27 @@ spec:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["on-demand", "spot"]
expireAfter: 720h
terminationGracePeriod: 24h
nodeClassRef:
group: karpenter.k8s.aws
kind: EC2NodeClass
name: x86-cpu-karpenter
limits:
cpu: "256"
disruption:
consolidateAfter: 300s
consolidationPolicy: WhenEmpty
expireAfter: 720h
consolidationPolicy: WhenEmptyOrUnderutilized

---
apiVersion: karpenter.k8s.aws/v1beta1
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
name: x86-cpu-karpenter
spec:
amiFamily: AL2
amiSelectorTerms:
- alias: al2@latest
blockDeviceMappings:
- deviceName: /dev/xvda
ebs:
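
For reference, the cleanup script in this commit removes these pools with `kubectl kustomize ... | envsubst | kubectl delete -f-`, so the apply side presumably mirrors that pipeline. A sketch, assuming the workshop shell exports the substituted variables (for example `KARPENTER_NODE_ROLE` and `AIML_SUBNETS` from `outputs.tf`):

```bash
# Render the node pool kustomization, substitute environment variables, and apply it
kubectl kustomize ~/environment/eks-workshop/modules/aiml/chatbot/nodepool \
  | envsubst | kubectl apply -f-
```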
18 changes: 0 additions & 18 deletions manifests/modules/aiml/inferentia/.workshop/cleanup.sh
@@ -5,21 +5,3 @@ set -e
logmessage "Deleting AIML resources..."

kubectl delete namespace aiml --ignore-not-found

logmessage "Deleting Karpenter NodePool and EC2NodeClass..."

delete-all-if-crd-exists nodepools.karpenter.sh
delete-all-if-crd-exists ec2nodeclasses.karpenter.k8s.aws

logmessage "Waiting for Karpenter nodes to be removed..."

EXIT_CODE=0

timeout --foreground -s TERM 30 bash -c \
'while [[ $(kubectl get nodes --selector=type=karpenter -o json | jq -r ".items | length") -gt 0 ]];\
do sleep 5;\
done' || EXIT_CODE=$?

if [ $EXIT_CODE -ne 0 ]; then
logmessage "Warning: Karpenter nodes did not clean up"
fi
2 changes: 1 addition & 1 deletion website/docs/aiml/chatbot/gradio.md
@@ -46,7 +46,7 @@ kube-root-ca.crt 1 111s

Once the load balancer has finished deploying, use the external IP address to directly access the website:

```bash
```bash wait=10
$ kubectl get services -n gradio-llama2-inf2
NAME             TYPE           CLUSTER-IP     EXTERNAL-IP                                                                   PORT(S)        AGE
gradio-service LoadBalancer 172.20.84.26 k8s-gradioll-gradiose-a6d0b586ce-06885d584b38b400.elb.us-west-2.amazonaws.com 80:30802/TCP 8m42s
12 changes: 2 additions & 10 deletions website/docs/aiml/chatbot/nodepool.md
@@ -12,20 +12,12 @@ To learn more about Karpenter, check out the [Karpenter module](../../autoscalin
Karpenter has already been installed in our EKS Cluster and runs as a deployment:

```bash
$ kubectl get deployment -n karpenter
$ kubectl get deployment -n kube-system
NAME READY UP-TO-DATE AVAILABLE AGE
...
karpenter 2/2 2 2 11m
```

As we did in a previous lab, we need to update our EKS IAM mappings to allow Karpenter nodes to join the cluster:

```bash
$ eksctl create iamidentitymapping --cluster $EKS_CLUSTER_NAME \
--region $AWS_REGION --arn $KARPENTER_ARN \
--group system:bootstrappers --group system:nodes \
--username system:node:{{EC2PrivateDNSName}}
```

Since the Ray Cluster creates head and worker pods with different specifications for handling various EC2 families, we'll create two separate node pools to handle the workload demands.

Here's the first Karpenter `NodePool` that will provision one `Head Pod` on `x86 CPU` instances:
