diff --git a/lab/iam/policies/labs2.yaml b/lab/iam/policies/labs2.yaml
index 908e42aa7..ea28aab64 100644
--- a/lab/iam/policies/labs2.yaml
+++ b/lab/iam/policies/labs2.yaml
@@ -34,6 +34,7 @@ Statement:
     Resource:
       - !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/${Env}*
       - !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/eks-workshop*
+      - !Sub arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter*
   - Effect: Allow
     Action:
       - vpc-lattice:List*
diff --git a/manifests/modules/aiml/chatbot/.workshop/cleanup.sh b/manifests/modules/aiml/chatbot/.workshop/cleanup.sh
index d82c3a314..c022c590c 100755
--- a/manifests/modules/aiml/chatbot/.workshop/cleanup.sh
+++ b/manifests/modules/aiml/chatbot/.workshop/cleanup.sh
@@ -2,43 +2,30 @@
 
 set -e
 
-logmessage "Deleting AIML resources..."
-
 logmessage "Deleting Gradio-UI Components..."
 
-kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/gradio --ignore-not-found=true
+kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/gradio --ignore-not-found
 
 logmessage "Deleting Llama2 pods..."
 
-kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin-rbac.yml --ignore-not-found
-kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin.yml --ignore-not-found
+kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/ray-service-llama2-chatbot --ignore-not-found
 
 logmessage "Deleting Neuron Device Plugin..."
 
-kubectl delete -k /eks-workshop/manifests/modules/aiml/chatbot/neuron-device-plugin --ignore-not-found=true
+kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin-rbac.yml --ignore-not-found
+kubectl delete -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/v2.19.1/src/k8/k8s-neuron-device-plugin.yml --ignore-not-found
 
 logmessage "Un-installing kuberay operator..."
 
-helm uninstall kuberay-operator
-
-kubectl delete namespace llama2 --ignore-not-found
-
-kubectl delete namespace gradio-llama2-inf2 --ignore-not-found
+helm uninstall kuberay-operator --ignore-not-found
 
-logmessage "Deleting Karpenter NodePool and EC2NodeClass..."
+logmessage "Deleting Karpenter resources..."
 
-delete-all-if-crd-exists nodepools.karpenter.sh
-delete-all-if-crd-exists ec2nodeclasses.karpenter.k8s.aws
+kubectl kustomize ~/environment/eks-workshop/modules/aiml/chatbot/nodepool \
+  | envsubst | kubectl delete -f-
 
-logmessage "Waiting for Karpenter nodes to be removed..."
+logmessage "Deleting llama2 and gradio-llama2-inf2 namespaces..."
 
-EXIT_CODE=0
-
-timeout --foreground -s TERM 30 bash -c \
-  'while [[ $(kubectl get nodes --selector=type=karpenter -o json | jq -r ".items | length") -gt 0 ]];\
-  do sleep 5;\
-  done' || EXIT_CODE=$?
+kubectl delete namespace llama2 --ignore-not-found
 
-if [ $EXIT_CODE -ne 0 ]; then
-  logmessage "Warning: Karpenter nodes did not clean up"
-fi
+kubectl delete namespace gradio-llama2-inf2 --ignore-not-found
diff --git a/manifests/modules/aiml/chatbot/.workshop/terraform/main.tf b/manifests/modules/aiml/chatbot/.workshop/terraform/main.tf
index 7587514f3..80765e959 100644
--- a/manifests/modules/aiml/chatbot/.workshop/terraform/main.tf
+++ b/manifests/modules/aiml/chatbot/.workshop/terraform/main.tf
@@ -1,3 +1,7 @@
+locals {
+  namespace = "kube-system"
+}
+
 terraform {
   required_providers {
     kubectl = {
@@ -16,29 +20,22 @@ data "aws_ecrpublic_authorization_token" "token" {
   provider = aws.virginia
 }
 
+# Addons for ALB Controller
+
 module "eks_blueprints_addons" {
   source  = "aws-ia/eks-blueprints-addons/aws"
   version = "1.16.3"
 
   enable_aws_load_balancer_controller = true
-  # turn off the mutating webhook for services because we are using
-  # retrieved from Data on EKS
   aws_load_balancer_controller = {
+    wait        = true
+    role_name   = "${var.addon_context.eks_cluster_id}-alb-controller"
+    policy_name = "${var.addon_context.eks_cluster_id}-alb-controller"
+    # turn off the mutating webhook
     set = [{
       name  = "enableServiceMutatorWebhook"
      value = "false"
     }]
-    wait = true
-  }
-
-  enable_karpenter = true
-
-  karpenter_enable_spot_termination          = true
-  karpenter_enable_instance_profile_creation = true
-  karpenter = {
-    chart_version       = var.karpenter_version
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
   }
 
   cluster_name = var.addon_context.eks_cluster_id
@@ -58,3 +55,71 @@ data "aws_subnets" "private" {
     values = ["*Private*"]
   }
 }
+
+# Pod identity for Karpenter
+
+resource "aws_eks_addon" "pod_identity" {
+  cluster_name                = var.addon_context.eks_cluster_id
+  addon_name                  = "eks-pod-identity-agent"
+  resolve_conflicts_on_create = "OVERWRITE"
+  preserve                    = false
+}
+
+# Karpenter controller & Node IAM roles, SQS Queue, Eventbridge Rules
+
+module "karpenter" {
+  source  = "terraform-aws-modules/eks/aws//modules/karpenter"
+  version = "~> 20.24"
+
+  cluster_name          = var.addon_context.eks_cluster_id
+  enable_v1_permissions = true
+  namespace             = local.namespace
+
+  iam_role_name                   = "${var.addon_context.eks_cluster_id}-karpenter-controller"
+  iam_role_use_name_prefix        = false
+  iam_policy_name                 = "${var.addon_context.eks_cluster_id}-karpenter-controller"
+  iam_policy_use_name_prefix      = false
+  node_iam_role_name              = "${var.addon_context.eks_cluster_id}-karpenter-node"
+  node_iam_role_use_name_prefix   = false
+  queue_name                      = "${var.addon_context.eks_cluster_id}-karpenter"
+  rule_name_prefix                = "eks-workshop"
+  create_pod_identity_association = true
+
+  tags = {
+    created-by = "eks-workshop-v2"
+    env        = var.addon_context.eks_cluster_id
+  }
+}
+
+# Helm chart
+
+resource "helm_release" "karpenter" {
+  name                = "karpenter"
+  namespace           = local.namespace
+  create_namespace    = true
+  repository          = "oci://public.ecr.aws/karpenter"
+  repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+  repository_password = data.aws_ecrpublic_authorization_token.token.password
+  chart               = "karpenter"
+  # renovate: datasource=github-releases depName=aws/karpenter-provider-aws
+  version = "1.0.2"
+  wait    = true
+
+  values = [
+    <<-EOT
+    settings:
+      clusterName: ${var.addon_context.eks_cluster_id}
+      clusterEndpoint: ${var.addon_context.aws_eks_cluster_endpoint}
+      interruptionQueue: ${module.karpenter.queue_name}
+    tolerations:
+      - key: CriticalAddonsOnly
+        operator: Exists
+    EOT
+  ]
+
+  lifecycle {
+    ignore_changes = [
+      repository_password
+    ]
+  }
+}
diff --git a/manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf b/manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf
index 7c1f1753f..06fffd04d 100644
--- a/manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf
+++ b/manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf
@@ -2,7 +2,7 @@ output "environment_variables" {
   description = "Environment variables to be added to the IDE shell"
   value = {
     AIML_SUBNETS        = "${data.aws_subnets.private.ids[0]},${data.aws_subnets.private.ids[1]},${data.aws_subnets.private.ids[2]}"
-    KARPENTER_NODE_ROLE = module.eks_blueprints_addons.karpenter.node_iam_role_name
-    KARPENTER_ARN       = module.eks_blueprints_addons.karpenter.node_iam_role_arn
+    KARPENTER_NODE_ROLE = module.karpenter.node_iam_role_name
+    KARPENTER_ARN       = module.karpenter.node_iam_role_arn
   }
 }
diff --git a/manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf b/manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf
index 8855aef74..812087dc5 100644
--- a/manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf
+++ b/manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf
@@ -33,10 +33,3 @@ variable "resources_precreated" {
   description = "Have expensive resources been created already"
   type        = bool
 }
-
-variable "karpenter_version" {
-  description = "The version of Karpenter chart to use"
-  type        = string
-  # renovate: datasource=github-releases depName=aws/karpenter-provider-aws
-  default     = "0.37.2"
-}
diff --git a/manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml b/manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml
index 5446c3fcb..01b2fb599 100644
--- a/manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml
+++ b/manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml
@@ -1,4 +1,4 @@
-apiVersion: karpenter.sh/v1beta1
+apiVersion: karpenter.sh/v1
 kind: NodePool
 metadata:
   name: inferentia-inf2
@@ -9,6 +9,10 @@ spec:
         instanceType: inferentia-inf2
         provisionerType: Karpenter
     spec:
+      taints:
+        - key: aws.amazon.com/neuron
+          value: "true"
+          effect: "NoSchedule"
       requirements:
         - key: "karpenter.k8s.aws/instance-family"
           operator: In
@@ -19,26 +23,27 @@ spec:
         - key: "karpenter.sh/capacity-type"
           operator: In
           values: ["on-demand", "spot"]
+      expireAfter: 720h
+      terminationGracePeriod: 24h
       nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
         name: inferentia-inf2
-      taints:
-        - key: aws.amazon.com/neuron
-          value: "true"
-          effect: "NoSchedule"
   limits:
     cpu: "512"
   disruption:
     consolidateAfter: 300s
-    consolidationPolicy: WhenEmpty
-    expireAfter: 720h
+    consolidationPolicy: WhenEmptyOrUnderutilized
 
 ---
-apiVersion: karpenter.k8s.aws/v1beta1
+apiVersion: karpenter.k8s.aws/v1
 kind: EC2NodeClass
 metadata:
   name: inferentia-inf2
 spec:
   amiFamily: AL2
+  amiSelectorTerms:
+    - alias: al2@latest
   blockDeviceMappings:
     - deviceName: /dev/xvda
       ebs:
diff --git a/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml b/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
index 5ad10d70f..41937cfaf 100644
--- a/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
+++ b/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
@@ -1,4 +1,4 @@
-apiVersion: karpenter.sh/v1beta1
+apiVersion: karpenter.sh/v1
 kind: NodePool
 metadata:
   name: x86-cpu-karpenter
@@ -21,22 +21,27 @@ spec:
         - key: "karpenter.sh/capacity-type"
           operator: In
           values: ["on-demand", "spot"]
+      expireAfter: 720h
+      terminationGracePeriod: 24h
       nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
         name: x86-cpu-karpenter
   limits:
     cpu: "256"
   disruption:
     consolidateAfter: 300s
-    consolidationPolicy: WhenEmpty
-    expireAfter: 720h
+    consolidationPolicy: WhenEmptyOrUnderutilized
 
 ---
-apiVersion: karpenter.k8s.aws/v1beta1
+apiVersion: karpenter.k8s.aws/v1
 kind: EC2NodeClass
 metadata:
   name: x86-cpu-karpenter
 spec:
   amiFamily: AL2
+  amiSelectorTerms:
+    - alias: al2@latest
   blockDeviceMappings:
     - deviceName: /dev/xvda
       ebs:
diff --git a/manifests/modules/aiml/inferentia/.workshop/cleanup.sh b/manifests/modules/aiml/inferentia/.workshop/cleanup.sh
index 03e67944e..02fd7062d 100644
--- a/manifests/modules/aiml/inferentia/.workshop/cleanup.sh
+++ b/manifests/modules/aiml/inferentia/.workshop/cleanup.sh
@@ -5,21 +5,3 @@ set -e
 logmessage "Deleting AIML resources..."
 
 kubectl delete namespace aiml --ignore-not-found
-
-logmessage "Deleting Karpenter NodePool and EC2NodeClass..."
-
-delete-all-if-crd-exists nodepools.karpenter.sh
-delete-all-if-crd-exists ec2nodeclasses.karpenter.k8s.aws
-
-logmessage "Waiting for Karpenter nodes to be removed..."
-
-EXIT_CODE=0
-
-timeout --foreground -s TERM 30 bash -c \
-  'while [[ $(kubectl get nodes --selector=type=karpenter -o json | jq -r ".items | length") -gt 0 ]];\
-  do sleep 5;\
-  done' || EXIT_CODE=$?
-
-if [ $EXIT_CODE -ne 0 ]; then
-  logmessage "Warning: Karpenter nodes did not clean up"
-fi
\ No newline at end of file
diff --git a/website/docs/aiml/chatbot/gradio.md b/website/docs/aiml/chatbot/gradio.md
index 3a02074c6..601fd80c4 100644
--- a/website/docs/aiml/chatbot/gradio.md
+++ b/website/docs/aiml/chatbot/gradio.md
@@ -46,7 +46,7 @@ kube-root-ca.crt                     1      111s
 
 Once the load balancer has finished deploying, use the external IP address to directly access the website:
 
-```bash
+```bash wait=10
 $ kubectl get services -n gradio-llama2-inf2
 NAME             TYPE           CLUSTER-IP     EXTERNAL-IP                                                                     PORT(S)        AGE
 gradio-service   LoadBalancer   172.20.84.26   k8s-gradioll-gradiose-a6d0b586ce-06885d584b38b400.elb.us-west-2.amazonaws.com   80:30802/TCP   8m42s
diff --git a/website/docs/aiml/chatbot/nodepool.md b/website/docs/aiml/chatbot/nodepool.md
index 3d3148b89..f6f59ce2f 100644
--- a/website/docs/aiml/chatbot/nodepool.md
+++ b/website/docs/aiml/chatbot/nodepool.md
@@ -12,20 +12,12 @@ To learn more about Karpenter, check out the [Karpenter module](../../autoscalin
 Karpenter has already been installed in our EKS Cluster and runs as a deployment:
 
 ```bash
-$ kubectl get deployment -n karpenter
+$ kubectl get deployment -n kube-system
 NAME        READY   UP-TO-DATE   AVAILABLE   AGE
+...
 karpenter   2/2     2            2           11m
 ```
 
-As we did in a previous lab, we need to update our EKS IAM mappings to allow Karpenter nodes to join the cluster:
-
-```bash
-$ eksctl create iamidentitymapping --cluster $EKS_CLUSTER_NAME \
-  --region $AWS_REGION --arn $KARPENTER_ARN \
-  --group system:bootstrappers --group system:nodes \
-  --username system:node:{{EC2PrivateDNSName}}
-```
-
 Since the Ray Cluster creates head and worker pods with different specifications for handling various EC2 families, we'll create two separate node pools to handle the workload demands.
 
 Here's the first Karpenter `NodePool` that will provision one `Head Pod` on `x86 CPU` instances:
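As a usage sketch of how the node pool manifests changed in this diff would be rolled out, the apply counterpart of the delete command in the updated `cleanup.sh` above might look as follows; the kustomization path and `envsubst` pattern are assumptions carried over from that script, not shown elsewhere in this change:

```bash
# Sketch (assumed workflow): render the Karpenter NodePool/EC2NodeClass
# manifests with environment variables substituted, then apply them.
# Mirrors the "kubectl kustomize ... | envsubst | kubectl delete -f-"
# command used by the updated cleanup script.
kubectl kustomize ~/environment/eks-workshop/modules/aiml/chatbot/nodepool \
  | envsubst | kubectl apply -f-
```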