Skip to content

Commit

Permalink
feat: Update karpenter self-managed-airflow to use karpenter_resource…
Browse files Browse the repository at this point in the history
…s chart (#426)
  • Loading branch information
lusoal authored Feb 7, 2024
1 parent be5ade7 commit c9f4435
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 300 deletions.
164 changes: 0 additions & 164 deletions ai-ml/trainium-inferentia/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -432,167 +432,3 @@ resource "kubectl_manifest" "mpi_operator" {
yaml_body = each.value
depends_on = [module.eks.eks_cluster_id]
}

#---------------------------------------------------------------
# Create a Launch Template Userdata for Trainium and use it in Karpenter (deprecated).
# This section of the pattern is commented out because newer Karpenter versions no longer support LaunchTemplates.
# See full change list https://github.com/aws/karpenter-provider-aws/blob/d1d1371ae2e1552b8fdded7d343bf24ea18bee31/designs/v1beta1-full-changelist.md#remove-speclaunchtemplate
#---------------------------------------------------------------
# data "cloudinit_config" "trn1_lt" {
# base64_encode = true
# gzip = false
# boundary = "//"

# # Prepend to existing user data supplied by AWS EKS
# part {
# content_type = "text/x-shellscript"
# content = <<-EOT
# cat <<-EOF > /etc/profile.d/bootstrap.sh
# #!/bin/sh

# # Configure NVMe volumes in RAID0 configuration
# # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126
# # Mount will be: /mnt/k8s-disks
# export LOCAL_DISKS='raid0'

# # Install Neuron monitoring tools
# yum install aws-neuronx-tools-2.* -y
# export PATH=/opt/aws/neuron/bin:$PATH

# # EFA Setup for Trainium and Inferentia
# export FI_EFA_USE_DEVICE_RDMA=1
# export FI_PROVIDER=efa
# export FI_EFA_FORK_SAFE=1

# curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
# tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer
# ./efa_installer.sh -y -g
# /opt/amazon/efa/bin/fi_info -p efa
# EOF

# # Source extra environment variables in bootstrap script
# sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh

# # Bootstrap the node
# B64_CLUSTER_CA=${module.eks.cluster_certificate_authority_data}
# API_SERVER_URL=${module.eks.cluster_endpoint}
# /etc/eks/bootstrap.sh ${local.name} --kubelet-extra-args "--node-labels=eks.amazonaws.com/nodegroup-image=${data.aws_ami.eks_gpu.id}" --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL

# EOT
# }
# }

#---------------------------------------------------------------
# This Terraform code defines a data block to fetch the most recent Amazon Machine Image (AMI)
# for an Amazon Elastic Kubernetes Service (EKS) cluster with GPU support.
#---------------------------------------------------------------
# data "aws_ami" "eks_gpu" {
# owners = ["amazon"]
# most_recent = true

# filter {
# name = "name"
# values = ["amazon-eks-gpu-node-${var.eks_cluster_version}-*"]
# }
# }


# locals {
# karpenter_trn1_32xl_lt_name = format("%s-trn132xl-lt", local.name)
# }

#---------------------------------------------------------------
# AWS Launch Template Configuration for Karpenter Trn1.32xlarge Instances
#---------------------------------------------------------------
# resource "aws_launch_template" "trn1_lt" {
# name = local.karpenter_trn1_32xl_lt_name
# description = "Karpenter Trn1.32xlarge Launch Template"

# user_data = data.cloudinit_config.trn1_lt.rendered

# ebs_optimized = true

# image_id = data.aws_ami.eks_gpu.id

# iam_instance_profile {
# name = module.eks_blueprints_addons.karpenter.node_instance_profile_name
# }

# # Commented out, but kept for visibility so this feature can be implemented in the future
# # placement {
# # tenancy = "default"
# # availability_zone = "${local.region}d"
# # group_name = local.karpenter_trn1_32xl_lt_name
# # }

# metadata_options {
# http_endpoint = "enabled"
# http_tokens = "required"
# http_put_response_hop_limit = 2
# }

# block_device_mappings {
# device_name = "/dev/xvda"
# ebs {
# volume_size = 100
# delete_on_termination = true
# volume_type = "gp3"
# }
# }

# monitoring {
# enabled = true
# }

# tag_specifications {
# resource_type = "instance"

# tags = merge(local.tags, {
# "karpenter.sh/discovery" = local.name
# })
# }

# # First network interface with device_index=0 and network_card_index=0
# network_interfaces {
# device_index = 0
# network_card_index = 0
# associate_public_ip_address = false
# interface_type = "efa"
# delete_on_termination = true
# security_groups = [module.eks.node_security_group_id]
# description = "Karpenter EFA config for Trainium"
# }

# # Additional network interfaces with device_index=1 and network_card_index ranging from 1 to 7
# dynamic "network_interfaces" {
# for_each = range(1, 8) # Create 7 additional network interfaces
# content {
# device_index = 1
# network_card_index = network_interfaces.value
# associate_public_ip_address = false
# interface_type = "efa"
# delete_on_termination = true
# security_groups = [module.eks.node_security_group_id]
# description = "Karpenter EFA config for Trainium"
# }
# }
# }

# #---------------------------------------
# # Karpenter Provisioners
# #---------------------------------------
# data "kubectl_path_documents" "karpenter_provisioners" {
# pattern = "${path.module}/karpenter-provisioners/karpenter-*.yaml"
# vars = {
# azs = local.region
# eks_cluster_id = local.name
# launch_template_name = local.karpenter_trn1_32xl_lt_name
# }
# }

# resource "kubectl_manifest" "karpenter_provisioner" {
# for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
# yaml_body = each.value

# depends_on = [module.eks_blueprints_addons]
# }
2 changes: 1 addition & 1 deletion ai-ml/trainium-inferentia/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ locals {
Blueprint = local.name
GithubRepo = "github.com/awslabs/data-on-eks"
}
}
}
8 changes: 3 additions & 5 deletions schedulers/terraform/self-managed-airflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| <a name="module_db"></a> [db](#module\_db) | terraform-aws-modules/rds/aws | ~> 5.0 |
| <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
| <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.0 |
| <a name="module_eks_data_addons"></a> [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
| <a name="module_eks_data_addons"></a> [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.2.9 |
| <a name="module_fluentbit_s3_bucket"></a> [fluentbit\_s3\_bucket](#module\_fluentbit\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
| <a name="module_security_group"></a> [security\_group](#module\_security\_group) | terraform-aws-modules/security-group/aws | ~> 5.0 |
| <a name="module_spark_logs_s3_bucket"></a> [spark\_logs\_s3\_bucket](#module\_spark\_logs\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
Expand Down Expand Up @@ -66,7 +66,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [kubectl_manifest.airflow_webserver](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubectl_manifest.efs_pvc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubectl_manifest.efs_sc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
| [kubernetes_cluster_role_binding.airflow_worker_spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource |
| [kubernetes_cluster_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource |
Expand All @@ -93,7 +92,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source |
| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source |

## Inputs

Expand All @@ -108,7 +106,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"self-managed-airflow"` | no |
| <a name="input_private_subnets"></a> [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` | <pre>[<br> "10.0.1.0/24",<br> "10.0.2.0/24"<br>]</pre> | no |
| <a name="input_public_subnets"></a> [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` | <pre>[<br> "10.0.0.0/26",<br> "10.0.0.64/26"<br>]</pre> | no |
| <a name="input_region"></a> [region](#input\_region) | Region | `string` | `"eu-west-1"` | no |
| <a name="input_region"></a> [region](#input\_region) | Region | `string` | `"us-west-2"` | no |
| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br> "100.64.0.0/16"<br>]</pre> | no |
| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.0.0.0/16"` | no |

Expand Down
111 changes: 91 additions & 20 deletions schedulers/terraform/self-managed-airflow/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ module "ebs_csi_driver_irsa" {
module "eks_blueprints_addons" {
# Short commit hash from 8th May using git rev-parse --short HEAD
source = "aws-ia/eks-blueprints-addons/aws"
version = "~> 1.0"
version = "~> 1.2"

cluster_name = module.eks.cluster_name
cluster_endpoint = module.eks.cluster_endpoint
Expand Down Expand Up @@ -79,7 +79,13 @@ module "eks_blueprints_addons" {
#---------------------------------------
enable_karpenter = true
karpenter_enable_spot_termination = true
karpenter_node = {
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
}
karpenter = {
chart_version = "v0.34.0"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}
Expand Down Expand Up @@ -166,7 +172,7 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
module "eks_data_addons" {
source = "aws-ia/eks-data-addons/aws"
version = "~> 1.0" # ensure to update this to the latest/desired version
version = "~> 1.2.9" # ensure to update this to the latest/desired version

oidc_provider_arn = module.eks.oidc_provider_arn

Expand Down Expand Up @@ -237,6 +243,89 @@ module "eks_data_addons" {
EOT
]
}
enable_karpenter_resources = true
karpenter_resources_helm_config = {
spark-compute-optimized = {
values = [
<<-EOT
name: spark-compute-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
userData: |
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"
--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"
#!/bin/bash
echo "Running a custom user data script"
set -ex
yum install mdadm -y
DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}')
DISK_ARRAY=()
for DEV in $DEVICES
do
DISK_ARRAY+=("/dev/$${DEV}")
done
DISK_COUNT=$${#DISK_ARRAY[@]}
if [ $${DISK_COUNT} -eq 0 ]; then
echo "No SSD disks available. No further action needed."
else
if [ $${DISK_COUNT} -eq 1 ]; then
TARGET_DEV=$${DISK_ARRAY[0]}
mkfs.xfs $${TARGET_DEV}
else
mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]}
mkfs.xfs /dev/md0
TARGET_DEV=/dev/md0
fi
mkdir -p /local1
echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab
mount -a
/usr/bin/chown -hR +999:+1000 /local1
fi
--BOUNDARY--
nodePool:
labels:
- provisioner: spark-compute-optimized
- NodeGroupType: SparkComputeOptimized
- type: karpenter
taints:
- key: spark-compute-optimized
value: 'true'
effect: NoSchedule
requirements:
- key: "topology.kubernetes.io/zone"
operator: In
values: [${local.region}a]
- key: "node.kubernetes.io/instance-type"
operator: In
values: ["c5d.large","c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
EOT
]
}
}
}

#---------------------------------------------------------------
Expand Down Expand Up @@ -285,21 +374,3 @@ module "fluentbit_s3_bucket" {

tags = local.tags
}

#---------------------------------------
# Karpenter Provisioners for workloads
#---------------------------------------
# Collects every *.yaml file under the karpenter-provisioners/ directory of this
# module and exposes them as documents, passing the variables below to the files.
data "kubectl_path_documents" "karpenter_provisioners" {
pattern = "${path.module}/karpenter-provisioners/*.yaml"
vars = {
# NOTE(review): the variable is named "azs" but is fed from local.region — confirm the YAML templates expect a region here.
azs = local.region
eks_cluster_id = module.eks.cluster_name
}
}

# Creates one kubectl_manifest per document returned by the
# karpenter_provisioners data source (de-duplicated via toset).
resource "kubectl_manifest" "karpenter_provisioner" {
for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
yaml_body = each.value

# Apply only after the eks_blueprints_addons module has been created.
depends_on = [module.eks_blueprints_addons]
}
2 changes: 1 addition & 1 deletion schedulers/terraform/self-managed-airflow/airflow-core.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module "db" {
identifier = local.airflow_name

engine = "postgres"
engine_version = "14.3"
engine_version = "14.10"
family = "postgres14"
major_engine_version = "14"
instance_class = "db.m6i.xlarge"
Expand Down
Loading

0 comments on commit c9f4435

Please sign in to comment.