feat: MLflow blueprint upgrade to latest Karpenter #464

Merged · 1 commit · Mar 13, 2024
128 changes: 110 additions & 18 deletions ai-ml/mlflow/addons.tf
@@ -104,7 +104,13 @@ module "eks_blueprints_addons" {
#---------------------------------------
enable_karpenter = true
karpenter_enable_spot_termination = true
karpenter_node = {
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
}
karpenter = {
chart_version = "v0.34.0"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}
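Note (not part of the diff): the chart credentials above reference data.aws_ecrpublic_authorization_token.token, since the Karpenter chart is pulled from ECR Public. A minimal sketch of that data source is below, assuming an aliased us-east-1 provider (ECR Public authorization tokens are only issued in us-east-1); the alias name "ecr_public" is illustrative, not necessarily what this blueprint uses:

provider "aws" {
  alias  = "ecr_public"
  region = "us-east-1" # ECR Public authorization tokens are only issued in us-east-1
}

data "aws_ecrpublic_authorization_token" "token" {
  provider = aws.ecr_public
}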
@@ -168,7 +174,7 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
module "eks_data_addons" {
source = "aws-ia/eks-data-addons/aws"
version = "~> 1.2.3" # ensure to update this to the latest/desired version
version = "~> 1.3" # ensure to update this to the latest/desired version

oidc_provider_arn = module.eks.oidc_provider_arn

@@ -201,6 +207,109 @@ module "eks_data_addons" {
values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})]
}

#---------------------------------------
# Deploying Karpenter resources (NodePool and NodeClass) with the Helm chart
#---------------------------------------
enable_karpenter_resources = true
# We use index 2 to select the subnet in AZ1 with the 100.x CIDR
# (module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]);
# a CIDR-based alternative is sketched after this module block.
karpenter_resources_helm_config = {
gpu-g5 = {
values = [
<<-EOT
name: gpu-g5
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
id: ${module.vpc.private_subnets[2]}
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
blockDevice:
deviceName: /dev/xvda
volumeSize: 500Gi
volumeType: gp3
encrypted: true
deleteOnTermination: true
nodePool:
labels:
- instanceType: gp5
- provisionerType: Karpenter
taints:
- key: nvidia.com/gpu
operator: "Exists"
effect: "NoSchedule"
requirements:
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["g5"]
- key: "karpenter.k8s.aws/instance-size"
operator: In
values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["on-demand"]
limits:
cpu: 1000
amiFamily: Ubuntu
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 100
EOT
]
}
default = {
values = [
<<-EOT
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
id: ${module.vpc.private_subnets[2]}
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
blockDevice:
deviceName: /dev/xvda
volumeSize: 200Gi
volumeType: gp3
encrypted: true
deleteOnTermination: true
nodePool:
labels:
- instanceType: mixed-x86
- provisionerType: Karpenter
- workload: mlflow
requirements:
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["c5", "m5", "r5"]
- key: "karpenter.k8s.aws/instance-size"
operator: In
values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["on-demand"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 100
EOT
]
}
}
}
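Note (not part of the diff): hard-coding index 2 in the subnetSelectorTerms above assumes the VPC module always returns the private subnets in that order. A sketch of selecting the secondary-CIDR (100.x) subnet by CIDR prefix instead, mirroring the filter this PR removes from eks.tf below; the local name "secondary_private_subnet_id" is illustrative:

locals {
  # Keep only private subnets whose CIDR starts with "100.", then take the first match.
  secondary_private_subnet_id = compact([
    for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
    substr(cidr_block, 0, 4) == "100." ? subnet_id : null
  ])[0]
}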

#---------------------------------------------------------------
@@ -282,23 +391,6 @@ module "fluentbit_s3_bucket" {
tags = local.tags
}

#---------------------------------------
# Karpenter Provisioners for workloads
#---------------------------------------
data "kubectl_path_documents" "karpenter_provisioners" {
pattern = "${path.module}/karpenter-provisioners/*.yaml"
vars = {
cluster_name = module.eks.cluster_name
}
}

resource "kubectl_manifest" "karpenter_provisioner" {
for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
yaml_body = each.value

depends_on = [module.eks_blueprints_addons]
}

#---------------------------------------------------------------
# GP3 Encrypted Storage Class
#---------------------------------------------------------------
35 changes: 0 additions & 35 deletions ai-ml/mlflow/eks.tf
@@ -114,40 +114,5 @@ module "eks" {
"karpenter.sh/discovery" = local.name
})
}

gpu1 = {
name = "gpu-node-grp"
description = "EKS Node Group to run GPU workloads"
# Filtering only Secondary CIDR private subnets starting with "100.".
# Subnet IDs where the nodes/node groups will be provisioned
subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
)

ami_type = "AL2_x86_64_GPU"
min_size = 0
max_size = 1
desired_size = 0

instance_types = ["g5.12xlarge"]

labels = {
WorkerType = "ON_DEMAND"
NodeGroupType = "gpu"
}

taints = {
gpu = {
key = "nvidia.com/gpu"
effect = "NO_SCHEDULE"
operator = "EXISTS"
}
}

tags = merge(local.tags, {
Name = "gpu-node-grp",
"karpenter.sh/discovery" = local.name
})
}
}
}
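Note (not part of the diff): with the static GPU managed node group removed, GPU workloads now rely on the "gpu-g5" Karpenter NodePool added in addons.tf. Below is a sketch of a pod that would trigger that NodePool, written as a kubectl_manifest resource like the ones this PR removes; the pod name, image, and depends_on target are illustrative only:

resource "kubectl_manifest" "gpu_smoke_test" {
  yaml_body = <<-YAML
    apiVersion: v1
    kind: Pod
    metadata:
      name: gpu-smoke-test
    spec:
      restartPolicy: Never
      tolerations:
        # Matches the nvidia.com/gpu taint defined on the gpu-g5 NodePool.
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: cuda
          image: nvidia/cuda:12.3.1-base-ubuntu22.04 # illustrative image
          command: ["nvidia-smi"]
          resources:
            limits:
              nvidia.com/gpu: 1
  YAML

  depends_on = [module.eks_data_addons]
}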