Skip to content

Commit

Permalink
feat: Argo Workflow blueprint upgrade to latest Karpenter (#459)
Browse files Browse the repository at this point in the history
  • Loading branch information
ovaleanu authored Mar 3, 2024
1 parent 1e4b519 commit cde1440
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 189 deletions.
7 changes: 2 additions & 5 deletions schedulers/terraform/argo-workflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.72 |
| <a name="provider_aws.ecr"></a> [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
| <a name="provider_kubectl"></a> [kubectl](#provider\_kubectl) | >= 1.14 |
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 2.10 |
| <a name="provider_random"></a> [random](#provider\_random) | 3.3.2 |

Expand All @@ -30,8 +29,8 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| <a name="module_amp_ingest_irsa"></a> [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
| <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
| <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | 1.9.2 |
| <a name="module_eks_data_addons"></a> [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
| <a name="module_eks_data_addons"></a> [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.30 |
| <a name="module_irsa_argo_events"></a> [irsa\_argo\_events](#module\_irsa\_argo\_events) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
| <a name="module_s3_bucket"></a> [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
| <a name="module_spark_team_a_irsa"></a> [spark\_team\_a\_irsa](#module\_spark\_team\_a\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
Expand All @@ -50,7 +49,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_s3_object.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_object) | resource |
| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource |
| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
| [kubernetes_cluster_role.spark_argowf_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
| [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
Expand All @@ -75,7 +73,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source |
| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source |

## Inputs

Expand Down
150 changes: 123 additions & 27 deletions schedulers/terraform/argo-workflow/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ module "ebs_csi_driver_irsa" {
#---------------------------------------------------------------
module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
version = "1.9.2"
version = "~> 1.2"


cluster_name = module.eks.cluster_name
Expand Down Expand Up @@ -100,24 +100,30 @@ module "eks_blueprints_addons" {
description = "Cluster Proportional Autoscaler for CoreDNS Service"
}

#---------------------------------------
# Metrics Server
#---------------------------------------
enable_metrics_server = true
metrics_server = {
values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
}

#---------------------------------------
# Karpenter Autoscaler for EKS Cluster
#---------------------------------------
enable_karpenter = true
karpenter_enable_spot_termination = true
karpenter_node = {
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
}
karpenter = {
chart_version = "v0.34.0"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}

#---------------------------------------
# Metrics Server
#---------------------------------------
enable_metrics_server = true
metrics_server = {
values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
}

#---------------------------------------
# AWS for FluentBit - DaemonSet
#---------------------------------------
Expand Down Expand Up @@ -199,7 +205,7 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
module "eks_data_addons" {
source = "aws-ia/eks-data-addons/aws"
version = "~> 1.0" # ensure to update this to the latest/desired version
version = "~> 1.30" # ensure to update this to the latest/desired version

oidc_provider_arn = module.eks.oidc_provider_arn

Expand Down Expand Up @@ -234,26 +240,116 @@ module "eks_data_addons" {
]
}

}

#---------------------------------------
# Karpenter Provisioners
#---------------------------------------
data "kubectl_path_documents" "karpenter_provisioners" {
pattern = "${path.module}/karpenter-provisioners/spark-*.yaml"
vars = {
azs = local.region
eks_cluster_id = module.eks.cluster_name
#---------------------------------------
# Karpenter Autoscaler for EKS Cluster
#---------------------------------------
enable_karpenter_resources = true
karpenter_resources_helm_config = {
spark-compute-optimized = {
values = [
<<-EOT
name: spark-compute-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
instanceStorePolicy: RAID0
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkComputeOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["c"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["c5d"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "36"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 100
EOT
]
}
spark-graviton-compute-optimized = {
values = [
<<-EOT
name: spark-graviton-compute-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
instanceStorePolicy: RAID0
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkGravitonComputeOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["arm64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["c"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["c7gd"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "32"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 50
EOT
]
}
}
}

resource "kubectl_manifest" "karpenter_provisioner" {
for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
yaml_body = each.value

depends_on = [module.eks_blueprints_addons]
}

#tfsec:ignore:*
module "s3_bucket" {
source = "terraform-aws-modules/s3-bucket/aws"
Expand Down

This file was deleted.

8 changes: 0 additions & 8 deletions schedulers/terraform/argo-workflow/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@ provider "helm" {
}
}

provider "kubectl" {
apply_retry_count = 30
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
load_config_file = false
token = data.aws_eks_cluster_auth.this.token
}

data "aws_eks_cluster_auth" "this" {
name = module.eks.cluster_name
}
Expand Down
Loading

0 comments on commit cde1440

Please sign in to comment.