-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from KubedAI/infra
feat: Added Terraform templates
- Loading branch information
Showing
14 changed files
with
492 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,4 +24,4 @@ jobs: | |
- name: Install dependencies | ||
run: npm ci | ||
- name: Test build website | ||
run: npm run build | ||
run: npm run build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,5 +56,3 @@ site | |
|
||
# node modules | ||
node_modules | ||
website/package-lock.json | ||
website/package.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# pre-commit configuration: spell checking, YAML formatting, generic repository
# hygiene checks, and Terraform formatting/docs/lint/validate hooks.
repos:
  # Spell check. Each `--exclude` pair removes one path/glob from the scan.
  # NOTE(review): several excluded paths (ADOPTERS.md, patterns/...) look inherited
  # from another repository - confirm they are still needed here.
  - repo: https://github.com/streetsidesoftware/cspell-cli
    rev: v8.13.3
    hooks:
      - id: cspell
        args:
          - --exclude
          - 'ADOPTERS.md'
          - --exclude
          - '.pre-commit-config.yaml'
          - --exclude
          - '.gitignore'
          - --exclude
          - '*.drawio'
          - --exclude
          - 'mkdocs.yml'
          - --exclude
          - '.helmignore'
          - --exclude
          - '.github/workflows/*'
          - --exclude
          - 'patterns/istio-multi-cluster/*'
          - --exclude
          - 'patterns/blue-green-upgrade/*'
          - --exclude
          - '/patterns/vpc-lattice/cross-cluster-pod-communication/*'
          - --exclude
          - 'patterns/bottlerocket/*'
          - --exclude
          - 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh'

  # YAML formatter (auto-fixes files in place; 2-space indent and offset, quotes preserved)
  - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
    rev: v2.14.0
    hooks:
      - id: pretty-format-yaml
        args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes]

  # Generic hygiene and secret-leak checks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-merge-conflict
      - id: detect-private-key
      - id: detect-aws-credentials
        args: [--allow-missing-credentials]

  # Terraform formatting, documentation, linting and validation
  - repo: https://github.com/antonbabenko/pre-commit-terraform
    rev: v1.96.1
    hooks:
      - id: terraform_fmt
      - id: terraform_docs
        args:
          - --args=--lockfile=false
      # tflint is restricted to the explicitly listed rules below
      - id: terraform_tflint
        args:
          - --args=--only=terraform_deprecated_interpolation
          - --args=--only=terraform_deprecated_index
          - --args=--only=terraform_unused_declarations
          - --args=--only=terraform_comment_syntax
          - --args=--only=terraform_documented_outputs
          - --args=--only=terraform_documented_variables
          - --args=--only=terraform_typed_variables
          - --args=--only=terraform_module_pinned_source
          - --args=--only=terraform_naming_convention
          - --args=--only=terraform_required_version
          - --args=--only=terraform_required_providers
          - --args=--only=terraform_unused_required_providers
          - --args=--only=terraform_workspace_remote
      - id: terraform_validate
        exclude: (docs|modules)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
|
||
# ---------------------------------------------------------------
# EKS cluster
# ---------------------------------------------------------------
# Creates the EKS control plane, core add-ons, additional security group rules
# and one managed node group ("system_node_group") for system add-ons.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.24"

  cluster_name    = local.name
  cluster_version = var.eks_cluster_version

  # EKS Addons (module defaults are used for each add-on)
  cluster_addons = {
    coredns                = {}
    eks-pod-identity-agent = {}
    kube-proxy             = {}
    vpc-cni                = {}
  }

  # Give the Terraform identity admin access to the cluster
  # which will allow it to deploy resources into the cluster
  enable_cluster_creator_admin_permissions = true
  # NOTE: the cluster API endpoint is publicly reachable; restrict if your security requirements demand it
  cluster_endpoint_public_access = true
  access_entries                 = var.access_entries

  vpc_id = module.vpc.vpc_id
  # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
    substr(cidr_block, 0, 4) == "100." ? subnet_id : null
  ])

  # Combine root account, current user/role and additional roles to be able to access the cluster KMS key - required for terraform updates
  kms_key_administrators = distinct(concat(
    ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"],
    var.kms_key_admin_roles,
    [data.aws_iam_session_context.current.issuer_arn]
  ))

  #---------------------------------------
  # Note: This can be further restricted to the specific ports required by each Add-on and your application
  #---------------------------------------
  # Extend cluster security group rules
  cluster_security_group_additional_rules = {
    ingress_nodes_ephemeral_ports_tcp = {
      description                = "Nodes on ephemeral ports"
      protocol                   = "tcp"
      from_port                  = 0
      to_port                    = 65535
      type                       = "ingress"
      source_node_security_group = true
    }
  }

  # Extend node security group rules
  node_security_group_additional_rules = {
    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
    # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
    # Update this according to your security requirements if needed
    ingress_cluster_to_node_all_traffic = {
      description                   = "Cluster API to Nodegroup all traffic"
      protocol                      = "-1"
      from_port                     = 0
      to_port                       = 0
      type                          = "ingress"
      source_cluster_security_group = true
    }
  }

  eks_managed_node_group_defaults = {
    iam_role_additional_policies = {
      # Not required, but used in the example to access the nodes to inspect mounted volumes
      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    }

    ebs_optimized = true
    # This block device is used only for root volume. Adjust volume according to your size.
    # NOTE: Don't use this volume for ML workloads
    block_device_mappings = {
      xvda = {
        device_name = "/dev/xvda"
        ebs = {
          volume_size = 100
          volume_type = "gp3"
        }
      }
    }
  }

  # Every key in this map is treated as a node group definition, so only node groups belong here.
  eks_managed_node_groups = {
    # It's recommended to have a Managed Node group for hosting critical add-ons
    # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
    # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
    system_node_group = {
      name        = "system-node-group"
      description = "EKS Core node group for hosting system add-ons"
      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
        substr(cidr_block, 0, 4) == "100." ? subnet_id : null
      ])

      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/${var.eks_cluster_version}/amazon-linux-2023/x86_64/standard/recommended/release_version --region us-west-2
      ami_type     = "AL2023_x86_64_STANDARD" # Use this for Graviton AL2023_ARM_64_STANDARD
      min_size     = 2
      max_size     = 8
      desired_size = 2

      instance_types = ["m6i.large"]

      labels = {
        NodeGroupType = "system-nodegroup"
      }

      tags = merge(local.tags, {
        Name = "system-nodegroup"
      })
    }
  }

  # Previously this tag map was nested inside `eks_managed_node_groups`, where it was
  # interpreted as an extra node group named "tags". It is applied to the node security
  # group instead, which is what Karpenter discovery needs.
  node_security_group_tags = merge(local.tags, {
    # NOTE - if creating multiple security groups with this module, only tag the
    # security group that Karpenter should utilize with the following tag
    # (i.e. - at most, only one security group should have this tag in your account)
    "karpenter.sh/discovery" = local.name
  })

  # Project-wide tags for all resources created by the module
  tags = local.tags
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
# ---------------------------------------------------------------
# AWS Provider Configuration
# ---------------------------------------------------------------
# Default AWS provider. All resources without an explicit provider are created
# in 'local.region'.
provider "aws" {
  region = local.region
}

# Aliased AWS provider ("aws.ecr") pinned to 'us-east-1'.
# ECR public authentication is only served from 'us-east-1', hence the hardcoded region.
# When the main region is already 'us-east-1' this alias is redundant and the
# default provider can be used instead.
provider "aws" {
  alias  = "ecr"
  region = "us-east-1"
}
|
||
# ---------------------------------------------------------------
# Helm Provider Configuration
# ---------------------------------------------------------------
# The Helm provider is used to manage Kubernetes applications, relying on the EKS cluster.
provider "helm" {
  kubernetes {
    # The EKS cluster API endpoint and certificate are retrieved from the EKS module.
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)

    exec {
      # Retrieves an authentication token for Kubernetes API using the AWS CLI.
      api_version = "client.authentication.k8s.io/v1beta1"
      command     = "aws"
      # Pass the region explicitly so the token request does not depend on the
      # default region of the local AWS CLI profile/environment.
      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name, "--region", local.region]
      # Note: The AWS CLI must be installed locally where Terraform is executed.
    }
  }
}
|
||
# ---------------------------------------------------------------
# Local Variables
# ---------------------------------------------------------------
# Shared values referenced across the configuration: name, region, AZs and tags.
locals {
  # Region and name are taken straight from the input variables.
  region = var.region
  name   = var.name

  # Only the first two availability zones returned by the data source are used.
  azs = slice(data.aws_availability_zones.available.names, 0, 2)

  # Common tags pointing back to the GitHub repository of this project.
  tags = {
    GithubRepo = "https://github.com/KubedAI/spark-rapids-on-kubernetes"
  }
}
|
||
# --------------------------------------------------------------- | ||
# AWS Data Sources | ||
# --------------------------------------------------------------- | ||
# Data sources used to retrieve AWS-specific information such as current identity, region, and session context. | ||
|
||
# EKS cluster authentication data | ||
# data "aws_eks_cluster_auth" "this" { | ||
# name = module.eks.cluster_name | ||
# } | ||
|
||
# Retrieves an authorization token for public ECR registry to authenticate image pulls. | ||
# data "aws_ecrpublic_authorization_token" "token" { | ||
# provider = aws.ecr | ||
# } | ||
|
||
# Retrieves the available AWS availability zones in the selected region.
data "aws_availability_zones" "available" {
  # Do not include Local Zones / Wavelength Zones the account may have opted in to;
  # 'local.azs' takes the first two names and must only see regular availability zones.
  filter {
    name   = "opt-in-status"
    values = ["opt-in-not-required"]
  }
}
|
||
# Retrieves the current AWS region. | ||
# data "aws_region" "current" {} | ||
|
||
# Account ID and ARN of the identity Terraform is currently running as.
data "aws_caller_identity" "current" {}

# Retrieves the current AWS partition (useful for AWS GovCloud or China regions).
# data "aws_partition" "current" {}

# IAM session context resolved from the caller ARN above (exposes, among others,
# the ARN of the underlying user/role behind the current session).
data "aws_iam_session_context" "current" {
  arn = data.aws_caller_identity.current.arn
}
|
||
# --------------------------------------------------------------- | ||
# IAM Policy Document for Spark Operator | ||
# --------------------------------------------------------------- | ||
# This IAM policy document allows the Spark operator to interact with S3 and CloudWatch Logs for logging and object storage. | ||
|
||
# Policy granting permissions for S3 operations required by Spark jobs. | ||
# data "aws_iam_policy_document" "spark_operator" { | ||
# statement { | ||
# sid = "AllowS3AccessForSparkJobs" | ||
# effect = "Allow" | ||
# # Grants access to all S3 resources in the current AWS partition. | ||
# resources = ["arn:${data.aws_partition.current.partition}:s3:::*"] | ||
|
||
# actions = [ | ||
# "s3:DeleteObject", | ||
# "s3:DeleteObjectVersion", | ||
# "s3:GetObject", | ||
# "s3:ListBucket", | ||
# "s3:PutObject", | ||
# ] | ||
# } | ||
|
||
# # Policy granting permissions for CloudWatch Logs operations. | ||
# statement { | ||
# sid = "AllowCloudWatchLogsAccessForSpark" | ||
# effect = "Allow" | ||
# # Grants access to all CloudWatch Log Groups in the current AWS region and account. | ||
# resources = [ | ||
# "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*" | ||
# ] | ||
|
||
# actions = [ | ||
# "logs:CreateLogGroup", | ||
# "logs:CreateLogStream", | ||
# "logs:DescribeLogGroups", | ||
# "logs:DescribeLogStreams", | ||
# "logs:PutLogEvents", | ||
# ] | ||
# } | ||
# } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Ready-to-run AWS CLI command that adds the new cluster to the local kubeconfig.
output "configure_kubectl" {
  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
}
Oops, something went wrong.