Merge pull request #1 from KubedAI/infra
feat: Added Terraform templates
vara-bonthu authored Oct 5, 2024
2 parents dd5c8ea + 40547c0 commit a301b0a
Showing 14 changed files with 492 additions and 10 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/codespell.yaml
@@ -1,4 +1,3 @@
----
name: Codespell

on:
@@ -24,4 +23,4 @@ jobs:
check_filenames: true
# When using this Action in other repos, the --skip option below can be removed
skip: "*.excalidraw,*.git,*.png,*.jpg,*.svg,go.mod,go.sum"
-continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction
\ No newline at end of file
+continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction
6 changes: 3 additions & 3 deletions .github/workflows/dependabot-automerge.yml
@@ -2,8 +2,8 @@ name: Dependabot auto-merge

on:
pull_request_target:
-branches: [ main ]
-types: [ opened ]
+branches: [main]
+types: [opened]

permissions:
pull-requests: write
@@ -18,4 +18,4 @@ jobs:
run: gh pr merge --auto --merge "$PR_URL"
env:
PR_URL: ${{github.event.pull_request.html_url}}
-GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
\ No newline at end of file
+GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
2 changes: 1 addition & 1 deletion .github/workflows/website-deploy.yml
@@ -41,4 +41,4 @@ jobs:
# The GH actions bot is used by default if you didn't specify the two fields.
# You can swap them out with your own user credentials.
user_name: github-actions[bot]
-user_email: github-actions[bot]@users.noreply.github.com
\ No newline at end of file
+user_email: github-actions[bot]@users.noreply.github.com
2 changes: 1 addition & 1 deletion .github/workflows/website-test-deploy.yaml
@@ -24,4 +24,4 @@ jobs:
- name: Install dependencies
run: npm ci
- name: Test build website
-run: npm run build
\ No newline at end of file
+run: npm run build
2 changes: 0 additions & 2 deletions .gitignore
@@ -56,5 +56,3 @@ site

# node modules
node_modules
-website/package-lock.json
-website/package.json
44 changes: 44 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,44 @@
repos:
- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v8.13.3
hooks:
- id: cspell
args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh']
- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
rev: v2.14.0
hooks:
- id: pretty-format-yaml
args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-merge-conflict
- id: detect-private-key
- id: detect-aws-credentials
args: [--allow-missing-credentials]
- repo: https://github.com/antonbabenko/pre-commit-terraform
rev: v1.96.1
hooks:
- id: terraform_fmt
- id: terraform_docs
args:
- --args=--lockfile=false
- id: terraform_tflint
args:
- --args=--only=terraform_deprecated_interpolation
- --args=--only=terraform_deprecated_index
- --args=--only=terraform_unused_declarations
- --args=--only=terraform_comment_syntax
- --args=--only=terraform_documented_outputs
- --args=--only=terraform_documented_variables
- --args=--only=terraform_typed_variables
- --args=--only=terraform_module_pinned_source
- --args=--only=terraform_naming_convention
- --args=--only=terraform_required_version
- --args=--only=terraform_required_providers
- --args=--only=terraform_unused_required_providers
- --args=--only=terraform_workspace_remote
- id: terraform_validate
exclude: (docs|modules)
122 changes: 122 additions & 0 deletions infra/aws/terraform/eks.tf
@@ -0,0 +1,122 @@

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.24"

cluster_name = local.name
cluster_version = var.eks_cluster_version

# EKS Addons
cluster_addons = {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
}

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
enable_cluster_creator_admin_permissions = true
cluster_endpoint_public_access = true
access_entries = var.access_entries

vpc_id = module.vpc.vpc_id
# Filter to only the secondary-CIDR private subnets (CIDR blocks starting with "100."). These are the subnets where the EKS Control Plane ENIs will be created
subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null])

# Combine the root account, the current user/role, and any additional roles that need to administer the cluster KMS key - required for Terraform updates
kms_key_administrators = distinct(concat([
"arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"],
var.kms_key_admin_roles,
[data.aws_iam_session_context.current.issuer_arn]
))

#---------------------------------------
# Note: This can be further restricted to the specific ports required by each add-on and your application
#---------------------------------------
# Extend cluster security group rules
cluster_security_group_additional_rules = {
ingress_nodes_ephemeral_ports_tcp = {
description = "Nodes on ephemeral ports"
protocol = "tcp"
from_port = 0
to_port = 65535
type = "ingress"
source_node_security_group = true
}
}

# Extend node security group rules
node_security_group_additional_rules = {
# Allows the control plane to talk to worker nodes on all ports. Added to simplify the example and to avoid issues with add-on communication with the control plane.
# This can be restricted further to specific ports based on the requirements of each add-on, e.g., CoreDNS 53, metrics-server 4443, spark-operator 8080, Karpenter 8443, etc.
# Update this according to your security requirements if needed
ingress_cluster_to_node_all_traffic = {
description = "Cluster API to Nodegroup all traffic"
protocol = "-1"
from_port = 0
to_port = 0
type = "ingress"
source_cluster_security_group = true
}
}

eks_managed_node_group_defaults = {
iam_role_additional_policies = {
# Not required, but used in the example to access the nodes to inspect mounted volumes
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}

ebs_optimized = true
# This block device is used only for the root volume. Adjust the volume size according to your needs.
# NOTE: Don't use this volume for ML workloads
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 100
volume_type = "gp3"
}
}
}
}

eks_managed_node_groups = {
# It's recommended to have a Managed Node group for hosting critical add-ons,
# and to use Karpenter to place your workloads instead of additional Managed Node groups.
# You can leverage nodeSelector and taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
system_node_group = {
name = "system-node-group"
description = "EKS Core node group for hosting system add-ons"
# Filter to only the secondary-CIDR private subnets (CIDR blocks starting with "100."). These are the subnets where the nodes/node groups will be provisioned
subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
)

# aws ssm get-parameters --names /aws/service/eks/optimized-ami/${var.eks_cluster_version}/amazon-linux-2023/x86_64/standard/recommended/release_version --region us-west-2
ami_type = "AL2023_x86_64_STANDARD" # For Graviton instances, use AL2023_ARM_64_STANDARD
min_size = 2
max_size = 8
desired_size = 2

instance_types = ["m6i.large"]

labels = {
NodeGroupType = "system-nodegroup"
}

tags = merge(local.tags, {
Name = "system-nodegroup"
})
    }
  }

  node_security_group_tags = merge(local.tags, {
    # NOTE - if creating multiple security groups with this module, only tag the
    # security group that Karpenter should utilize with the following tag
    # (i.e. - at most, only one security group should have this tag in your account)
    "karpenter.sh/discovery" = local.name
  })

}
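The "100." prefix filter above assumes the VPC carries a secondary CIDR dedicated to the node and pod subnets. The vpc.tf added in this commit is not displayed in this view; the following is only a minimal sketch of what such a definition could look like with the terraform-aws-modules VPC module. The module name "vpc" and the locals (local.name, local.azs, local.tags) match the references in the files shown, but the CIDR values, subnet layout, NAT settings, and subnet tags are assumptions, not the repository's actual configuration.

# Sketch only - the actual vpc.tf from this commit is not shown in this view.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = local.name
  cidr = "10.1.0.0/21" # assumed primary CIDR

  # Assumed secondary CIDR; this is what the "100." prefix filter in eks.tf selects on
  secondary_cidr_blocks = ["100.64.0.0/16"]

  azs = local.azs

  # One small primary-CIDR private subnet and one large secondary-CIDR private subnet per AZ
  private_subnets = concat(
    [for k, v in local.azs : cidrsubnet("10.1.0.0/21", 2, k)],
    [for k, v in local.azs : cidrsubnet("100.64.0.0/16", 2, k)]
  )
  public_subnets = [for k, v in local.azs : cidrsubnet("10.1.4.0/22", 2, k)]

  enable_nat_gateway = true
  single_nat_gateway = true

  # Assumed tag so Karpenter can discover the private subnets by the same key used on the node security group
  private_subnet_tags = {
    "karpenter.sh/discovery" = local.name
  }

  tags = local.tags
}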
126 changes: 126 additions & 0 deletions infra/aws/terraform/main.tf
@@ -0,0 +1,126 @@
# ---------------------------------------------------------------
# AWS Provider Configuration
# ---------------------------------------------------------------
# The primary AWS provider, used for interacting with resources in the region specified by 'var.region'.
provider "aws" {
region = local.region
}

# Secondary AWS provider for ECR (Elastic Container Registry) authentication.
# ECR public authentication requires the 'us-east-1' region, which is hardcoded here.
# If your main region is 'us-east-1', you can remove this second provider and use the primary one.
provider "aws" {
alias = "ecr"
region = "us-east-1"
}

# ---------------------------------------------------------------
# Helm Provider Configuration
# ---------------------------------------------------------------
# The Helm provider is used to manage Kubernetes applications, relying on the EKS cluster.
provider "helm" {
kubernetes {
# The EKS cluster API endpoint and certificate are retrieved from the EKS module.
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)

exec {
# Retrieves an authentication token for Kubernetes API using the AWS CLI.
api_version = "client.authentication.k8s.io/v1beta1"
command = "aws"
args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
# Note: The AWS CLI must be installed locally where Terraform is executed.
}
}
}

# ---------------------------------------------------------------
# Local Variables
# ---------------------------------------------------------------
# These locals store reusable values for the project, such as the name, region, and tags.
locals {
# Name and region variables for naming consistency across resources.
name = var.name
region = var.region

# Limiting Availability Zones to two for resource allocation.
azs = slice(data.aws_availability_zones.available.names, 0, 2)

# Project tags for tracking and referencing the GitHub repository.
tags = {
GithubRepo = "https://github.com/KubedAI/spark-rapids-on-kubernetes"
}
}

# ---------------------------------------------------------------
# AWS Data Sources
# ---------------------------------------------------------------
# Data sources used to retrieve AWS-specific information such as current identity, region, and session context.

# EKS cluster authentication data
# data "aws_eks_cluster_auth" "this" {
# name = module.eks.cluster_name
# }

# Retrieves an authorization token for public ECR registry to authenticate image pulls.
# data "aws_ecrpublic_authorization_token" "token" {
# provider = aws.ecr
# }

# Retrieves all available AWS availability zones in the selected region.
data "aws_availability_zones" "available" {}

# Retrieves the current AWS region.
# data "aws_region" "current" {}

# Retrieves the AWS account and caller identity details for the session.
data "aws_caller_identity" "current" {}

# Retrieves the current AWS partition (useful for AWS GovCloud or China regions).
# data "aws_partition" "current" {}

# Retrieves the IAM session context, including the ARN of the currently logged-in user/role.
data "aws_iam_session_context" "current" {
arn = data.aws_caller_identity.current.arn
}

# ---------------------------------------------------------------
# IAM Policy Document for Spark Operator
# ---------------------------------------------------------------
# This IAM policy document (currently commented out) would allow the Spark operator to interact with S3 and CloudWatch Logs for object storage and logging.

# Policy granting permissions for S3 operations required by Spark jobs.
# data "aws_iam_policy_document" "spark_operator" {
# statement {
# sid = "AllowS3AccessForSparkJobs"
# effect = "Allow"
# # Grants access to all S3 resources in the current AWS partition.
# resources = ["arn:${data.aws_partition.current.partition}:s3:::*"]

# actions = [
# "s3:DeleteObject",
# "s3:DeleteObjectVersion",
# "s3:GetObject",
# "s3:ListBucket",
# "s3:PutObject",
# ]
# }

# # Policy granting permissions for CloudWatch Logs operations.
# statement {
# sid = "AllowCloudWatchLogsAccessForSpark"
# effect = "Allow"
# # Grants access to all CloudWatch Log Groups in the current AWS region and account.
# resources = [
# "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*"
# ]

# actions = [
# "logs:CreateLogGroup",
# "logs:CreateLogStream",
# "logs:DescribeLogGroups",
# "logs:DescribeLogStreams",
# "logs:PutLogEvents",
# ]
# }
# }
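The files above reference several input variables (var.name, var.region, var.eks_cluster_version, var.access_entries, var.kms_key_admin_roles) whose declarations live in a variables.tf that is not displayed in this view. A minimal sketch of declarations matching those references follows; the types are conventional choices and the defaults are assumptions, not the repository's actual values.

# Sketch only - the real variables.tf from this commit is not shown in this view.
variable "name" {
  description = "Name prefix applied to the cluster and related resources"
  type        = string
  default     = "spark-rapids-on-eks" # assumed default
}

variable "region" {
  description = "AWS region to deploy into"
  type        = string
  default     = "us-west-2" # assumed default
}

variable "eks_cluster_version" {
  description = "Kubernetes version for the EKS cluster"
  type        = string
  default     = "1.30" # assumed default
}

variable "access_entries" {
  description = "Map of EKS access entries passed through to the EKS module"
  type        = any
  default     = {}
}

variable "kms_key_admin_roles" {
  description = "Additional IAM role ARNs granted administrator access to the cluster KMS key"
  type        = list(string)
  default     = []
}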
4 changes: 4 additions & 0 deletions infra/aws/terraform/outputs.tf
@@ -0,0 +1,4 @@
output "configure_kubectl" {
description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
}
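Only the configure_kubectl helper is defined in this commit. If other tooling needs the raw cluster attributes that main.tf already consumes, outputs along these lines could be added; this is a sketch and not part of the commit, though module.eks.cluster_name and module.eks.cluster_endpoint are the same module outputs referenced above.

# Sketch only - these outputs are not part of this commit.
output "cluster_name" {
  description = "Name of the EKS cluster"
  value       = module.eks.cluster_name
}

output "cluster_endpoint" {
  description = "Endpoint URL of the EKS cluster API server"
  value       = module.eks.cluster_endpoint
}

After terraform apply, the rendered kubeconfig command can be retrieved with terraform output -raw configure_kubectl and run directly.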