Skip to content

Commit

Permalink
Bottlerocket cache container image (#1)
Browse files Browse the repository at this point in the history
* feat: run GPU node with BR and EBS snapshot with container image cache

* refactor: remove kubectl_manifest of karpenter custom resources

* feat: locust file for load testing

* feat: End-to-end deployment of Bottlerocket nodes with container image cache
  • Loading branch information
lindarr915 authored Aug 2, 2024
1 parent d0cd671 commit 035bc29
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 155 deletions.
22 changes: 12 additions & 10 deletions ai-ml/jark-stack/terraform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,6 @@ Before we begin, ensure you have all the prerequisites in place to make the depl
- kubectl
- terraform

## Reduce Cold Start Time by Preloading Container Images in Bottlerocket OS

Define the `TF_VAR_bottlerocket_data_disk_snpashot_id` environment variable to let Karpenter provision Bottlerocket worker nodes from an EBS snapshot with preloaded container images, which reduces container cold-start time. This will likely save about 10 minutes that would otherwise be spent downloading and extracting container images from Amazon ECR.

To build snapshots with preloaded container images, refer to [this page](../preload-container-image-ami/README.md) for details.

```
# Get the snapshot ID with ./snapshot.sh
export TF_VAR_bottlerocket_data_disk_snpashot_id=snap-0c6d965cf431785ed

./install.sh
...
Expand All @@ -44,6 +34,8 @@ Events:
# Deploy
Clone the repository
```
Expand All @@ -54,6 +46,16 @@ Navigate into one of the example directories and run install.sh script
Important Note: Ensure that you update the region in the variables.tf file before deploying the blueprint. Additionally, confirm that your local region setting matches the specified region to prevent any discrepancies. For example, set your `export AWS_DEFAULT_REGION="<REGION>"` to the desired region:
## Reduce Cold Start Time by Preloading Container Images in Bottlerocket OS (o)
Define the `TF_VAR_bottlerocket_data_disk_snpashot_id` environment variable to let Karpenter provision Bottlerocket worker nodes from an EBS snapshot with preloaded container images, which reduces container cold-start time. This will likely save about 10 minutes that would otherwise be spent downloading and extracting container images from Amazon ECR.
To build snapshots with preloaded container images, refer to [this page](../preload-container-image-ami/README.md) for details.
# Get the snapshot ID with ./snapshot.sh
export TF_VAR_bottlerocket_data_disk_snpashot_id=snap-0c6d965cf431785ed
```
cd data-on-eks/ai-ml/jark-stack/ && chmod +x install.sh
./install.sh
Expand Down
40 changes: 36 additions & 4 deletions ai-ml/jark-stack/terraform/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,10 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
# Data on EKS Kubernetes Addons
#---------------------------------------------------------------

module "data_addons" {
source = "aws-ia/eks-data-addons/aws"
version = "~> 1.31.4" # ensure to update this to the latest/desired version
version = "~> 1.40" # source = "aws-ia/eks-data-addons/aws"

oidc_provider_arn = module.eks.oidc_provider_arn

Expand Down Expand Up @@ -182,7 +183,7 @@ module "data_addons" {
#---------------------------------------------------------------
enable_nvidia_device_plugin = true
nvidia_device_plugin_helm_config = {
version = "v0.15.1"
version = "v0.16.1"
name = "nvidia-device-plugin"
values = [
<<-EOT
Expand Down Expand Up @@ -223,21 +224,37 @@ module "data_addons" {
#---------------------------------------------------------------
# Karpenter Resources Add-on
#---------------------------------------------------------------
enable_karpenter_resources = false
enable_karpenter_resources = true
karpenter_resources_helm_config = {

g5-gpu-karpenter = {
values = [
<<-EOT
name: g5-gpu-karpenter
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
amiFamily: Bottlerocket
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
id: ${module.vpc.private_subnets[2]}
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
instanceStorePolicy: RAID0
blockDeviceMappings:
# Root device
- deviceName: /dev/xvda
ebs:
volumeSize: 50Gi
volumeType: gp3
encrypted: true
# Data device: Container resources such as images and logs
- deviceName: /dev/xvdb
ebs:
volumeSize: 300Gi
volumeType: gp3
encrypted: true
${var.bottlerocket_data_disk_snpashot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""}
nodePool:
labels:
Expand Down Expand Up @@ -276,13 +293,28 @@ module "data_addons" {
name: x86-cpu-karpenter
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
amiFamily: Bottlerocket
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
id: ${module.vpc.private_subnets[3]}
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
instanceStorePolicy: RAID0
# instanceStorePolicy: RAID0
blockDeviceMappings:
# Root device
- deviceName: /dev/xvda
ebs:
volumeSize: 100Gi
volumeType: gp3
encrypted: true
# Data device: Container resources such as images and logs
- deviceName: /dev/xvdb
ebs:
volumeSize: 300Gi
volumeType: gp3
encrypted: true
${var.bottlerocket_data_disk_snpashot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""}
nodePool:
labels:
Expand Down
4 changes: 4 additions & 0 deletions ai-ml/jark-stack/terraform/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ targets=(
"module.eks"
)

# Get the snapshot ID with ./snapshot.sh
# Uncomment the command line below and replace the value with the snapshot ID
# export TF_VAR_bottlerocket_data_disk_snpashot_id=snap-0ffe58efdd845d938

# Initialize Terraform
terraform init -upgrade

Expand Down
141 changes: 0 additions & 141 deletions ai-ml/jark-stack/terraform/karpenter.tf
Original file line number Diff line number Diff line change
@@ -1,144 +1,3 @@

# Karpenter EC2NodeClass named "default" (karpenter.k8s.aws/v1beta1), applied as a raw manifest.
# - AMI family is Bottlerocket; the node role comes from module.eks_blueprints_addons.
# - Security groups are selected by the "<cluster_name>-node" Name tag; subnets by the
#   karpenter.sh/discovery tag set to the cluster name.
# - Two encrypted gp3 EBS mappings: /dev/xvda (50Gi, root) and /dev/xvdb (300Gi, data).
# - If var.bottlerocket_data_disk_snpashot_id is not null, a "snapshotID: <id>" line is
#   rendered after the /dev/xvdb settings; otherwise an empty string is rendered.
# NOTE(review): "snpashot" is misspelled in the variable name; renaming it would also
# require changing the variable declaration and the TF_VAR_ export used by callers.
resource "kubectl_manifest" "karpenter_gpu_node_class" {
yaml_body = <<-YAML
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
name: default
spec:
amiFamily: Bottlerocket
role: ${module.eks_blueprints_addons.karpenter.node_iam_role_name}
securityGroupSelectorTerms:
- tags:
Name: ${module.eks.cluster_name}-node
subnetSelectorTerms:
- tags:
karpenter.sh/discovery: ${module.eks.cluster_name}
tags:
karpenter.sh/discovery: ${module.eks.cluster_name}
blockDeviceMappings:
# Root device
- deviceName: /dev/xvda
ebs:
volumeSize: 50Gi
volumeType: gp3
encrypted: true
# Data device: Container resources such as images and logs
- deviceName: /dev/xvdb
ebs:
volumeSize: 300Gi
volumeType: gp3
encrypted: true
${var.bottlerocket_data_disk_snpashot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""}
YAML
# Applied only after the eks_blueprints_addons module has been created.
depends_on = [module.eks_blueprints_addons]
}

# Karpenter NodePool named "gpu" (karpenter.sh/v1beta1), applied as a raw manifest.
# - Disruption: consolidationPolicy WhenEmpty with consolidateAfter 600s; expireAfter 720h.
# - Limits: cpu 1k, memory 1000Gi, nvidia.com/gpu 50.
# - Nodes are labelled NodeGroupType=g5-gpu-karpenter and type=karpenter, and reference
#   the node class named "default".
# - Requirements: amd64, linux, instance category "g" with generation Gt 4,
#   4/8/16 vCPUs, nitro hypervisor, zones from local.azs, on-demand capacity.
# - Nodes carry the taint nvidia.com/gpu=Exists:NoSchedule.
resource "kubectl_manifest" "karpenter_gpu_node_pool" {
yaml_body = <<-YAML
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: gpu
spec:
disruption:
consolidateAfter: 600s
consolidationPolicy: WhenEmpty
expireAfter: 720h
limits:
cpu: 1k
memory: 1000Gi
nvidia.com/gpu: 50
template:
metadata:
labels:
NodeGroupType: g5-gpu-karpenter
type: karpenter
spec:
nodeClassRef:
name: default
requirements:
- key: kubernetes.io/arch
operator: In
values: ["amd64"]
- key: karpenter.k8s.aws/instance-category
operator: In
values: ["g"]
- key: karpenter.k8s.aws/instance-generation
operator: Gt
values: ["4"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16"]
- key: kubernetes.io/os
operator: In
values: ["linux"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "topology.kubernetes.io/zone"
operator: In
values: ${jsonencode(local.azs)}
- key: karpenter.sh/capacity-type
operator: In
values: ["on-demand"]
taints:
- key: nvidia.com/gpu
value: "Exists"
effect: "NoSchedule"
YAML
# Applied only after the eks_blueprints_addons module has been created.
depends_on = [module.eks_blueprints_addons]
}

# Karpenter NodePool named "default" (karpenter.sh/v1beta1), applied as a raw manifest.
# - Disruption: consolidationPolicy WhenEmpty with consolidateAfter 600s; expireAfter 720h.
# - Limits: cpu 1k.
# - Nodes are labelled NodeGroupType=x86-cpu-karpenter and type=karpenter, set kubelet
#   maxPods to 110, and reference the node class named "default".
# - Requirements: instance categories c/m/r, 4/8/16 vCPUs, nitro hypervisor,
#   zones from local.azs, amd64, on-demand capacity.
resource "kubectl_manifest" "karpenter_node_pool" {
yaml_body = <<-YAML
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: default
spec:
disruption:
consolidateAfter: 600s
consolidationPolicy: WhenEmpty
expireAfter: 720h
limits:
cpu: 1k
template:
metadata:
labels:
NodeGroupType: x86-cpu-karpenter
type: karpenter
spec:
kubelet:
maxPods: 110
nodeClassRef:
name: default
requirements:
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["c", "m", "r"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "topology.kubernetes.io/zone"
operator: In
values: ${jsonencode(local.azs)}
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["on-demand"]
YAML
# Applied only after the eks_blueprints_addons module has been created.
depends_on = [module.eks_blueprints_addons]
}


resource "aws_iam_policy" "karpenter_controlloer_policy" {
description = "Additional IAM policy for Karpenter controller"
policy = data.aws_iam_policy_document.karpenter_controller_policy.json
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import json
from locust import HttpUser, task, between

class StableDiffusionUser(HttpUser):
    """Locust user that repeatedly requests an image from the ``/imagine`` endpoint."""

    wait_time = between(1, 2)  # Seconds between requests

    @task
    def generate_image(self):
        """Send one ``GET /imagine`` request and print whether it succeeded.

        The prompt is sent only as a query parameter. The original code also
        attached the same payload as a JSON request body with a Content-Type
        header; content in a GET request has no defined semantics and may be
        rejected by servers or proxies, so it is omitted here.
        """
        prompt = "A beautiful sunset over the ocean"

        response = self.client.get("/imagine", params={"prompt": prompt})

        if response.status_code == 200:
            print(f"Generated image for prompt: {prompt}")
        else:
            print(f"Error generating image: {response.text}")

# You can add more tasks here if needed
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 035bc29

Please sign in to comment.