Skip to content

Commit

Permalink
feat: Jupyterhub blueprint updates for GPU and Neuron
Browse files Browse the repository at this point in the history
  • Loading branch information
askulkarni2 authored Oct 2, 2023
2 parents f2bace4 + b215224 commit b91c245
Show file tree
Hide file tree
Showing 43 changed files with 2,448 additions and 892 deletions.
208 changes: 192 additions & 16 deletions ai-ml/jupyterhub/addons.tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# Authentication token for the EKS cluster created by module.eks; consumed by
# the Kubernetes/Helm providers to talk to the cluster API.
data "aws_eks_cluster_auth" "this" {
name = module.eks.cluster_name
}

# Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM)
data "aws_acm_certificate" "issued" {
count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
Expand All @@ -16,6 +12,7 @@ data "aws_ecrpublic_authorization_token" "token" {
locals {
cognito_custom_domain = var.cognito_custom_domain
}

#---------------------------------------------------------------
# IRSA for EBS CSI Driver
#---------------------------------------------------------------
Expand Down Expand Up @@ -69,7 +66,7 @@ module "eks_blueprints_addons" {
enable_cluster_proportional_autoscaler = true
cluster_proportional_autoscaler = {
timeout = "300"
values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
values = [templatefile("${path.module}/helm/coredns-autoscaler/values.yaml", {
target = "deployment/coredns"
})]
description = "Cluster Proportional Autoscaler for CoreDNS Service"
Expand All @@ -81,7 +78,7 @@ module "eks_blueprints_addons" {
enable_metrics_server = true
metrics_server = {
timeout = "300"
values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
values = [templatefile("${path.module}/helm/metrics-server/values.yaml", {})]
}

#---------------------------------------
Expand All @@ -91,7 +88,7 @@ module "eks_blueprints_addons" {
cluster_autoscaler = {
timeout = "300"
create_role = true
values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {
values = [templatefile("${path.module}/helm/cluster-autoscaler/values.yaml", {
aws_region = var.region,
eks_cluster_id = module.eks.cluster_name
})]
Expand All @@ -109,15 +106,153 @@ module "eks_blueprints_addons" {
}

#---------------------------------------
# CloudWatch metrics for EKS
# AWS Load Balancer Controller
#---------------------------------------
enable_aws_cloudwatch_metrics = true
aws_cloudwatch_metrics = {
timeout = "300"
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
enable_aws_load_balancer_controller = true

#---------------------------------------
# Prometheus and Grafana stack
#---------------------------------------
#---------------------------------------------------------------
# Install Monitoring Stack with Prometheus and Grafana
# 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
# 2- Grafana Admin user: admin
# 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <output.grafana_secret_name> --region $AWS_REGION --query "SecretString" --output text`
#---------------------------------------------------------------
enable_kube_prometheus_stack = true
kube_prometheus_stack = {
values = [templatefile("${path.module}/helm/kube-prometheus-stack/values.yaml", {})]
chart_version = "48.1.1"
set_sensitive = [
{
name = "grafana.adminPassword"
value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
}
],
}
#---------------------------------------
# AWS for FluentBit
#---------------------------------------
enable_aws_for_fluentbit = true
aws_for_fluentbit_cw_log_group = {
use_name_prefix = false
name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
retention_in_days = 30
}
aws_for_fluentbit = {
values = [templatefile("${path.module}/helm/aws-for-fluentbit/values.yaml", {
region = local.region,
cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
cluster_name = module.eks.cluster_name
})]
}

enable_aws_load_balancer_controller = true
#---------------------------------------
# Additional Helm Charts
#---------------------------------------
helm_releases = {
storageclass = {
name = "storageclass"
description = "A Helm chart for storage configurations"
chart = "${path.module}/helm/storageclass"
}
karpenter-resources-cpu = {
name = "karpenter-resources-cpu"
description = "A Helm chart for karpenter CPU based resources"
chart = "${path.module}/helm/karpenter-resources"
values = [
<<-EOT
clusterName: ${module.eks.cluster_name}
EOT
]
}
karpenter-resources-ts = {
name = "karpenter-resources-ts"
description = "A Helm chart for karpenter GPU based resources - compatible with GPU time slicing"
chart = "${path.module}/helm/karpenter-resources"
values = [
<<-EOT
name: gpu-ts
clusterName: ${module.eks.cluster_name}
instanceSizes: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
instanceFamilies: ["g5"]
taints:
- key: hub.jupyter.org/dedicated
value: "user"
effect: "NoSchedule"
- key: nvidia.com/gpu
effect: "NoSchedule"
amiFamily: Ubuntu
EOT
]
}
karpenter-resources-mig = {
name = "karpenter-resources-mig"
description = "A Helm chart for karpenter GPU based resources - compatible with GPU MIG"
chart = "${path.module}/helm/karpenter-resources"
values = [
<<-EOT
name: gpu-mig
clusterName: ${module.eks.cluster_name}
instanceSizes: ["24xlarge"]
instanceFamilies: ["p4d"]
taints:
- key: hub.jupyter.org/dedicated
value: "user"
effect: "NoSchedule"
- key: nvidia.com/gpu
effect: "NoSchedule"
amiFamily: Ubuntu
EOT
]
}
karpenter-resources-inf = {
name = "karpenter-resources-inf"
description = "A Helm chart for karpenter Inferentia based resources"
chart = "${path.module}/helm/karpenter-resources"
values = [
<<-EOT
name: inferentia
clusterName: ${module.eks.cluster_name}
instanceSizes: ["8xlarge", "24xlarge"]
instanceFamilies: ["inf2"]
taints:
- key: aws.amazon.com/neuroncore
value: "true"
effect: "NoSchedule"
- key: aws.amazon.com/neuron
value: "true"
effect: "NoSchedule"
- key: hub.jupyter.org/dedicated
value: "user"
effect: "NoSchedule"
EOT
]
}
# Karpenter node resources for AWS Trainium instances.
# Fix: instanceFamilies previously said ["inf2"] — a copy-paste from the
# Inferentia entry above. Trainium is the trn1 family, whose sizes are
# trn1.2xlarge and trn1.32xlarge, matching instanceSizes below (inf2 has
# no 32xlarge size).
karpenter-resources-trn = {
  name        = "karpenter-resources-trn"
  description = "A Helm chart for karpenter Trainium based resources"
  chart       = "${path.module}/helm/karpenter-resources"
  values = [
    <<-EOT
      name: trainium
      clusterName: ${module.eks.cluster_name}
      instanceSizes: ["2xlarge", "32xlarge"]
      instanceFamilies: ["trn1"]
      taints:
        - key: aws.amazon.com/neuroncore
          value: "true"
          effect: "NoSchedule"
        - key: aws.amazon.com/neuron
          value: "true"
          effect: "NoSchedule"
        - key: hub.jupyter.org/dedicated
          value: "user"
          effect: "NoSchedule"
    EOT
  ]
}
}

tags = local.tags
}
Expand All @@ -131,19 +266,25 @@ module "eks_data_addons" {

oidc_provider_arn = module.eks.oidc_provider_arn

#---------------------------------------------------------------
# Enable Neuron Device Plugin
#---------------------------------------------------------------
enable_aws_neuron_device_plugin = true

#---------------------------------------------------------------
# Enable GPU operator
#---------------------------------------------------------------
enable_nvidia_gpu_operator = var.jupyter_notebook_support == "gpu" ? true : false
enable_nvidia_gpu_operator = true
nvidia_gpu_operator_helm_config = {
values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})]
values = [templatefile("${path.module}/helm/nvidia-gpu-operator/values.yaml", {})]
}

#---------------------------------------------------------------
# JupyterHub Add-on
#---------------------------------------------------------------
enable_jupyterhub = true
jupyterhub_helm_config = {
values = [templatefile("${path.module}/helm-values/jupyterhub-values-${var.jupyter_hub_auth_mechanism}-${var.jupyter_notebook_support}.yaml", {
values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", {
ssl_cert_arn = try(data.aws_acm_certificate.issued[0].arn, "")
jupyterdomain = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "")
authorize_url = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
Expand All @@ -154,4 +295,39 @@ module "eks_data_addons" {
jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
})]
}

#---------------------------------------------------------------
# Kubecost Add-on
#---------------------------------------------------------------
enable_kubecost = true
kubecost_helm_config = {
values = [templatefile("${path.module}/helm/kubecost/values.yaml", {})]
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}
}

#---------------------------------------------------------------
# Grafana Admin credentials resources
#---------------------------------------------------------------
# Reads back the generated Grafana admin password from Secrets Manager so it
# can be injected into the kube-prometheus-stack Helm release via set_sensitive.
# depends_on forces the secret version to exist before the read.
data "aws_secretsmanager_secret_version" "admin_password_version" {
secret_id = aws_secretsmanager_secret.grafana.id
depends_on = [aws_secretsmanager_secret_version.grafana]
}

# Randomly generated 16-character Grafana admin password. Special characters
# are restricted to "@" and "_" to avoid characters that are awkward in
# shells, URLs, and Helm values.
resource "random_password" "grafana" {
length = 16
special = true
override_special = "@_"
}

# Secrets Manager secret holding the Grafana admin password.
# name_prefix avoids collisions with remnants of previously destroyed stacks.
#tfsec:ignore:aws-ssm-secret-use-customer-key
resource "aws_secretsmanager_secret" "grafana" {
name_prefix = "${local.name}-grafana-"
recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
}

# Stores the generated random password as the current version of the secret.
resource "aws_secretsmanager_secret_version" "grafana" {
secret_id = aws_secretsmanager_secret.grafana.id
secret_string = random_password.grafana.result
}
7 changes: 2 additions & 5 deletions ai-ml/jupyterhub/cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
set -o errexit
set -o pipefail

read -p "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com :" acm_certificate_domain
read -p "Enter sub-domain name for jupyterhub to be hosted, e.g. eks.example.com : " jupyterhub_domain

targets=(
"module.eks_data_addons"
"module.eks_blueprints_addons"
Expand Down Expand Up @@ -32,7 +29,7 @@ done
#-------------------------------------------
for target in "${targets[@]}"
do
destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty)
destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty)
if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of $target completed successfully"
else
Expand All @@ -44,7 +41,7 @@ done
#-------------------------------------------
# Terraform destroy full
#-------------------------------------------
destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty)
destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty)
if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of all targets completed successfully"
else
Expand Down
36 changes: 36 additions & 0 deletions ai-ml/jupyterhub/examples/create_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# Build and push the JupyterHub Neuron (PyTorch) notebook image to Amazon ECR.
#
# Steps:
#   1. Ensure the target ECR repository exists (create it if missing).
#   2. Authenticate Docker against the ECR registry.
#   3. Build a linux/amd64 image from $DOCKER_FILE and push it as :latest.
#
# Requires: AWS CLI, docker with buildx, and credentials allowed to manage
# ECR in $REGION.

set -o errexit
set -o pipefail

# Target region, repository, and Dockerfile.
# NOTE(review): the repeated "pytorch" in the repo/Dockerfile names looks like
# a typo, but is kept as-is — renaming would target a different repository.
REGION=us-west-2
ECR_REPO_NAME=jupyterhub-pytorch-neuron-pytorch
DOCKER_FILE=docker/jupyterhub-pytorch-neuron-pytorch.Dockerfile

# Ensure the repository exists, then resolve its URI.
# Fix: describe-repositories takes the plural --repository-names flag; the
# singular form is not a valid AWS CLI option.
if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$REGION" >/dev/null 2>&1; then
    echo "ECR repository '$ECR_REPO_NAME' already exists."
else
    aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$REGION"
fi
ECR_REPO_URI=$(aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$REGION" --output text)
echo "Repository URL: $ECR_REPO_URI"

# Log in to the ECR *registry* host — the part of the repo URI before the
# first '/' — which is what docker login expects.
ECR_REGISTRY=${ECR_REPO_URI%%/*}
echo "Logging in to Amazon ECR..."
aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$ECR_REGISTRY"

# Build, tag, and push the image for linux/amd64 in a single buildx invocation.
echo "Building, tagging and pushing docker image... $ECR_REPO_URI:latest"
docker buildx build --push --tag "$ECR_REPO_URI:latest" -o type=image --platform=linux/amd64 -f "$DOCKER_FILE" .

# Brief pause so ECR has settled before any follow-up commands use the image.
# (Message now printed before the sleep, not after.)
echo "Sleeping for 5 seconds..."
sleep 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Use the Jupyter base notebook with Python 3.10 as the base image
FROM jupyter/base-notebook:python-3.10

# Maintainer label
LABEL maintainer="DoEKS"

# Make apt/debconf fully non-interactive during the build.
# Fix: the recognized debconf value is "noninteractive" (no hyphen);
# "non-interactive" is silently ignored and prompts could still appear.
ENV DEBIAN_FRONTEND=noninteractive

# Switch to root to add the Neuron apt repo and install system packages
USER root

# Install gnupg (needed for apt-key below), git, and a C++ toolchain used to
# build Neuron Python extensions. Clean apt lists to keep the layer small.
RUN apt-get update -y && \
    apt-get install -y gnupg git g++ && \
    rm -rf /var/lib/apt/lists/*

# Register the AWS Neuron apt repository and install the Neuron runtime,
# collectives, and tools packages.
# NOTE(review): apt-key is deprecated; migrate to a keyring file under
# /etc/apt/trusted.gpg.d/ when the base image drops apt-key support.
RUN \
    . /etc/os-release && \
    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
    apt-get update -y && \
    apt-get install -y aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* && \
    rm -rf /var/lib/apt/lists/*

# Switch back to the unprivileged notebook user for Python package installs
USER jovyan

# Point pip at the Neuron package index, install the PyTorch/Neuron stack,
# and register a dedicated Jupyter kernel for it.
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
    pip install transformers-neuronx sentencepiece transformers wget awscli ipywidgets neuronx-cc==2.* torch-neuronx torchvision ipykernel environment_kernels && \
    python -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"

# Put the Neuron CLI tools (neuron-ls, neuron-top, ...) on PATH
# (modern key=value ENV form)
ENV PATH=/opt/aws/neuron/bin:$PATH
Loading

0 comments on commit b91c245

Please sign in to comment.