feat: Bionemo on Eks (#457)

Co-authored-by: Vara Bonthu <[email protected]>
awslabs · Mar 13, 2024 · 3ca25f4 · 3ca25f4
1 parent 51ee1cf
commit 3ca25f4
Show file tree

Hide file tree

Showing 19 changed files with 1,176 additions and 1 deletion.
diff --git a/ai-ml/bionemo/README.md b/ai-ml/bionemo/README.md
@@ -0,0 +1,63 @@
+## Requirements
+
+| Name | Version |
+|------|---------|
+| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.0.0 |
+| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 3.72 |
+| <a name="requirement_helm"></a> [helm](#requirement\_helm) | >= 2.4.1 |
+| <a name="requirement_http"></a> [http](#requirement\_http) | >= 3.3 |
+| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.14 |
+| <a name="requirement_kubernetes"></a> [kubernetes](#requirement\_kubernetes) | >= 2.10 |
+| <a name="requirement_random"></a> [random](#requirement\_random) | 3.3.2 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| <a name="provider_aws"></a> [aws](#provider\_aws) | 5.38.0 |
+| <a name="provider_http"></a> [http](#provider\_http) | 3.4.1 |
+| <a name="provider_kubectl"></a> [kubectl](#provider\_kubectl) | 1.14.0 |
+
+## Modules
+
+| Name | Source | Version |
+|------|--------|---------|
+| <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
+| <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
+| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.3 |
+| <a name="module_eks_data_addons"></a> [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.30 |
+| <a name="module_fsx_s3_bucket"></a> [fsx\_s3\_bucket](#module\_fsx\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
+| <a name="module_vpc"></a> [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 |
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [aws_fsx_data_repository_association.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/fsx_data_repository_association) | resource |
+| [aws_fsx_lustre_file_system.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/fsx_lustre_file_system) | resource |
+| [aws_security_group.fsx](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource |
+| [kubectl_manifest.mpi_operator](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
+| [kubectl_manifest.static_pv](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
+| [kubectl_manifest.static_pvc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
+| [kubectl_manifest.storage_class](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
+| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
+| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source |
+| [http_http.mpi_operator_yaml](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source |
+| [kubectl_file_documents.mpi_operator_yaml](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/file_documents) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.29"` | no |
+| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"bionemo-on-eks"` | no |
+| <a name="input_region"></a> [region](#input\_region) | Region | `string` | `"us-west-2"` | no |
+| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br>  "100.64.0.0/16"<br>]</pre> | no |
+| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.1.0.0/21"` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| <a name="output_configure_kubectl"></a> [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig |
+| <a name="output_eks_api_server_url"></a> [eks\_api\_server\_url](#output\_eks\_api\_server\_url) | Your eks API server endpoint |
diff --git a/ai-ml/bionemo/addons.tf b/ai-ml/bionemo/addons.tf
@@ -0,0 +1,54 @@
+#---------------------------------------------------------------
+# EKS Blueprints Kubernetes Addons: EKS managed add-ons, CloudWatch metrics and the FSx for Lustre CSI driver
+#---------------------------------------------------------------
+module "eks_blueprints_addons" {
+  source  = "aws-ia/eks-blueprints-addons/aws"
+  version = "~> 1.3"
+
+  cluster_name      = module.eks.cluster_name
+  cluster_endpoint  = module.eks.cluster_endpoint
+  cluster_version   = module.eks.cluster_version
+  oidc_provider_arn = module.eks.oidc_provider_arn
+
+  #---------------------------------------
+  # Amazon EKS Managed Add-ons (coredns, vpc-cni, kube-proxy), each with preserve = true
+  #---------------------------------------
+  eks_addons = {
+    coredns = {
+      preserve = true
+    }
+    vpc-cni = {
+      preserve = true
+    }
+    kube-proxy = {
+      preserve = true
+    }
+  }
+  #---------------------------------------
+  # CloudWatch metrics for EKS; Helm values come from helm-values/aws-cloudwatch-metrics-values.yaml (no template variables passed)
+  #---------------------------------------
+  enable_aws_cloudwatch_metrics = true
+  aws_cloudwatch_metrics = {
+    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
+  }
+
+  #---------------------------------------
+  # Enable FSx for Lustre CSI Driver
+  #---------------------------------------
+  enable_aws_fsx_csi_driver = true
+
+  tags = local.tags
+
+}
+
+#---------------------------------------------------------------
+# Data on EKS Kubernetes Addons: enables the NVIDIA device plugin
+#---------------------------------------------------------------
+module "eks_data_addons" {
+  source  = "aws-ia/eks-data-addons/aws"
+  version = "~> 1.30" # make sure to update this to the latest/desired version
+
+  oidc_provider_arn           = module.eks.oidc_provider_arn
+  enable_nvidia_device_plugin = true
+
+}
diff --git a/ai-ml/bionemo/cleanup.sh b/ai-ml/bionemo/cleanup.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+set -o errexit
+set -o pipefail
+
+targets=(
+  "module.eks"
+  "module.vpc"
+)
+
+#-------------------------------------------
+# Force-finalize namespaces stuck in "Terminating" so they cannot block the destroy
+# Rerun the cleanup.sh script to detect and delete the stuck resources
+#-------------------------------------------
+terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')
+
+# Nothing to finalize is not an error: the loop below becomes a no-op and the destroy still runs
+if [[ -z $terminating_namespaces ]]; then
+    echo "No terminating namespaces found"
+fi
+
+for ns in $terminating_namespaces; do
+    echo "Terminating namespace: $ns"
+    kubectl get namespace "$ns" -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
+done
+
+for target in "${targets[@]}"
+do
+  # Destroy once, inside the `if` condition so errexit cannot abort before FAILED is reported.
+  # tee mirrors the captured output to stderr; pipefail keeps terraform's exit status.
+  if destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/stderr) && [[ $destroy_output == *"Destroy complete!"* ]]; then
+    echo "SUCCESS: Terraform destroy of $target completed successfully"
+  else
+    echo "FAILED: Terraform destroy of $target failed"
+    exit 1
+  fi
+done
+
+# Destroy everything that is left, using the same single-run, status-checked pattern
+# (output mirrored to stderr while being captured for the "Destroy complete!" check).
+if destroy_output=$(terraform destroy -auto-approve 2>&1 | tee /dev/stderr) && [[ $destroy_output == *"Destroy complete!"* ]]; then
+  echo "SUCCESS: Terraform destroy of all targets completed successfully"
+else
+  echo "FAILED: Terraform destroy of all targets failed"
+  exit 1
+fi
diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf
@@ -0,0 +1,146 @@
+#---------------------------------------------------------------
+# EKS Cluster: control plane plus a core managed node group and a GPU managed node group
+#---------------------------------------------------------------
+module "eks" {
+  source  = "terraform-aws-modules/eks/aws"
+  version = "~> 19.15"
+
+  cluster_name                   = local.name
+  cluster_version                = var.eks_cluster_version
+  cluster_endpoint_public_access = true # If true, your cluster API server is accessible from the internet. You can optionally limit the CIDR blocks that can access the public endpoint.
+  vpc_id                         = module.vpc.vpc_id
+  subnet_ids                     = module.vpc.private_subnets
+  manage_aws_auth_configmap      = true
+
+  #---------------------------------------
+  # Note: These rules can be further restricted to the specific ports required by each add-on and your application
+  #---------------------------------------
+  # Extend cluster security group rules: TCP 1025-65535 ingress from the node security group
+  cluster_security_group_additional_rules = {
+    ingress_nodes_ephemeral_ports_tcp = {
+      description                = "Nodes on ephemeral ports"
+      protocol                   = "tcp"
+      from_port                  = 1025
+      to_port                    = 65535
+      type                       = "ingress"
+      source_node_security_group = true
+    }
+  }
+
+  # Extend node-to-node security group rules
+  node_security_group_additional_rules = {
+    ingress_self_all = {
+      description = "Node to node all ports/protocols"
+      protocol    = "-1"
+      from_port   = 0
+      to_port     = 0
+      type        = "ingress"
+      self        = true
+    }
+    # Allows control plane nodes to talk to worker nodes on all ports. Added to simplify the example and to avoid issues with add-ons communicating with the control plane.
+    # This can be restricted further to specific ports based on the requirements of each add-on, e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
+    # Change this according to your security requirements if needed
+    ingress_cluster_to_node_all_traffic = {
+      description                   = "Cluster API to Nodegroup all traffic"
+      protocol                      = "-1"
+      from_port                     = 0
+      to_port                       = 0
+      type                          = "ingress"
+      source_cluster_security_group = true
+    }
+  }
+
+  eks_managed_node_group_defaults = {
+    iam_role_additional_policies = {
+      # Not required, but used in the example to access the nodes to inspect mounted volumes
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+    }
+  }
+
+  eks_managed_node_groups = {
+    #  We recommend having a managed node group (MNG) for your critical workloads and add-ons,
+    #  then relying on Karpenter to scale your workloads.
+    #  You can also make use of nodeSelector and taints/tolerations to spread workloads across the MNG or Karpenter provisioners
+
+    core_node_group = {
+      name        = "core-node-group"
+      description = "EKS Core node group for hosting critical add-ons"
+      # Keep only the secondary-CIDR private subnets, i.e. those whose CIDR block starts with "100.".
+      # Subnet IDs where the nodes/node groups will be provisioned
+      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+      )
+
+      min_size     = 3
+      max_size     = 9
+      desired_size = 3
+
+      instance_types = ["m5.xlarge"]
+
+      ebs_optimized = true
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/xvda"
+          ebs = {
+            volume_size = 100
+            volume_type = "gp3"
+          }
+        }
+      }
+
+      labels = {
+        WorkerType    = "ON_DEMAND"
+        NodeGroupType = "core"
+      }
+
+      tags = merge(local.tags, {
+        Name                     = "core-node-grp",
+        "karpenter.sh/discovery" = local.name
+      })
+    }
+
+    gpu1 = {
+      name        = "gpu-node-grp"
+      description = "EKS Node Group to run GPU workloads"
+      # Keep only the secondary-CIDR private subnets, i.e. those whose CIDR block starts with "100.".
+      # Subnet IDs where the nodes/node groups will be provisioned
+      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+      )
+
+      ami_type            = "AL2_x86_64_GPU"
+      ami_release_version = "1.29.0-20240213" # NOTE(review): pinned release looks tied to Kubernetes 1.29 — confirm it is updated together with var.eks_cluster_version
+      min_size            = 2
+      max_size            = 3
+      desired_size        = 2
+
+      instance_types = ["p3.16xlarge"]
+      ebs_optimized  = true
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/xvda"
+          ebs = {
+            volume_size = 200
+            volume_type = "gp3"
+          }
+        }
+      }
+      taints = {
+        gpu = {
+          key      = "nvidia.com/gpu"
+          effect   = "NO_SCHEDULE"
+          operator = "EXISTS" # NOTE(review): node-group taints normally take key/value/effect — confirm the module consumes "operator"
+        }
+      }
+      labels = {
+        WorkerType = "ON_DEMAND"
+        eks-node   = "gpu"
+      }
+
+      tags = merge(local.tags, {
+        Name                     = "gpu-node-grp",
+        "karpenter.sh/discovery" = local.name
+      })
+    }
+  }
+}
diff --git a/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml b/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml
@@ -0,0 +1,76 @@
+apiVersion: "kubeflow.org/v1"
+kind: PyTorchJob # Kubeflow PyTorchJob that runs the BioNeMo ESM-1nv pretraining script against the /fsx volume
+metadata:
+  name: esm1nv-pretraining
+spec:
+  elasticPolicy:
+    rdzvBackend: c10d
+    minReplicas: 1
+    maxReplicas: 16
+    maxRestarts: 100
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 80
+  nprocPerNode: "8" # NOTE(review): 8 processes per pod, yet each pod requests a single GPU below — confirm
+  pytorchReplicaSpecs:
+    Worker:
+      replicas: 16 # same value as elasticPolicy.maxReplicas
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          tolerations: # lets the pods schedule onto nodes tainted nvidia.com/gpu:NoSchedule
+            - key: nvidia.com/gpu
+              operator: Exists
+              effect: NoSchedule
+          volumes:
+          - name: fsx-pv-storage
+            persistentVolumeClaim:
+              claimName: fsx-static-pvc
+          containers:
+            - name: pytorch
+              image: nvcr.io/nvidia/clara/bionemo-framework:1.2
+              resources:
+                limits:
+                  nvidia.com/gpu: 1 # NOTE(review): trainer.devices=8 is passed below — confirm the GPU limit matches
+              env:
+                - name: NCCL_DEBUG
+                  value: "INFO"
+                - name: DATA_PATH
+                  value: "/fsx"
+                - name: HYDRA_FULL_ERROR
+                  value: "1"
+              volumeMounts:
+                - mountPath: "/fsx" # dataset, index mappings and experiment results all live under this mount
+                  name: fsx-pv-storage
+              imagePullPolicy: Always
+              command:
+                - "python3"
+                - "-m"
+                - "torch.distributed.run"
+                - "/workspace/bionemo/examples/protein/esm1nv/pretrain.py"
+                - "--config-path=/workspace/bionemo/examples/protein/esm1nv/conf"
+                - "--config-name=pretrain_small"
+                - "exp_manager.exp_dir=/fsx/esm1nv-train/esm1nv_pretraining/esm1nv_batch256_gradacc1_nodes2-small/results"
+                - "exp_manager.create_wandb_logger=False" # W&B logger creation is turned off; the wandb_logger_kwargs overrides below are still passed
+                - "exp_manager.wandb_logger_kwargs.name=esm1nv_batch256_gradacc1_nodes2-small"
+                - "exp_manager.wandb_logger_kwargs.project=esm1nv_pretraining"
+                - "++exp_manager.wandb_logger_kwargs.offline=False"
+                - "trainer.num_nodes=2" # NOTE(review): 2 nodes x 8 devices = 16 ranks; confirm this lines up with replicas/nprocPerNode above
+                - "trainer.devices=8"
+                - "trainer.max_steps=1000000"
+                - "trainer.accumulate_grad_batches=1"
+                - "trainer.val_check_interval=500"
+                - "model.micro_batch_size=8"
+                - "model.tensor_model_parallel_size=1"
+                - "model.data.dataset_path=/fsx/processed"
+                - "model.data.dataset.train='x_OP_000..049_CL_'" # train, val and test all point at the same file range
+                - "model.data.dataset.val='x_OP_000..049_CL_'"
+                - "model.data.dataset.test='x_OP_000..049_CL_'"
+                - "model.data.index_mapping_dir=/fsx/processed"
+                - "++model.dwnstr_task_validation.enabled=False"