From f94cebb6424ddcac0918730296b068098505aaaa Mon Sep 17 00:00:00 2001
From: John Dzialo
Date: Fri, 11 Oct 2024 16:50:13 -0400
Subject: [PATCH] feat: BioNemo EKS cluster using CloudWatch Observability Plugin (#641)

Co-authored-by: John Dzialo
---
 ai-ml/bionemo/addons.tf                       | 49 +++++++++-
 ai-ml/bionemo/cleanup.sh                      |  0
 ai-ml/bionemo/eks.tf                          | 11 +--
 ai-ml/bionemo/examples/README.md              |  9 ++
 .../esm1nv_pretrain-job.yaml                  |  8 +-
 .../{training => esm1nv}/uniref50-job.yaml    |  0
 ai-ml/bionemo/examples/esm2nv/data-job.yaml   | 43 ++++++++
 .../examples/esm2nv/esm2nv-training-job.yaml  | 97 +++++++++++++++++++
 .../aws-cloudwatch-metrics-values.yaml        | 11 ---
 ai-ml/bionemo/install.sh                      |  0
 10 files changed, 202 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 ai-ml/bionemo/cleanup.sh
 create mode 100644 ai-ml/bionemo/examples/README.md
 rename ai-ml/bionemo/examples/{training => esm1nv}/esm1nv_pretrain-job.yaml (96%)
 rename ai-ml/bionemo/examples/{training => esm1nv}/uniref50-job.yaml (100%)
 create mode 100644 ai-ml/bionemo/examples/esm2nv/data-job.yaml
 create mode 100644 ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
 delete mode 100755 ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml
 mode change 100644 => 100755 ai-ml/bionemo/install.sh

diff --git a/ai-ml/bionemo/addons.tf b/ai-ml/bionemo/addons.tf
index 74c152dc4..6b47a5ffe 100644
--- a/ai-ml/bionemo/addons.tf
+++ b/ai-ml/bionemo/addons.tf
@@ -23,14 +23,22 @@ module "eks_blueprints_addons" {
     kube-proxy = {
       preserve = true
     }
+    amazon-cloudwatch-observability = {
+      preserve                 = true
+      service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn
+    }
   }
+
   #---------------------------------------
-  # CloudWatch metrics for EKS
+  # ALB Controller
   #---------------------------------------
-  enable_aws_cloudwatch_metrics = true
-  aws_cloudwatch_metrics = {
-    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
-  }
+  enable_aws_load_balancer_controller = true
+
+  #---------------------------------------
+  # Kubernetes Metrics Server
+  #---------------------------------------
+  enable_metrics_server = true
+
   #---------------------------------------
   # Enable FSx for Lustre CSI Driver
@@ -52,3 +60,34 @@ module "eks_data_addons" {
 
   enable_nvidia_device_plugin = true
 }
+
+#---------------------------------------------------------------
+# EKS Amazon CloudWatch Observability Role
+#---------------------------------------------------------------
+resource "aws_iam_role" "cloudwatch_observability_role" {
+  name = "eks-cloudwatch-agent-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRoleWithWebIdentity"
+        Effect = "Allow"
+        Principal = {
+          Federated = module.eks.oidc_provider_arn
+        }
+        Condition = {
+          StringEquals = {
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent",
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com"
+          }
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" {
+  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+  role       = aws_iam_role.cloudwatch_observability_role.name
+}
diff --git a/ai-ml/bionemo/cleanup.sh b/ai-ml/bionemo/cleanup.sh
old mode 100644
new mode 100755
diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf
index 2f2c5d2f1..e45e5a816 100644
--- a/ai-ml/bionemo/eks.tf
+++ b/ai-ml/bionemo/eks.tf
@@ -108,13 +108,12 @@ module "eks" {
module "eks" { substr(cidr_block, 0, 4) == "100." ? subnet_id : null] ) - ami_type = "AL2_x86_64_GPU" - ami_release_version = "1.29.0-20240213" - min_size = 2 - max_size = 3 - desired_size = 2 + ami_type = "AL2_x86_64_GPU" + min_size = 2 + max_size = 3 + desired_size = 2 - instance_types = ["p3.16xlarge"] + instance_types = ["g5.12xlarge"] ebs_optimized = true block_device_mappings = { xvda = { diff --git a/ai-ml/bionemo/examples/README.md b/ai-ml/bionemo/examples/README.md new file mode 100644 index 000000000..0ed4140bd --- /dev/null +++ b/ai-ml/bionemo/examples/README.md @@ -0,0 +1,9 @@ +# BioNemo Examples + +Examples make use of the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator) for Kubernetes ML Training jobs with PyTorch. + +Install the Kubeflow Training Operator in the cluster before running these Kustomization scripts with the kubectl command line tool. + +``` +kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0" +``` diff --git a/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml b/ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml similarity index 96% rename from ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml rename to ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml index 36479b99e..7bd40e212 100644 --- a/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml +++ b/ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml @@ -6,7 +6,7 @@ spec: elasticPolicy: rdzvBackend: c10d minReplicas: 1 - maxReplicas: 16 + maxReplicas: 8 maxRestarts: 100 metrics: - type: Resource @@ -15,10 +15,10 @@ spec: target: type: Utilization averageUtilization: 80 - nprocPerNode: "8" + nprocPerNode: "4" pytorchReplicaSpecs: Worker: - replicas: 16 + replicas: 8 template: metadata: annotations: @@ -62,7 +62,7 @@ spec: - "exp_manager.wandb_logger_kwargs.project=esm1nv_pretraining" - "++exp_manager.wandb_logger_kwargs.offline=False" - "trainer.num_nodes=2" - - "trainer.devices=8" + - "trainer.devices=4" - "trainer.max_steps=1000000" - "trainer.accumulate_grad_batches=1" - "trainer.val_check_interval=500" diff --git a/ai-ml/bionemo/examples/training/uniref50-job.yaml b/ai-ml/bionemo/examples/esm1nv/uniref50-job.yaml similarity index 100% rename from ai-ml/bionemo/examples/training/uniref50-job.yaml rename to ai-ml/bionemo/examples/esm1nv/uniref50-job.yaml diff --git a/ai-ml/bionemo/examples/esm2nv/data-job.yaml b/ai-ml/bionemo/examples/esm2nv/data-job.yaml new file mode 100644 index 000000000..df770e2f8 --- /dev/null +++ b/ai-ml/bionemo/examples/esm2nv/data-job.yaml @@ -0,0 +1,43 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: data-download +spec: + ttlSecondsAfterFinished: 100 + template: + spec: + volumes: + - name: fsx-pv-storage + persistentVolumeClaim: + claimName: fsx-static-pvc + containers: + - name: bionemo + image: nvcr.io/nvidia/clara/bionemo-framework:1.7 + resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 1000m + memory: 2Gi + env: + - name: DATA_PATH + value: "/fsx" + command: ["/bin/sh", "-c"] + args: + - | + mkdir -p /fsx/esm2nv-train/ + unzip -o ${BIONEMO_HOME}/examples/tests/test_data/uniref202104_esm2_qc_test200_val200.zip -d /fsx/esm2nv-train/ + python examples/protein/esm2nv/pretrain.py \ + --config-path=conf \ + ++do_training=False \ + ++model.data.val_size=500 \ + ++model.data.test_size=100 \ + ++model.data.train.uf50_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uniref50_train_filt.fasta \ + 
+                ++model.data.train.uf90_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/ur90_ur50_sampler.fasta \
+                ++model.data.train.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv \
+                ++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200
+          volumeMounts:
+            - mountPath: "/fsx"
+              name: fsx-pv-storage
+      restartPolicy: Never
diff --git a/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml b/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
new file mode 100644
index 000000000..b7adc180a
--- /dev/null
+++ b/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
@@ -0,0 +1,97 @@
+apiVersion: "kubeflow.org/v1"
+kind: PyTorchJob
+metadata:
+  name: esm2nv-training
+spec:
+  elasticPolicy:
+    rdzvBackend: c10d
+    minReplicas: 1
+    maxReplicas: 16
+    maxRestarts: 100
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 80
+  nprocPerNode: "4"
+  pytorchReplicaSpecs:
+    Worker:
+      replicas: 8
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          tolerations:
+            - key: nvidia.com/gpu
+              operator: Exists
+              effect: NoSchedule
+          volumes:
+            - name: fsx-pv-storage
+              persistentVolumeClaim:
+                claimName: fsx-static-pvc
+          containers:
+            - name: pytorch
+              image: nvcr.io/nvidia/clara/bionemo-framework:1.7
+              resources:
+                limits:
+                  nvidia.com/gpu: 1
+              env:
+                - name: NCCL_DEBUG
+                  value: "INFO"
+                - name: DATA_PATH
+                  value: "/fsx"
+                - name: HYDRA_FULL_ERROR
+                  value: "1"
+              volumeMounts:
+                - mountPath: "/fsx"
+                  name: fsx-pv-storage
+              imagePullPolicy: Always
+              command:
+                - "python3"
+                - "-m"
+                # https://pytorch.org/docs/stable/elastic/run.html#transitioning-from-torch-distributed-launch-to-torchrun
+                - "torch.distributed.run"
+                # PyTorch script to run from within the bionemo-framework container
+                - "/workspace/bionemo/examples/protein/esm2nv/pretrain.py"
+                # BioNemo configuration directory containing the base config and the override configurations
+                - "--config-path=/workspace/bionemo/examples/protein/esm2nv/conf"
+                # name of the override configuration file to use; it builds on the base config
+                - "--config-name=pretrain_esm2_8M"
+                # passed through to the pretrain script; determines whether it preprocesses data (False) or trains on pre-processed data (True)
+                - "++do_training=True"
+                # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html
+                # number of nodes this PyTorch job will run on
+                - "++trainer.num_nodes=2"
+                # number of GPUs per node
+                - "++trainer.devices=4"
+                # stop training after this many steps; set to -1 to remove the limit
+                - "++trainer.max_steps=100000"
+                # accumulates gradients over k batches before stepping the optimizer. Default: 1.
+                - "++trainer.accumulate_grad_batches=1"
+                # how often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch.
+                - "++trainer.val_check_interval=1.0"
+                # double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'), 16-bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed'). Can be used on CPU, GPU, TPUs, or HPUs. Default: '32-true'.
+ - "++trainer.precision=16-mixed" + # https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#batch-size + # Configure with: model.micro_batch_size=N (per GPU batch size) + # Recommended value: use N resulting in 85-90% GPU memory utilization + # Keep model.global_batch_size=null to compute global batch size at run-time. + # Further increase the effective global batch size by using gradient accumulation (for example, trainer.accumulate_grad_batches=2). + - "++model.micro_batch_size=2" + # https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#model-parallelism + # For large models (that is > 1B parameters) use model tensor parallelism model.tensor_model_parallel_size=N + # For larger models (that is > 5B parameters) add also model pipeline parallelism model.pipeline_model_parallel_size=N + # The various parallelism options are independent and can be combined as needed. + - "++model.tensor_model_parallel_size=1" + - "++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200" + - "++model.data.uf90.uniref90_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uf90" + - "++model.data.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv" + - "++model.validation.validation_enabled=False" + - "++model.dwnstr_task_validation.enabled=False" + - "++exp_manager.create_wandb_logger=False" + - "++exp_manager.checkpoint_callback_params.always_save_nemo=False" + - "++exp_manager.exp_dir=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/esm2_pretraining" + - "++exp_manager.resume_if_exists=False" diff --git a/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml deleted file mode 100755 index ae3c41d44..000000000 --- a/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 200m - memory: 1Gi - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/bionemo/install.sh b/ai-ml/bionemo/install.sh old mode 100644 new mode 100755