From f94cebb6424ddcac0918730296b068098505aaaa Mon Sep 17 00:00:00 2001
From: John Dzialo
Date: Fri, 11 Oct 2024 16:50:13 -0400
Subject: [PATCH] feat: BioNemo EKS cluster using CloudWatch Observability Plugin (#641)

Co-authored-by: John Dzialo
---
 ai-ml/bionemo/addons.tf                       | 49 +++++++++-
 ai-ml/bionemo/cleanup.sh                      |  0
 ai-ml/bionemo/eks.tf                          | 11 +--
 ai-ml/bionemo/examples/README.md              |  9 ++
 .../esm1nv_pretrain-job.yaml                  |  8 +-
 .../{training => esm1nv}/uniref50-job.yaml    |  0
 ai-ml/bionemo/examples/esm2nv/data-job.yaml   | 43 ++++++++
 .../examples/esm2nv/esm2nv-training-job.yaml  | 97 +++++++++++++++++++
 .../aws-cloudwatch-metrics-values.yaml        | 11 ---
 ai-ml/bionemo/install.sh                      |  0
 10 files changed, 202 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 ai-ml/bionemo/cleanup.sh
 create mode 100644 ai-ml/bionemo/examples/README.md
 rename ai-ml/bionemo/examples/{training => esm1nv}/esm1nv_pretrain-job.yaml (96%)
 rename ai-ml/bionemo/examples/{training => esm1nv}/uniref50-job.yaml (100%)
 create mode 100644 ai-ml/bionemo/examples/esm2nv/data-job.yaml
 create mode 100644 ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
 delete mode 100755 ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml
 mode change 100644 => 100755 ai-ml/bionemo/install.sh

diff --git a/ai-ml/bionemo/addons.tf b/ai-ml/bionemo/addons.tf
index 74c152dc4..6b47a5ffe 100644
--- a/ai-ml/bionemo/addons.tf
+++ b/ai-ml/bionemo/addons.tf
@@ -23,14 +23,22 @@ module "eks_blueprints_addons" {
     kube-proxy = {
       preserve = true
     }
+    amazon-cloudwatch-observability = {
+      preserve                 = true
+      service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn
+    }
   }
+
   #---------------------------------------
-  # CloudWatch metrics for EKS
+  # ALB Controller
   #---------------------------------------
-  enable_aws_cloudwatch_metrics = true
-  aws_cloudwatch_metrics = {
-    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
-  }
+  enable_aws_load_balancer_controller = true
+
+  #---------------------------------------
+  # Kubernetes Metrics Server
+  #---------------------------------------
+  enable_metrics_server = true
+
   #---------------------------------------
   # Enable FSx for Lustre CSI Driver
@@ -52,3 +60,34 @@ module "eks_data_addons" {
 
   enable_nvidia_device_plugin = true
 }
+
+#---------------------------------------------------------------
+# EKS Amazon CloudWatch Observability Role
+#---------------------------------------------------------------
+resource "aws_iam_role" "cloudwatch_observability_role" {
+  name = "eks-cloudwatch-agent-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRoleWithWebIdentity"
+        Effect = "Allow"
+        Principal = {
+          Federated = module.eks.oidc_provider_arn
+        }
+        Condition = {
+          StringEquals = {
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent",
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com"
+          }
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" {
+  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+  role       = aws_iam_role.cloudwatch_observability_role.name
+}
diff --git a/ai-ml/bionemo/cleanup.sh b/ai-ml/bionemo/cleanup.sh
old mode 100644
new mode 100755
diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf
index 2f2c5d2f1..e45e5a816 100644
--- a/ai-ml/bionemo/eks.tf
+++ b/ai-ml/bionemo/eks.tf
@@ -108,13 +108,12 @@ module "eks" {
module "eks" { substr(cidr_block, 0, 4) == "100." ? subnet_id : null] ) - ami_type = "AL2_x86_64_GPU" - ami_release_version = "1.29.0-20240213" - min_size = 2 - max_size = 3 - desired_size = 2 + ami_type = "AL2_x86_64_GPU" + min_size = 2 + max_size = 3 + desired_size = 2 - instance_types = ["p3.16xlarge"] + instance_types = ["g5.12xlarge"] ebs_optimized = true block_device_mappings = { xvda = { diff --git a/ai-ml/bionemo/examples/README.md b/ai-ml/bionemo/examples/README.md new file mode 100644 index 000000000..0ed4140bd --- /dev/null +++ b/ai-ml/bionemo/examples/README.md @@ -0,0 +1,9 @@ +# BioNemo Examples + +Examples make use of the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator) for Kubernetes ML Training jobs with PyTorch. + +Install the Kubeflow Training Operator in the cluster before running these Kustomization scripts with the kubectl command line tool. + +``` +kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0" +``` diff --git a/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml b/ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml similarity index 96% rename from ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml rename to ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml index 36479b99e..7bd40e212 100644 --- a/ai-ml/bionemo/examples/training/esm1nv_pretrain-job.yaml +++ b/ai-ml/bionemo/examples/esm1nv/esm1nv_pretrain-job.yaml @@ -6,7 +6,7 @@ spec: elasticPolicy: rdzvBackend: c10d minReplicas: 1 - maxReplicas: 16 + maxReplicas: 8 maxRestarts: 100 metrics: - type: Resource @@ -15,10 +15,10 @@ spec: target: type: Utilization averageUtilization: 80 - nprocPerNode: "8" + nprocPerNode: "4" pytorchReplicaSpecs: Worker: - replicas: 16 + replicas: 8 template: metadata: annotations: @@ -62,7 +62,7 @@ spec: - "exp_manager.wandb_logger_kwargs.project=esm1nv_pretraining" - "++exp_manager.wandb_logger_kwargs.offline=False" - "trainer.num_nodes=2" - - "trainer.devices=8" + - "trainer.devices=4" - "trainer.max_steps=1000000" - "trainer.accumulate_grad_batches=1" - "trainer.val_check_interval=500" diff --git a/ai-ml/bionemo/examples/training/uniref50-job.yaml b/ai-ml/bionemo/examples/esm1nv/uniref50-job.yaml similarity index 100% rename from ai-ml/bionemo/examples/training/uniref50-job.yaml rename to ai-ml/bionemo/examples/esm1nv/uniref50-job.yaml diff --git a/ai-ml/bionemo/examples/esm2nv/data-job.yaml b/ai-ml/bionemo/examples/esm2nv/data-job.yaml new file mode 100644 index 000000000..df770e2f8 --- /dev/null +++ b/ai-ml/bionemo/examples/esm2nv/data-job.yaml @@ -0,0 +1,43 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: data-download +spec: + ttlSecondsAfterFinished: 100 + template: + spec: + volumes: + - name: fsx-pv-storage + persistentVolumeClaim: + claimName: fsx-static-pvc + containers: + - name: bionemo + image: nvcr.io/nvidia/clara/bionemo-framework:1.7 + resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 1000m + memory: 2Gi + env: + - name: DATA_PATH + value: "/fsx" + command: ["/bin/sh", "-c"] + args: + - | + mkdir -p /fsx/esm2nv-train/ + unzip -o ${BIONEMO_HOME}/examples/tests/test_data/uniref202104_esm2_qc_test200_val200.zip -d /fsx/esm2nv-train/ + python examples/protein/esm2nv/pretrain.py \ + --config-path=conf \ + ++do_training=False \ + ++model.data.val_size=500 \ + ++model.data.test_size=100 \ + ++model.data.train.uf50_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uniref50_train_filt.fasta \ + 
+                ++model.data.train.uf90_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/ur90_ur50_sampler.fasta \
+                ++model.data.train.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv \
+                ++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200
+          volumeMounts:
+            - mountPath: "/fsx"
+              name: fsx-pv-storage
+      restartPolicy: Never
diff --git a/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml b/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
new file mode 100644
index 000000000..b7adc180a
--- /dev/null
+++ b/ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
@@ -0,0 +1,97 @@
+apiVersion: "kubeflow.org/v1"
+kind: PyTorchJob
+metadata:
+  name: esm2nv-training
+spec:
+  elasticPolicy:
+    rdzvBackend: c10d
+    minReplicas: 1
+    maxReplicas: 16
+    maxRestarts: 100
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 80
+  nprocPerNode: "4"
+  pytorchReplicaSpecs:
+    Worker:
+      replicas: 8
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          tolerations:
+            - key: nvidia.com/gpu
+              operator: Exists
+              effect: NoSchedule
+          volumes:
+            - name: fsx-pv-storage
+              persistentVolumeClaim:
+                claimName: fsx-static-pvc
+          containers:
+            - name: pytorch
+              image: nvcr.io/nvidia/clara/bionemo-framework:1.7
+              resources:
+                limits:
+                  nvidia.com/gpu: 1
+              env:
+                - name: NCCL_DEBUG
+                  value: "INFO"
+                - name: DATA_PATH
+                  value: "/fsx"
+                - name: HYDRA_FULL_ERROR
+                  value: "1"
+              volumeMounts:
+                - mountPath: "/fsx"
+                  name: fsx-pv-storage
+              imagePullPolicy: Always
+              command:
+                - "python3"
+                - "-m"
+                # https://pytorch.org/docs/stable/elastic/run.html#transitioning-from-torch-distributed-launch-to-torchrun
+                - "torch.distributed.run"
+                # PyTorch script to run from within the bionemo-framework container
+                - "/workspace/bionemo/examples/protein/esm2nv/pretrain.py"
+                # BioNemo configuration directory containing the base config and the override configurations
+                - "--config-path=/workspace/bionemo/examples/protein/esm2nv/conf"
+                # name of the override configuration file to use; it builds on the base config
+                - "--config-name=pretrain_esm2_8M"
+                # passed through to the pretrain script; determines whether it preprocesses data (False) or trains on pre-processed data (True)
+                - "++do_training=True"
+                # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html
+                # number of nodes this PyTorch job will run on
+                - "++trainer.num_nodes=2"
+                # number of GPUs per node
+                - "++trainer.devices=4"
+                # stop training after this many steps; set to -1 to remove the limit
+                - "++trainer.max_steps=100000"
+                # accumulates gradients over k batches before stepping the optimizer. Default: 1.
+                - "++trainer.accumulate_grad_batches=1"
+                # how often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch.
+                - "++trainer.val_check_interval=1.0"
+                # double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'), 16-bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed'). Can be used on CPU, GPU, TPUs, or HPUs. Default: '32-true'.
+ - "++trainer.precision=16-mixed" + # https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#batch-size + # Configure with: model.micro_batch_size=N (per GPU batch size) + # Recommended value: use N resulting in 85-90% GPU memory utilization + # Keep model.global_batch_size=null to compute global batch size at run-time. + # Further increase the effective global batch size by using gradient accumulation (for example, trainer.accumulate_grad_batches=2). + - "++model.micro_batch_size=2" + # https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#model-parallelism + # For large models (that is > 1B parameters) use model tensor parallelism model.tensor_model_parallel_size=N + # For larger models (that is > 5B parameters) add also model pipeline parallelism model.pipeline_model_parallel_size=N + # The various parallelism options are independent and can be combined as needed. + - "++model.tensor_model_parallel_size=1" + - "++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200" + - "++model.data.uf90.uniref90_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uf90" + - "++model.data.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv" + - "++model.validation.validation_enabled=False" + - "++model.dwnstr_task_validation.enabled=False" + - "++exp_manager.create_wandb_logger=False" + - "++exp_manager.checkpoint_callback_params.always_save_nemo=False" + - "++exp_manager.exp_dir=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/esm2_pretraining" + - "++exp_manager.resume_if_exists=False" diff --git a/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml deleted file mode 100755 index ae3c41d44..000000000 --- a/ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 200m - memory: 1Gi - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/bionemo/install.sh b/ai-ml/bionemo/install.sh old mode 100644 new mode 100755