Skip to content

Commit

Permalink
feat: BioNemo EKS cluster Using Cloudwatch Observability Plugin (awslabs#641)

Browse files Browse the repository at this point in the history

Co-authored-by: John Dzialo <[email protected]>
  • Loading branch information
JohnDzialo and John Dzialo authored Oct 11, 2024
1 parent 2a8e5e6 commit f94cebb
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 26 deletions.
49 changes: 44 additions & 5 deletions ai-ml/bionemo/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,22 @@ module "eks_blueprints_addons" {
kube-proxy = {
preserve = true
}
amazon-cloudwatch-observability = {
preserve = true
service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn
}
}

#---------------------------------------
# CloudWatch metrics for EKS
# ALB Controller
#---------------------------------------
enable_aws_cloudwatch_metrics = true
aws_cloudwatch_metrics = {
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
}
enable_aws_load_balancer_controller = true

#---------------------------------------
# Kubernetes Metrics Server
#---------------------------------------
enable_metrics_server = true


#---------------------------------------
# Enable FSx for Lustre CSI Driver
Expand All @@ -52,3 +60,34 @@ module "eks_data_addons" {
enable_nvidia_device_plugin = true

}

#---------------------------------------------------------------
# EKS Amazon CloudWatch Observability Role
#---------------------------------------------------------------
resource "aws_iam_role" "cloudwatch_observability_role" {
  # IRSA role assumed by the CloudWatch agent service account that the
  # amazon-cloudwatch-observability EKS add-on installs.
  name = "eks-cloudwatch-agent-role"

  # Trust policy: only the "cloudwatch-agent" service account in the
  # "amazon-cloudwatch" namespace of this cluster's OIDC provider may assume
  # the role, and only for the STS audience.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = "sts:AssumeRoleWithWebIdentity"
        Principal = {
          Federated = module.eks.oidc_provider_arn
        }
        Condition = {
          StringEquals = {
            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" = "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent"
            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" = "sts.amazonaws.com"
          }
        }
      }
    ]
  })
}

# Attach the AWS-managed CloudWatchAgentServerPolicy so the agent can publish
# metrics and logs from the cluster nodes.
resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" {
policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
role = aws_iam_role.cloudwatch_observability_role.name
}
Empty file modified ai-ml/bionemo/cleanup.sh
100644 → 100755
Empty file.
11 changes: 5 additions & 6 deletions ai-ml/bionemo/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,12 @@ module "eks" {
substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
)

ami_type = "AL2_x86_64_GPU"
ami_release_version = "1.29.0-20240213"
min_size = 2
max_size = 3
desired_size = 2
ami_type = "AL2_x86_64_GPU"
min_size = 2
max_size = 3
desired_size = 2

instance_types = ["p3.16xlarge"]
instance_types = ["g5.12xlarge"]
ebs_optimized = true
block_device_mappings = {
xvda = {
Expand Down
9 changes: 9 additions & 0 deletions ai-ml/bionemo/examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# BioNemo Examples

Examples make use of the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator) for Kubernetes ML Training jobs with PyTorch.

Install the Kubeflow Training Operator in the cluster before running these Kustomization scripts with the kubectl command line tool.

```bash
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0"
```
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 16
maxReplicas: 8
maxRestarts: 100
metrics:
- type: Resource
Expand All @@ -15,10 +15,10 @@ spec:
target:
type: Utilization
averageUtilization: 80
nprocPerNode: "8"
nprocPerNode: "4"
pytorchReplicaSpecs:
Worker:
replicas: 16
replicas: 8
template:
metadata:
annotations:
Expand Down Expand Up @@ -62,7 +62,7 @@ spec:
- "exp_manager.wandb_logger_kwargs.project=esm1nv_pretraining"
- "++exp_manager.wandb_logger_kwargs.offline=False"
- "trainer.num_nodes=2"
- "trainer.devices=8"
- "trainer.devices=4"
- "trainer.max_steps=1000000"
- "trainer.accumulate_grad_batches=1"
- "trainer.val_check_interval=500"
Expand Down
43 changes: 43 additions & 0 deletions ai-ml/bionemo/examples/esm2nv/data-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Kubernetes Job that downloads and pre-processes the UniRef test dataset for
# the ESM2nv pre-training example. Output is written to the shared FSx for
# Lustre volume mounted at /fsx so the training job can consume it.
apiVersion: batch/v1
kind: Job
metadata:
  name: data-download
spec:
  # Garbage-collect the completed Job object after 100 seconds.
  ttlSecondsAfterFinished: 100
  template:
    spec:
      volumes:
        # Shared FSx for Lustre storage, bound through a pre-created static PVC.
        - name: fsx-pv-storage
          persistentVolumeClaim:
            claimName: fsx-static-pvc
      containers:
        - name: bionemo
          image: nvcr.io/nvidia/clara/bionemo-framework:1.7
          resources:
            limits:
              cpu: 2000m
              memory: 4Gi
            requests:
              cpu: 1000m
              memory: 2Gi
          env:
            - name: DATA_PATH
              value: "/fsx"
          command: ["/bin/sh", "-c"]
          args:
            # Unzip the bundled test dataset onto /fsx, then run pretrain.py
            # with do_training=False so it only prepares the data.
            - |
              mkdir -p /fsx/esm2nv-train/
              unzip -o ${BIONEMO_HOME}/examples/tests/test_data/uniref202104_esm2_qc_test200_val200.zip -d /fsx/esm2nv-train/
              python examples/protein/esm2nv/pretrain.py \
                --config-path=conf \
                ++do_training=False \
                ++model.data.val_size=500 \
                ++model.data.test_size=100 \
                ++model.data.train.uf50_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uniref50_train_filt.fasta \
                ++model.data.train.uf90_datapath=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/ur90_ur50_sampler.fasta \
                ++model.data.train.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv \
                ++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200
          volumeMounts:
            - mountPath: "/fsx"
              name: fsx-pv-storage
      restartPolicy: Never
97 changes: 97 additions & 0 deletions ai-ml/bionemo/examples/esm2nv/esm2nv-training-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: esm2nv-training
spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 16
maxRestarts: 100
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 80
nprocPerNode: "4"
pytorchReplicaSpecs:
Worker:
replicas: 8
template:
metadata:
annotations:
sidecar.istio.io/inject: "false"
spec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
volumes:
- name: fsx-pv-storage
persistentVolumeClaim:
claimName: fsx-static-pvc
containers:
- name: pytorch
image: nvcr.io/nvidia/clara/bionemo-framework:1.7
resources:
limits:
nvidia.com/gpu: 1
env:
- name: NCCL_DEBUG
value: "INFO"
- name: DATA_PATH
value: "/fsx"
- name: HYDRA_FULL_ERROR
value: "1"
volumeMounts:
- mountPath: "/fsx"
name: fsx-pv-storage
imagePullPolicy: Always
command:
- "python3"
- "-m"
# https://pytorch.org/docs/stable/elastic/run.html#transitioning-from-torch-distributed-launch-to-torchrun
- "torch.distributed.run"
# pytorch script to run from within the bionemo-framework container
- "/workspace/bionemo/examples/protein/esm2nv/pretrain.py"
# bionemo configuration directory including the base-config and the override configuration
- "--config-path=/workspace/bionemo/examples/protein/esm2nv/conf"
# name of the override configuration file to use when running bionemo, this will use base-config
- "--config-name=pretrain_esm2_8M"
# a passed variable to the pytorch script, this determines whether the script will parse data or run active training on pre-parsed data
- "++do_training=True"
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html
# number of nodes this pytorch job will run on
- "++trainer.num_nodes=2"
# number of GPU cores per node
- "++trainer.devices=4"
# stop training after this many steps, can be removed by setting to -1
- "++trainer.max_steps=100000"
# accumulates gradients over k batches before stepping the optimizer. Default: 1.
- "++trainer.accumulate_grad_batches=1"
# how often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch
- "++trainer.val_check_interval=1.0"
# double precision (64, ‘64’ or ‘64-true’), full precision (32, ‘32’ or ‘32-true’), 16bit mixed precision (16, ‘16’, ‘16-mixed’) or bfloat16 mixed precision (‘bf16’, ‘bf16-mixed’). Can be used on CPU, GPU, TPUs, or HPUs. Default: '32-true'.
- "++trainer.precision=16-mixed"
# https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#batch-size
# Configure with: model.micro_batch_size=N (per GPU batch size)
# Recommended value: use N resulting in 85-90% GPU memory utilization
# Keep model.global_batch_size=null to compute global batch size at run-time.
# Further increase the effective global batch size by using gradient accumulation (for example, trainer.accumulate_grad_batches=2).
- "++model.micro_batch_size=2"
# https://docs.nvidia.com/bionemo-framework/0.4.0/hyperparameters-fw.html#model-parallelism
# For large models (that is > 1B parameters) use model tensor parallelism model.tensor_model_parallel_size=N
# For larger models (that is > 5B parameters) add also model pipeline parallelism model.pipeline_model_parallel_size=N
# The various parallelism options are independent and can be combined as needed.
- "++model.tensor_model_parallel_size=1"
- "++model.data.dataset_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200"
- "++model.data.uf90.uniref90_path=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/uf90"
- "++model.data.cluster_mapping_tsv=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/mapping.tsv"
- "++model.validation.validation_enabled=False"
- "++model.dwnstr_task_validation.enabled=False"
- "++exp_manager.create_wandb_logger=False"
- "++exp_manager.checkpoint_callback_params.always_save_nemo=False"
- "++exp_manager.exp_dir=/fsx/esm2nv-train/uniref202104_esm2_qc_test200_val200/esm2_pretraining"
- "++exp_manager.resume_if_exists=False"
11 changes: 0 additions & 11 deletions ai-ml/bionemo/helm-values/aws-cloudwatch-metrics-values.yaml

This file was deleted.

Empty file modified ai-ml/bionemo/install.sh
100644 → 100755
Empty file.

0 comments on commit f94cebb

Please sign in to comment.