From 5a331d99be9d9fb52a320f5351c4b60fc8561404 Mon Sep 17 00:00:00 2001
From: Verdi March
Date: Thu, 14 Mar 2024 21:33:12 +0800
Subject: [PATCH] Bump dcgm exporter version to correctly capture GPU utilization

---
Reviewer note: this patch was recovered from a copy in which all line
breaks and runs of whitespace had been collapsed into single spaces.
The line structure, the two blank context lines and the indentation in
the hunk below are reconstructed (the blank lines are needed for the
hunk to span the 18 lines its header declares). Confirm them against
the target file before applying; `git apply --ignore-whitespace` can
absorb indentation differences but not misplaced blank lines.

What the change does: the image tag that used to be assembled as
"${DCGM_EXPORTER_VERSION}-ubuntu20.04" is now carried entirely by
DCGM_EXPORTER_VERSION (3.3.5-3.4.0-ubuntu22.04), and the script gains
a trailing newline.

 .../base-config/utils/install_dcgm_exporter.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
index f3a842fa..e7c546b2 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
@@ -4,18 +4,18 @@
 if nvidia-smi; then
     echo "NVIDIA GPU found. Proceeding with script..."
     # Set DCGM Exporter version
-    DCGM_EXPORTER_VERSION=2.1.4-2.3.1
+    DCGM_EXPORTER_VERSION=3.3.5-3.4.0-ubuntu22.04
 
     # Run the DCGM Exporter Docker container
     sudo docker run -d --rm \
        --gpus all \
        --net host \
        --cap-add SYS_ADMIN \
-       nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION}-ubuntu20.04 \
+       nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \
        -f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
 
     echo "Running DCGM exporter in a Docker container on port 9400..."
 else
     echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is controller node, you can safelly ignore this warning. Exiting gracefully..."
     exit 0
-fi
\ No newline at end of file
+fi