Skip to content

Commit

Permalink
Updates to Observability scripts include: add idempotency, improve lo…
Browse files Browse the repository at this point in the history
…gging, use pre-built ECR image for EFA-Node-Exporter, update dcgm container version to latest from nvidia. tested on g5.48xl and p5.48xl (#443)
  • Loading branch information
nghtm authored Sep 26, 2024
1 parent 18c6cd1 commit ee14c53
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 56 deletions.
Original file line number Diff line number Diff line change
@@ -1,32 +1,40 @@
#!/bin/bash

if nvidia-smi; then
echo "NVIDIA GPU found. Proceeding with script..."
# Define the container name
CONTAINER_NAME="dcgm-exporter"

# Get the instance-type from EC2 instance metadata
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type)
# Check if the container exists and is running
if docker ps --filter "name=$CONTAINER_NAME" --filter "status=running" | grep -q "$CONTAINER_NAME"; then
echo "Container $CONTAINER_NAME is already running."
else
echo "Container $CONTAINER_NAME is not running or does not exist..."
echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
docker rm -f $CONTAINER_NAME && echo "Container $CONTAINER_NAME has been removed."

# Set DCGM-Exporter-Version, for g5s, use older version (https://github.com/NVIDIA/dcgm-exporter/issues/319)
# The tag is assigned first and the log line prints the variable itself, so the
# message can never disagree with the value that is actually used.
if [[ "${INSTANCE_TYPE:-}" == *"g5"* ]]; then
  DCGM_EXPORTER_VERSION=2.1.4-2.3.1-ubuntu20.04
else
  DCGM_EXPORTER_VERSION=3.3.5-3.4.1-ubuntu22.04
fi
echo "Instance Type is recognized as ${INSTANCE_TYPE:-}, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
echo "DCGM_EXPORTER_VERSION = $DCGM_EXPORTER_VERSION"
# Check for GPU, then proceed with script
if nvidia-smi > /dev/null 2>&1; then
echo "NVIDIA GPU found. Proceeding with script..."

# Run the DCGM Exporter Docker container
sudo docker run -d --restart always \
--gpus all \
--net host \
--cap-add SYS_ADMIN \
nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \
-f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
# Get the instance-type from EC2 instance metadata (a session token is
# requested first and then presented on the metadata request).
#   -s                keeps curl's progress meter out of the logs
#   -f                makes an HTTP error yield empty output instead of an error body
#   --connect-timeout / --max-time
#                     stop the script from hanging if the endpoint is unreachable
# A failed lookup leaves the variable empty; it does not abort the script.
TOKEN=$(curl -sf --connect-timeout 2 --max-time 5 -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_TYPE=$(curl -sf --connect-timeout 2 --max-time 5 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)

DCGM_EXPORTER_VERSION=3.3.8-3.6.0-ubuntu22.04

echo "Running DCGM exporter in a Docker container on port 9400..."
else
echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
exit 0
fi
echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"

# Run the DCGM Exporter Docker container
sudo docker run -d --restart always \
--name $CONTAINER_NAME \
--gpus all \
--net host \
--cap-add SYS_ADMIN \
nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \
-f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }

echo "Running DCGM exporter in a Docker container on port 9400..."
else
echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
exit 0
fi
fi
Original file line number Diff line number Diff line change
@@ -1,27 +1,23 @@
#!/bin/bash

# Define variables
REPO_DIR="awsome-distributed-training"
REPO_URL="https://github.com/aws-samples/awsome-distributed-training.git"
# Define the container name
CONTAINER_NAME="efa-node-exporter"

# Check if the repository directory exists
if [ -d "$REPO_DIR" ]; then
echo "Repository already exists, skipping cloning."
# Check if the container exists and is running
if docker ps --filter "name=$CONTAINER_NAME" --filter "status=running" | grep -q "$CONTAINER_NAME"; then
echo "Container $CONTAINER_NAME is already running."
else
# Clone the repository
git clone --depth=1 "$REPO_URL" || { echo "Failed to clone the repository"; exit 1; }
fi

# Change directory to the desired location
cd "$REPO_DIR/4.validation_and_observability/3.efa-node-exporter" || { echo "Failed to change directory"; exit 1; }

# Build the Docker image explicitly
sudo docker build -t node_exporter_efa:latest . || { echo "Failed to build Docker image"; exit 1; }

# Run the Docker container with appropriate configurations
sudo docker run -d --restart always \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
node_exporter_efa:latest \
--path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
echo "Container $CONTAINER_NAME is not running or does not exist..."
echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
docker rm -f $CONTAINER_NAME && echo "Container $CONTAINER_NAME has been removed."
echo "Proceeding with script..."

# Run the Docker container with appropriate configurations
sudo docker run -d --restart always \
--name=$CONTAINER_NAME \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
public.ecr.aws/hpc-cloud/efa-node-exporter:latest \
--path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
fi
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,24 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Define the container name
CONTAINER_NAME="headnode-exporter"

# Run the Docker container with appropriate configurations
sudo docker run -d --restart always \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
public.ecr.aws/bitnami/node-exporter:latest \
--path.rootfs=/host && { echo "Successfully started Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
# Check if the container exists and is running, and start it only when needed,
# so the script can be re-run safely.
#
# `docker container inspect` looks the container up by its exact name: the exit
# status tells whether it exists, the output is its state. A
# `docker ps --filter name=...` | grep check matches substrings instead, so a
# container such as "<name>-old" would be mistaken for this one.
# Every docker call goes through sudo, like the `docker run` below, so the
# check and the launch always reach the daemon with the same privileges.
if container_status=$(sudo docker container inspect \
  --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null); then
  container_exists=true
else
  container_exists=false
fi

if [[ "$container_exists" == true && "$container_status" == "running" ]]; then
  echo "Container $CONTAINER_NAME is already running."
else
  echo "Container $CONTAINER_NAME is not running or does not exist..."
  echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
  if [[ "$container_exists" == true ]]; then
    # A stopped container still owns the name; `docker run --name` would fail
    # with a name conflict unless it is removed first.
    sudo docker rm -f "$CONTAINER_NAME" && echo "Container $CONTAINER_NAME has been removed."
  fi
  echo "Proceeding with script..."

  # Run the Docker container with appropriate configurations
  if sudo docker run -d --restart always \
    --name="$CONTAINER_NAME" \
    --net="host" \
    --pid="host" \
    -v "/:/host:ro,rslave" \
    public.ecr.aws/bitnami/node-exporter:latest \
    --path.rootfs=/host; then
    echo "Successfully started Node Exporter on node"
    exit 0
  else
    echo "Failed to run Docker container"
    exit 1
  fi
fi

0 comments on commit ee14c53

Please sign in to comment.