-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updates to Observability scripts include: add idempotency, improve logging, use pre-built ECR image for EFA-Node-Exporter, update dcgm container version to latest from nvidia. tested on g5.48xl and p5.48xl (#443)
- Loading branch information
Showing 3 changed files with 73 additions and 56 deletions.
There are no files selected for viewing
60 changes: 34 additions & 26 deletions
60
1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,40 @@ | ||
#!/bin/bash | ||
|
||
if nvidia-smi; then | ||
echo "NVIDIA GPU found. Proceeding with script..." | ||
# Define the container name | ||
CONTAINER_NAME="dcgm-exporter" | ||
|
||
# Get the instance-type from EC2 instance metadata | ||
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") | ||
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type) | ||
# Check if the container exists and is running | ||
if docker ps --filter "name=$CONTAINER_NAME" --filter "status=running" | grep -q "$CONTAINER_NAME"; then | ||
echo "Container $CONTAINER_NAME is already running." | ||
else | ||
echo "Container $CONTAINER_NAME is not running or does not exist..." | ||
echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..." | ||
docker rm -f $CONTAINER_NAME && echo "Container $CONTAINER_NAME has been removed." | ||
|
||
# Set DCGM-Exporter-Version, for g5s, use older version (https://github.com/NVIDIA/dcgm-exporter/issues/319) | ||
if [[ $INSTANCE_TYPE == *"g5"* ]]; then | ||
echo "Instance Type is recognized as $INSTANCE_TYPE setting DCGM_EXPORTER_VERSION to 2.1.4-2.3.1-ubuntu20.04" | ||
DCGM_EXPORTER_VERSION=2.1.4-2.3.1-ubuntu20.04 | ||
else | ||
echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to 3.3.5-3.4.0-ubuntu22.04" | ||
DCGM_EXPORTER_VERSION=3.3.5-3.4.1-ubuntu22.04 | ||
fi | ||
echo "DCGM_EXPORTER_VERSION = $DCGM_EXPORTER_VERSION" | ||
# Check for GPU, then proceed with script | ||
if nvidia-smi > /dev/null 2>&1; then | ||
echo "NVIDIA GPU found. Proceeding with script..." | ||
|
||
# Run the DCGM Exporter Docker container | ||
sudo docker run -d --restart always \ | ||
--gpus all \ | ||
--net host \ | ||
--cap-add SYS_ADMIN \ | ||
nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \ | ||
-f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; } | ||
# Get the instance-type from EC2 instance metadata | ||
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") | ||
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type) | ||
|
||
DCGM_EXPORTER_VERSION=3.3.8-3.6.0-ubuntu22.04 | ||
|
||
echo "Running DCGM exporter in a Docker container on port 9400..." | ||
else | ||
echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..." | ||
exit 0 | ||
fi | ||
echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION" | ||
|
||
# Run the DCGM Exporter Docker container | ||
sudo docker run -d --restart always \ | ||
--name $CONTAINER_NAME \ | ||
--gpus all \ | ||
--net host \ | ||
--cap-add SYS_ADMIN \ | ||
nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \ | ||
-f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; } | ||
|
||
echo "Running DCGM exporter in a Docker container on port 9400..." | ||
else | ||
echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..." | ||
exit 0 | ||
fi | ||
fi |
42 changes: 19 additions & 23 deletions
42
1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,23 @@ | ||
#!/bin/bash | ||
|
||
# Define variables | ||
REPO_DIR="awsome-distributed-training" | ||
REPO_URL="https://github.com/aws-samples/awsome-distributed-training.git" | ||
# Define the container name | ||
CONTAINER_NAME="efa-node-exporter" | ||
|
||
# Check if the repository directory exists | ||
if [ -d "$REPO_DIR" ]; then | ||
echo "Repository already exists, skipping cloning." | ||
# Check if the container exists and is running | ||
if docker ps --filter "name=$CONTAINER_NAME" --filter "status=running" | grep -q "$CONTAINER_NAME"; then | ||
echo "Container $CONTAINER_NAME is already running." | ||
else | ||
# Clone the repository | ||
git clone --depth=1 "$REPO_URL" || { echo "Failed to clone the repository"; exit 1; } | ||
fi | ||
|
||
# Change directory to the desired location | ||
cd "$REPO_DIR/4.validation_and_observability/3.efa-node-exporter" || { echo "Failed to change directory"; exit 1; } | ||
|
||
# Build the Docker image explicitly | ||
sudo docker build -t node_exporter_efa:latest . || { echo "Failed to build Docker image"; exit 1; } | ||
|
||
# Run the Docker container with appropriate configurations | ||
sudo docker run -d --restart always \ | ||
--net="host" \ | ||
--pid="host" \ | ||
-v "/:/host:ro,rslave" \ | ||
node_exporter_efa:latest \ | ||
--path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; } | ||
echo "Container $CONTAINER_NAME is not running or does not exist..." | ||
echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..." | ||
docker rm -f $CONTAINER_NAME && echo "Container $CONTAINER_NAME has been removed." | ||
echo "Proceeding with script..." | ||
|
||
# Run the Docker container with appropriate configurations | ||
sudo docker run -d --restart always \ | ||
--name=$CONTAINER_NAME \ | ||
--net="host" \ | ||
--pid="host" \ | ||
-v "/:/host:ro,rslave" \ | ||
public.ecr.aws/hpc-cloud/efa-node-exporter:latest \ | ||
--path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; } | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters