diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
index 62a5ec7a..d50d8237 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
@@ -1,32 +1,47 @@
 #!/bin/bash
 
-if nvidia-smi; then
-    echo "NVIDIA GPU found. Proceeding with script..."
+# Define the container name
+CONTAINER_NAME="dcgm-exporter"
 
-    # Get the instance-type from EC2 instance metadata
-    TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
-    INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type)
+# Status Docker reports for this exact container name; empty if it does not exist.
+# ("docker ps --filter name=..." matches substrings, so it is not used here.)
+CONTAINER_STATUS=$(sudo docker container inspect --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null)
+
+# Check if the container exists and is running
+if [[ "$CONTAINER_STATUS" == "running" ]]; then
+    echo "Container $CONTAINER_NAME is already running."
+else
+    echo "Container $CONTAINER_NAME is not running or does not exist..."
+    echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
+    # Remove the stale container so that "docker run --name" can reuse its name.
+    if [[ -n "$CONTAINER_STATUS" ]]; then
+        sudo docker rm -f "$CONTAINER_NAME" && echo "Container $CONTAINER_NAME has been removed."
+    fi
 
-    # Set DCGM-Exporter-Version, for g5s, use older version (https://github.com/NVIDIA/dcgm-exporter/issues/319)
-    if [[ $INSTANCE_TYPE == *"g5"* ]]; then
-        echo "Instance Type is recognized as $INSTANCE_TYPE setting DCGM_EXPORTER_VERSION to 2.1.4-2.3.1-ubuntu20.04"
-        DCGM_EXPORTER_VERSION=2.1.4-2.3.1-ubuntu20.04
-    else
-        echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to 3.3.5-3.4.0-ubuntu22.04"
-        DCGM_EXPORTER_VERSION=3.3.5-3.4.1-ubuntu22.04
-    fi
-    echo "DCGM_EXPORTER_VERSION = $DCGM_EXPORTER_VERSION"
+    # Check for GPU, then proceed with script
+    if nvidia-smi > /dev/null 2>&1; then
+        echo "NVIDIA GPU found. Proceeding with script..."
 
-    # Run the DCGM Exporter Docker container
-    sudo docker run -d --restart always \
-    --gpus all \
-    --net host \
-    --cap-add SYS_ADMIN \
-    nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \
-    -f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
+        # Get the instance-type from EC2 instance metadata (only used for the log line below)
+        TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)
+
+        DCGM_EXPORTER_VERSION=3.3.8-3.6.0-ubuntu22.04
 
-    echo "Running DCGM exporter in a Docker container on port 9400..."
-else
-    echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
-    exit 0
-fi
+        echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
+
+        # Run the DCGM Exporter Docker container
+        sudo docker run -d --restart always \
+            --name "$CONTAINER_NAME" \
+            --gpus all \
+            --net host \
+            --cap-add SYS_ADMIN \
+            "nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION}" \
+            -f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
+
+        echo "Running DCGM exporter in a Docker container on port 9400..."
+    else
+        echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
+        exit 0
+    fi
+fi
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
index cca52d19..a76c6f27 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
@@ -1,27 +1,36 @@
 #!/bin/bash
 
-# Define variables
-REPO_DIR="awsome-distributed-training"
-REPO_URL="https://github.com/aws-samples/awsome-distributed-training.git"
+# Define the container name
+CONTAINER_NAME="efa-node-exporter"
+
+# Status Docker reports for this exact container name; empty if it does not exist.
+# ("docker ps --filter name=..." matches substrings, so it is not used here.)
+CONTAINER_STATUS=$(sudo docker container inspect --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null)
 
-# Check if the repository directory exists
-if [ -d "$REPO_DIR" ]; then
-    echo "Repository already exists, skipping cloning."
+# Check if the container exists and is running
+if [[ "$CONTAINER_STATUS" == "running" ]]; then
+    echo "Container $CONTAINER_NAME is already running."
 else
-    # Clone the repository
-    git clone --depth=1 "$REPO_URL" || { echo "Failed to clone the repository"; exit 1; }
-fi
-
-# Change directory to the desired location
-cd "$REPO_DIR/4.validation_and_observability/3.efa-node-exporter" || { echo "Failed to change directory"; exit 1; }
-
-# Build the Docker image explicitly
-sudo docker build -t node_exporter_efa:latest . || { echo "Failed to build Docker image"; exit 1; }
-
-# Run the Docker container with appropriate configurations
-sudo docker run -d --restart always \
-    --net="host" \
-    --pid="host" \
-    -v "/:/host:ro,rslave" \
-    node_exporter_efa:latest \
-    --path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
+    echo "Container $CONTAINER_NAME is not running or does not exist..."
+    echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
+    # Remove the stale container so that "docker run --name" can reuse its name.
+    if [[ -n "$CONTAINER_STATUS" ]]; then
+        sudo docker rm -f "$CONTAINER_NAME" && echo "Container $CONTAINER_NAME has been removed."
+    fi
+    echo "Proceeding with script..."
+
+    # Run the Docker container with appropriate configurations
+    if sudo docker run -d --restart always \
+        --name="$CONTAINER_NAME" \
+        --net="host" \
+        --pid="host" \
+        -v "/:/host:ro,rslave" \
+        public.ecr.aws/hpc-cloud/efa-node-exporter:latest \
+        --path.rootfs=/host; then
+        echo "Successfully started EFA Node Exporter on node"
+        exit 0
+    else
+        echo "Failed to run Docker container"
+        exit 1
+    fi
+fi
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh
index 1ce225ee..30d9c45a 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh
@@ -3,11 +3,37 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: MIT-0
 
+# Define the container name
+CONTAINER_NAME="headnode-exporter"
+
+# Status Docker reports for this exact container name; empty if it does not exist.
+# ("docker ps --filter name=..." matches substrings, so it is not used here.)
+CONTAINER_STATUS=$(sudo docker container inspect --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null)
 
-# Run the Docker container with appropriate configurations
-sudo docker run -d --restart always \
-    --net="host" \
-    --pid="host" \
-    -v "/:/host:ro,rslave" \
-    public.ecr.aws/bitnami/node-exporter:latest \
-    --path.rootfs=/host && { echo "Successfully started Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
+# Check if the container exists and is running
+if [[ "$CONTAINER_STATUS" == "running" ]]; then
+    echo "Container $CONTAINER_NAME is already running."
+else
+    echo "Container $CONTAINER_NAME is not running or does not exist..."
+    echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
+    # Remove the stale container so that "docker run --name" can reuse its name.
+    if [[ -n "$CONTAINER_STATUS" ]]; then
+        sudo docker rm -f "$CONTAINER_NAME" && echo "Container $CONTAINER_NAME has been removed."
+    fi
+    echo "Proceeding with script..."
+
+    # Run the Docker container with appropriate configurations
+    if sudo docker run -d --restart always \
+        --name="$CONTAINER_NAME" \
+        --net="host" \
+        --pid="host" \
+        -v "/:/host:ro,rslave" \
+        public.ecr.aws/bitnami/node-exporter:latest \
+        --path.rootfs=/host; then
+        echo "Successfully started Node Exporter on node"
+        exit 0
+    else
+        echo "Failed to run Docker container"
+        exit 1
+    fi
+fi