
Commit

Use aznhc for node health checks (#1879)
* use aznhc 0.2.9

* remove NCv3

* run aznhc every hour on idle nodes

* set jetpack bin path

* disable HealthCheck on IDLE nodes
xpillons authored Mar 8, 2024
1 parent e41cfd4 commit de85928
Showing 6 changed files with 66 additions and 56 deletions.
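Taken together, the changed files drop the locally assembled /etc/nhc/nhc.conf and call the aznhc runner directly. The sketch below condenses the new flow from the diffs that follow; it is illustrative only, and the nhc_common.conf path is a stand-in for the file the scripts resolve relative to their own location.

#!/bin/bash
# Condensed sketch of the new health-check flow (assembled from the diffs below, not a drop-in script).
JETPACK=/opt/cycle/jetpack/bin/jetpack

# The NHC helper directory is OS dependent, hence moving OFFLINE_NODE/ONLINE_NODE out of nhc_common.conf.
. /etc/os-release
case $ID in
    ubuntu) LIBEXEDIR=/usr/lib;;
    *)      LIBEXEDIR=/usr/libexec;;
esac
export OFFLINE_NODE=$LIBEXEDIR/nhc/azhop-node-offline.sh
export ONLINE_NODE=$LIBEXEDIR/nhc/node-mark-online

# Run aznhc v0.2.9 against the shared azhop config; on failure, mark the node offline and shut it down.
NHC_COMMON_FILE=/path/to/nhc_common.conf   # stand-in path; the real scripts use $SCRIPT_DIR/../files/nhc/nhc_common.conf
errormessage=$(/opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -e $NHC_COMMON_FILE 2>&1)
if [ $? -eq 1 ]; then
    $OFFLINE_NODE $(hostname) "$errormessage"
    $JETPACK shutdown --unhealthy
fi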
6 changes: 0 additions & 6 deletions .github/workflows/configs/almalinux.yml
@@ -51,12 +51,6 @@ queues:
     ColocateNodes: true
     EnableAcceleratedNetworking: true
 
-  - name: nc24v3
-    vm_size: Standard_NC24rs_v3
-    max_count: 4
-    image: azhpc:azhop-compute:almalinux-8_7:latest
-    ColocateNodes: true
-
   # Remote Viz Queues
   - name: viz3d
     type: remoteviz
6 changes: 0 additions & 6 deletions .github/workflows/configs/centos.yml
@@ -57,12 +57,6 @@ queues:
     ColocateNodes: true
     EnableAcceleratedNetworking: true
 
-  - name: nc24v3
-    vm_size: Standard_NC24rs_v3
-    max_count: 4
-    image: azhpc:azhop-compute:centos-7_9:latest
-    ColocateNodes: true
-
   # Remote Viz Queues
   - name: viz3d
     type: remoteviz
@@ -10,8 +10,9 @@
 ### NHC Configuration Variables
 ###
 # * || export MARK_OFFLINE=1 NHC_CHECK_ALL=0
-* || export OFFLINE_NODE=/usr/libexec/nhc/azhop-node-offline.sh
-* || export ONLINE_NODE=/usr/libexec/nhc/node-mark-online
+# moved the 2 variables below in the script calling NHC as it's OS dependent
+# * || export OFFLINE_NODE=/usr/libexec/nhc/azhop-node-offline.sh
+# * || export ONLINE_NODE=/usr/libexec/nhc/node-mark-online
 * || export TIMEOUT=300
 * || export VERBOSE=1
 * || export DETACHED_MODE=0
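For context, NHC treats OFFLINE_NODE and ONLINE_NODE as hook scripts it invokes when a node fails or recovers a check, so the paths must match where the helpers actually live: /usr/lib/nhc on Ubuntu versus /usr/libexec/nhc elsewhere, which is why the exports move into the calling script. A minimal sketch of the export pattern, using the paths from this commit:

# Minimal sketch: export the OS-dependent NHC hook paths before running the checks.
. /etc/os-release
if [ "$ID" = "ubuntu" ]; then LIBEXEDIR=/usr/lib; else LIBEXEDIR=/usr/libexec; fi
export OFFLINE_NODE=$LIBEXEDIR/nhc/azhop-node-offline.sh   # invoked as: $OFFLINE_NODE <hostname> "<error message>"
export ONLINE_NODE=$LIBEXEDIR/nhc/node-mark-online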
@@ -1,63 +1,72 @@
 #!/bin/bash
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+JETPACK=/opt/cycle/jetpack/bin/jetpack
 
 # Don't configure NHC if not enabled
-enabled_nhc=$(jetpack config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
+enabled_nhc=$($JETPACK config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
 if [[ $enabled_nhc != "true" ]]; then
     exit 0
 fi
 
 
 # Install NHC if not already installed in the image
 az_nhc_installed_version=$(grep Azure-NHC /opt/azurehpc/test/azurehpc-health-checks/docs/version.log | cut -d':' -f2 | xargs)
-az_nhc_target_version="v0.2.7"
+az_nhc_target_version="v0.2.9"
 
 if [ "$az_nhc_installed_version" != "$az_nhc_target_version" ] ; then
     if [ -d /opt/azurehpc/test/azurehpc-health-checks ]; then
         rm -rf /opt/azurehpc/test/azurehpc-health-checks
     fi
     mkdir -p /opt/azurehpc/test/
     cd /opt/azurehpc/test/
-    git clone https://github.com/Azure/azurehpc-health-checks.git -b v0.2.7
+    git clone https://github.com/Azure/azurehpc-health-checks.git -b $az_nhc_target_version
     cd azurehpc-health-checks
     ./install-nhc.sh
 fi
 
+. /etc/os-release
+case $ID in
+    ubuntu)
+        LIBEXEDIR=/usr/lib;;
+    *)
+        LIBEXEDIR=/usr/libexec;;
+esac
+
 # Install azhop-node-offline.sh
-cp -v $SCRIPT_DIR/../files/nhc/azhop-node-offline.sh /usr/libexec/nhc/
-chmod 755 /usr/libexec/nhc/azhop-node-offline.sh
+cp -v $SCRIPT_DIR/../files/nhc/azhop-node-offline.sh $LIBEXEDIR/nhc/
+chmod 755 $LIBEXEDIR/nhc/azhop-node-offline.sh
 
-# Use our own NHC config files
-NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
-VM_SIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]' | sed 's/standard_//')
+# # Use our own NHC config files
+# NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
+# VM_SIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]' | sed 's/standard_//')
 
-NHC_CONFIG_EXTRA="$SCRIPT_DIR/../files/nhc/nhc_${VM_SIZE}.conf"
+# NHC_CONFIG_EXTRA="$SCRIPT_DIR/../files/nhc/nhc_${VM_SIZE}.conf"
 
-# Use common config for all compute nodes
-if [ -e $NHC_CONFIG_FILE ]; then
-    rm -f ${NHC_CONFIG_FILE}.bak
-    mv $NHC_CONFIG_FILE ${NHC_CONFIG_FILE}.bak
-fi
-cp -fv $SCRIPT_DIR/../files/nhc/nhc_common.conf $NHC_CONFIG_FILE
+# # Use common config for all compute nodes
+# if [ -e $NHC_CONFIG_FILE ]; then
+#     rm -f ${NHC_CONFIG_FILE}.bak
+#     mv $NHC_CONFIG_FILE ${NHC_CONFIG_FILE}.bak
+# fi
+# cp -fv $SCRIPT_DIR/../files/nhc/nhc_common.conf $NHC_CONFIG_FILE
 
-# Append VM size specific config if exists
-if [ -e $NHC_CONFIG_EXTRA ]; then
-    cat $NHC_CONFIG_EXTRA >> $NHC_CONFIG_FILE
-fi
+# # Append VM size specific config if exists
+# if [ -e $NHC_CONFIG_EXTRA ]; then
+#     cat $NHC_CONFIG_EXTRA >> $NHC_CONFIG_FILE
+# fi
+
+# # Add nvidia-smi health checks for GPU SKUs except NV_v4 as they don't have NVIDIA device
+# case $VM_SIZE in
+#     nv*v4)
+#         ;;
+#     nc*|nv*|nd*)
+#         echo " * || check_nvsmi_healthmon" >> $NHC_CONFIG_FILE
+#         ;;
+#     # Check HDR InfiniBand on all HBv2 and HBv3 SKUs
+#     hb*v2|hb*v3)
+#         echo " * || check_hw_ib 200 mlx5_ib0:1" >> $NHC_CONFIG_FILE
+#         ;;
+#     hc44rs)
+#         echo " * || check_hw_ib 100 mlx5_ib0:1" >> $NHC_CONFIG_FILE
+#         ;;
 
-# Add nvidia-smi health checks for GPU SKUs except NV_v4 as they don't have NVIDIA device
-case $VM_SIZE in
-    nv*v4)
-        ;;
-    nc*|nv*|nd*)
-        echo " * || check_nvsmi_healthmon" >> $NHC_CONFIG_FILE
-        ;;
-    # Check HDR InfiniBand on all HBv2 and HBv3 SKUs
-    hb*v2|hb*v3)
-        echo " * || check_hw_ib 200 mlx5_ib0:1" >> $NHC_CONFIG_FILE
-        ;;
-    hc44rs)
-        echo " * || check_hw_ib 100 mlx5_ib0:1" >> $NHC_CONFIG_FILE
-        ;;
-
-esac
+# esac
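Because the clone is now pinned through $az_nhc_target_version, a node can be checked against the expected release with the same test the script uses (path taken from the script above):

# Report the installed aznhc release on a node, mirroring the script's own version test.
grep Azure-NHC /opt/azurehpc/test/azurehpc-health-checks/docs/version.log | cut -d':' -f2 | xargs
# Expected output after this commit: v0.2.9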
@@ -1,20 +1,32 @@
 #!/bin/bash
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+JETPACK=/opt/cycle/jetpack/bin/jetpack
+
 # Don't run health checks if not enabled
-enabled_nhc=$(jetpack config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
+enabled_nhc=$($JETPACK config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
 if [[ $enabled_nhc != "true" ]]; then
     exit 0
 fi
-NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
 
 # if run-health-checks.sh exists, then runit
 if [ -e /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh ]; then
-    errormessage=$( /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -c $NHC_CONFIG_FILE 2>&1)
+    . /etc/os-release
+    case $ID in
+        ubuntu)
+            LIBEXEDIR=/usr/lib;;
+        *)
+            LIBEXEDIR=/usr/libexec;;
+    esac
+    NHC_COMMON_FILE=$SCRIPT_DIR/../files/nhc/nhc_common.conf
+    export OFFLINE_NODE=$LIBEXEDIR/nhc/azhop-node-offline.sh
+    export ONLINE_NODE=$LIBEXEDIR/nhc/node-mark-online
+
+    errormessage=$( /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -e $NHC_COMMON_FILE 2>&1)
     error=$?
 
     # In case of health check failure, shutdown the node by calling the script /usr/libexec/nhc/azhop-node-offline.sh
     if [ $error -eq 1 ]; then
-        /usr/libexec/nhc/azhop-node-offline.sh $(hostname) "$errormessage"
-        JETPACK=/opt/cycle/jetpack/bin/jetpack
+        $OFFLINE_NODE $(hostname) "$errormessage"
         $JETPACK shutdown --unhealthy
     fi
 else
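To exercise the new path by hand, the same runner can be invoked directly on a compute node; this is a sketch only, assuming root and substituting the real location of nhc_common.conf from the cluster-init files directory:

# Manual run of the same check the script performs; a non-zero exit plus the captured output
# is what drives $OFFLINE_NODE and "jetpack shutdown --unhealthy" above.
sudo /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -e /path/to/nhc_common.conf
echo "exit code: $?"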
4 changes: 2 additions & 2 deletions playbooks/roles/slurm/templates/azhop.conf.j2
@@ -1,5 +1,5 @@
-HealthCheckProgram=/usr/sbin/nhc
-HealthCheckInterval=300
+#HealthCheckProgram=/mnt/cluster-init/common/default/scripts/99-healthcheck.sh
+HealthCheckInterval=0
 HealthCheckNodeState=IDLE
 SrunPortRange=59000-61000
 Prolog=/sched/{{ slurm_cluster_name }}/prolog.sh
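In Slurm, HealthCheckInterval=0 disables the periodic HealthCheckProgram runs from slurmd, so the commented-out program line is kept only as a reference; the hourly run on idle nodes mentioned in the commit message is presumably scheduled outside of slurmd and is not visible in these diffs. A quick way to confirm the settings a cluster has picked up:

# Show the effective Slurm health-check settings (scontrol ships with Slurm).
scontrol show config | grep -i '^HealthCheck'
# Expect HealthCheckInterval = 0 and HealthCheckNodeState = IDLE after this change.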
