diff --git a/.github/workflows/configs/almalinux.yml b/.github/workflows/configs/almalinux.yml
index 22c6234dd..c3c982de7 100644
--- a/.github/workflows/configs/almalinux.yml
+++ b/.github/workflows/configs/almalinux.yml
@@ -51,12 +51,6 @@ queues:
     ColocateNodes: true
     EnableAcceleratedNetworking: true
 
-  - name: nc24v3
-    vm_size: Standard_NC24rs_v3
-    max_count: 4
-    image: azhpc:azhop-compute:almalinux-8_7:latest
-    ColocateNodes: true
-
 # Remote Viz Queues
   - name: viz3d
     type: remoteviz
diff --git a/.github/workflows/configs/centos.yml b/.github/workflows/configs/centos.yml
index b45608530..9787cad79 100644
--- a/.github/workflows/configs/centos.yml
+++ b/.github/workflows/configs/centos.yml
@@ -57,12 +57,6 @@ queues:
     ColocateNodes: true
     EnableAcceleratedNetworking: true
 
-  - name: nc24v3
-    vm_size: Standard_NC24rs_v3
-    max_count: 4
-    image: azhpc:azhop-compute:centos-7_9:latest
-    ColocateNodes: true
-
 # Remote Viz Queues
   - name: viz3d
     type: remoteviz
diff --git a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/files/nhc/nhc_common.conf.j2 b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/files/nhc/nhc_common.conf.j2
index 14b4647da..e7e20ee6f 100644
--- a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/files/nhc/nhc_common.conf.j2
+++ b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/files/nhc/nhc_common.conf.j2
@@ -10,8 +10,9 @@
 ### NHC Configuration Variables
 ###
 # * || export MARK_OFFLINE=1 NHC_CHECK_ALL=0
- * || export OFFLINE_NODE=/usr/libexec/nhc/azhop-node-offline.sh
- * || export ONLINE_NODE=/usr/libexec/nhc/node-mark-online
+# The 2 variables below were moved into the script calling NHC, as their paths are OS dependent
+# * || export OFFLINE_NODE=/usr/libexec/nhc/azhop-node-offline.sh
+# * || export ONLINE_NODE=/usr/libexec/nhc/node-mark-online
  * || export TIMEOUT=300
  * || export VERBOSE=1
  * || export DETACHED_MODE=0
diff --git a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/8-configure-nhc.sh b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/8-configure-nhc.sh
index 831b86bd6..554fd5c3b 100644
--- a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/8-configure-nhc.sh
+++ b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/8-configure-nhc.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+JETPACK=/opt/cycle/jetpack/bin/jetpack
 
 # Don't configure NHC if not enabled
-enabled_nhc=$(jetpack config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
+enabled_nhc=$($JETPACK config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
 if [[ $enabled_nhc != "true" ]]; then
     exit 0
 fi
@@ -10,7 +11,7 @@ fi
 
 # Install NHC if not already installed in the image
 az_nhc_installed_version=$(grep Azure-NHC /opt/azurehpc/test/azurehpc-health-checks/docs/version.log | cut -d':' -f2 | xargs)
-az_nhc_target_version="v0.2.7"
+az_nhc_target_version="v0.2.9"
 
 if [ "$az_nhc_installed_version" != "$az_nhc_target_version" ] ; then
     if [ -d /opt/azurehpc/test/azurehpc-health-checks ]; then
@@ -18,46 +19,55 @@ if [ "$az_nhc_installed_version" != "$az_nhc_target_version" ] ; then
     fi
     mkdir -p /opt/azurehpc/test/
     cd /opt/azurehpc/test/
-    git clone https://github.com/Azure/azurehpc-health-checks.git -b v0.2.7
+    git clone https://github.com/Azure/azurehpc-health-checks.git -b $az_nhc_target_version
     cd azurehpc-health-checks
     ./install-nhc.sh
 fi
 
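+# Select the NHC helper directory per distro: Ubuntu installs the helpers under /usr/lib, the other distros under /usr/libexec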
+. /etc/os-release
+case $ID in
+    ubuntu)
+        LIBEXEDIR=/usr/lib;;
+    *)
+        LIBEXEDIR=/usr/libexec;;
+esac
+
 # Install azhop-node-offline.sh
-cp -v $SCRIPT_DIR/../files/nhc/azhop-node-offline.sh /usr/libexec/nhc/
-chmod 755 /usr/libexec/nhc/azhop-node-offline.sh
+cp -v $SCRIPT_DIR/../files/nhc/azhop-node-offline.sh $LIBEXEDIR/nhc/
+chmod 755 $LIBEXEDIR/nhc/azhop-node-offline.sh
 
-# Use our own NHC config files
-NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
-VM_SIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]' | sed 's/standard_//')
+# # Use our own NHC config files
+# NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
+# VM_SIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]' | sed 's/standard_//')
 
-NHC_CONFIG_EXTRA="$SCRIPT_DIR/../files/nhc/nhc_${VM_SIZE}.conf"
+# NHC_CONFIG_EXTRA="$SCRIPT_DIR/../files/nhc/nhc_${VM_SIZE}.conf"
 
-# Use common config for all compute nodes
-if [ -e $NHC_CONFIG_FILE ]; then
-    rm -f ${NHC_CONFIG_FILE}.bak
-    mv $NHC_CONFIG_FILE ${NHC_CONFIG_FILE}.bak
-fi
-cp -fv $SCRIPT_DIR/../files/nhc/nhc_common.conf $NHC_CONFIG_FILE
+# # Use common config for all compute nodes
+# if [ -e $NHC_CONFIG_FILE ]; then
+#     rm -f ${NHC_CONFIG_FILE}.bak
+#     mv $NHC_CONFIG_FILE ${NHC_CONFIG_FILE}.bak
+# fi
+# cp -fv $SCRIPT_DIR/../files/nhc/nhc_common.conf $NHC_CONFIG_FILE
 
-# Append VM size specific config if exists
-if [ -e $NHC_CONFIG_EXTRA ]; then
-    cat $NHC_CONFIG_EXTRA >> $NHC_CONFIG_FILE
-fi
+# # Append VM size specific config if exists
+# if [ -e $NHC_CONFIG_EXTRA ]; then
+#     cat $NHC_CONFIG_EXTRA >> $NHC_CONFIG_FILE
+# fi
+
+# # Add nvidia-smi health checks for GPU SKUs except NV_v4 as they don't have NVIDIA device
+# case $VM_SIZE in
+#     nv*v4)
+#         ;;
+#     nc*|nv*|nd*)
+#         echo " * || check_nvsmi_healthmon" >> $NHC_CONFIG_FILE
+#         ;;
+#     # Check HDR InfiniBand on all HBv2 and HBv3 SKUs
+#     hb*v2|hb*v3)
+#         echo " * || check_hw_ib 200 mlx5_ib0:1" >> $NHC_CONFIG_FILE
+#         ;;
+#     hc44rs)
+#         echo " * || check_hw_ib 100 mlx5_ib0:1" >> $NHC_CONFIG_FILE
+#         ;;
 
-# Add nvidia-smi health checks for GPU SKUs except NV_v4 as they don't have NVIDIA device
-case $VM_SIZE in
-    nv*v4)
-        ;;
-    nc*|nv*|nd*)
-        echo " * || check_nvsmi_healthmon" >> $NHC_CONFIG_FILE
-        ;;
-    # Check HDR InfiniBand on all HBv2 and HBv3 SKUs
-    hb*v2|hb*v3)
-        echo " * || check_hw_ib 200 mlx5_ib0:1" >> $NHC_CONFIG_FILE
-        ;;
-    hc44rs)
-        echo " * || check_hw_ib 100 mlx5_ib0:1" >> $NHC_CONFIG_FILE
-        ;;
-
-esac
\ No newline at end of file
+# esac
\ No newline at end of file
diff --git a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/99-healthcheck.sh b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/99-healthcheck.sh
index 76d02b548..e4ea4055d 100755
--- a/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/99-healthcheck.sh
+++ b/playbooks/roles/cyclecloud_cluster/projects/common/cluster-init/scripts/99-healthcheck.sh
@@ -1,20 +1,33 @@
 #!/bin/bash
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+JETPACK=/opt/cycle/jetpack/bin/jetpack
+
 # Don't run health checks if not enabled
-enabled_nhc=$(jetpack config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
+enabled_nhc=$($JETPACK config healthchecks.enabled | tr '[:upper:]' '[:lower:]')
 if [[ $enabled_nhc != "true" ]]; then
     exit 0
 fi
-NHC_CONFIG_FILE="/etc/nhc/nhc.conf"
 
 # if run-health-checks.sh exists, then run it
 if [ -e /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh ]; then
-    errormessage=$( /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -c $NHC_CONFIG_FILE 2>&1)
+    . /etc/os-release
+    case $ID in
+        ubuntu)
+            LIBEXEDIR=/usr/lib;;
+        *)
+            LIBEXEDIR=/usr/libexec;;
+    esac
+    NHC_COMMON_FILE=$SCRIPT_DIR/../files/nhc/nhc_common.conf
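+    # Point NHC to the OS-specific offline/online helper scripts, moved here from nhc_common.conf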
+    export OFFLINE_NODE=$LIBEXEDIR/nhc/azhop-node-offline.sh
+    export ONLINE_NODE=$LIBEXEDIR/nhc/node-mark-online
+
+    errormessage=$( /opt/azurehpc/test/azurehpc-health-checks/run-health-checks.sh -e $NHC_COMMON_FILE 2>&1)
     error=$?
     # In case of health check failure, shutdown the node by calling the script /usr/libexec/nhc/azhop-node-offline.sh
     if [ $error -eq 1 ]; then
-        /usr/libexec/nhc/azhop-node-offline.sh $(hostname) "$errormessage"
-        JETPACK=/opt/cycle/jetpack/bin/jetpack
+        $OFFLINE_NODE $(hostname) "$errormessage"
         $JETPACK shutdown --unhealthy
     fi
 else
diff --git a/playbooks/roles/slurm/templates/azhop.conf.j2 b/playbooks/roles/slurm/templates/azhop.conf.j2
index 77aebd2a2..279db9e8e 100644
--- a/playbooks/roles/slurm/templates/azhop.conf.j2
+++ b/playbooks/roles/slurm/templates/azhop.conf.j2
@@ -1,5 +1,5 @@
-HealthCheckProgram=/usr/sbin/nhc
-HealthCheckInterval=300
+#HealthCheckProgram=/mnt/cluster-init/common/default/scripts/99-healthcheck.sh
+HealthCheckInterval=0
 HealthCheckNodeState=IDLE
 SrunPortRange=59000-61000
 Prolog=/sched/{{ slurm_cluster_name }}/prolog.sh