From ad8d3e9c4b9be972792fc97d8d679e355c8b1ba3 Mon Sep 17 00:00:00 2001 From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:41:45 -0400 Subject: [PATCH] Check that a PR driver is still running before trying to kill it (#2799) Adds a check to the SSH command used to kill child PIDs of a defunct driver instance on a different head node to prevent invalid kill commands, preventing CI failures. Resolves #2798 --- ci/scripts/check_ci.sh | 8 ++++---- ci/scripts/driver.sh | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 04dd92f4a6..24c5e242c3 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -50,14 +50,14 @@ fi export GH rocotostat=$(command -v rocotostat) -if [[ -z ${rocotostat+x} ]]; then +if [[ -z ${rocotostat} ]]; then echo "rocotostat not found on system" exit 1 else echo "rocotostat being used from ${rocotostat}" fi rocotocheck=$(command -v rocotocheck) -if [[ -z ${rocotocheck+x} ]]; then +if [[ -z ${rocotocheck} ]]; then echo "rocotocheck not found on system" exit 1 else @@ -70,7 +70,7 @@ pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true fi -if [[ -z "${pr_list+x}" ]]; then +if [[ -z "${pr_list}" ]]; then echo "no PRs open and ready to run cases on .. exiting" exit 0 fi @@ -124,7 +124,7 @@ for pr in ${pr_list}; do for pslot_dir in "${pr_dir}/RUNTESTS/EXPDIR/"*; do pslot=$(basename "${pslot_dir}") || true - if [[ -z "${pslot+x}" ]]; then + if [[ -z "${pslot}" ]]; then echo "No experiments found in ${pslot_dir} .. exiting" exit 0 fi diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 0f53ebff6f..8a99817325 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -77,8 +77,9 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" - for pr in ${pr_list}; do pr_dir="${GFS_CI_ROOT}/PR/${pr}" + [[ ! -d ${pr_dir} ]] && mkdir -p "${pr_dir}" db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") - output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" + output_ci_single="${pr_dir}/output_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set # and in that case completely kill the previose driver.sh cron @@ -107,7 +108,9 @@ for pr in ${pr_list}; do echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill fi else - ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' + # Check if the driver is still running on the head node; if so, kill it and all child processes + #shellcheck disable=SC2029 + ssh "${driver_HOST}" "pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill || echo \"Failed to kill process with PID: ${driver_PID}, it may not be valid.\"" fi { echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}" @@ -141,7 +144,7 @@ pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then pr_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Ready) || true fi -if [[ -z "${pr_list+x}" ]]; then +if [[ -z "${pr_list}" ]]; then echo "no PRs open and ready for checkout/build .. exiting" exit 0 fi @@ -155,7 +158,7 @@ fi for pr in ${pr_list}; do # Skip pr's that are currently Building for when overlapping driver scripts are being called from within cron pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true - if [[ -z "${pr_building+x}" ]]; then + if [[ -n "${pr_building}" ]]; then continue fi id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')