Skip to content

Commit

Permalink
Hotfix: Handle UNAVAILABLE and UNKNOWN rocoto status in Bash CI (NOAA…
Browse files Browse the repository at this point in the history
…-EMC#2820)

# Description
From time to time, PBS Pro cannot return a `qstat` response within the
time limit set by `rocoto` (default is 45 seconds). When that happens,
an `UNAVAILABLE` status is returned for the affected job. This PR adds
a check for this status so that CI processing can continue.
  • Loading branch information
DavidHuber-NOAA committed Aug 13, 2024
1 parent d994642 commit 336b78a
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions ci/scripts/utils/rocotostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def rocoto_statcount(rocotostat):
rocotostat_output = [line.split()[0:4] for line in rocotostat_output]
rocotostat_output = [line for line in rocotostat_output if len(line) != 1]

status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED']
status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED', 'UNAVAILABLE']

rocoto_status = {}
status_counts = Counter(case for sublist in rocotostat_output for case in sublist)
Expand Down Expand Up @@ -214,9 +214,16 @@ def is_stalled(rocoto_status):
elif rocoto_status['DEAD'] > 0:
error_return = rocoto_status['FAIL'] + rocoto_status['DEAD']
rocoto_state = 'FAIL'
elif 'UNKNOWN' in rocoto_status:
error_return = rocoto_status['UNKNOWN']
rocoto_state = 'UNKNOWN'
elif 'UNAVAILABLE' in rocoto_status or 'UNKNOWN' in rocoto_status:
rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError)
error_return = 0
rocoto_state = 'RUNNING'
if 'UNAVAILABLE' in rocoto_status:
error_return = rocoto_status['UNAVAILABLE']
rocoto_state = 'UNAVAILABLE'
if 'UNKNOWN' in rocoto_status:
error_return += rocoto_status['UNKNOWN']
rocoto_state = 'UNKNOWN'
elif is_stalled(rocoto_status):
rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError)
if is_stalled(rocoto_status):
Expand Down

0 comments on commit 336b78a

Please sign in to comment.