From 502b63b542e1dfa2b80782c550ed3bd33fb3283b Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 9 Oct 2024 09:13:11 +0100 Subject: [PATCH 01/12] Log job failure even when it doesn't cause a change in task state (i.e. when there is a retry set up). --- changes.d/6169.fix | 1 + cylc/flow/task_events_mgr.py | 20 ++++++++++++----- tests/integration/test_task_events_mgr.py | 27 ++++++++++++++++++++--- 3 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 changes.d/6169.fix diff --git a/changes.d/6169.fix b/changes.d/6169.fix new file mode 100644 index 00000000000..d116ea1ba75 --- /dev/null +++ b/changes.d/6169.fix @@ -0,0 +1 @@ +Ensure that job failure is logged, even when the presence of retries causes the task not to change state. diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py index 0a65baea1a9..7c3567448ef 100644 --- a/cylc/flow/task_events_mgr.py +++ b/cylc/flow/task_events_mgr.py @@ -744,7 +744,7 @@ def process_message( # Already failed. return True if self._process_message_failed( - itask, event_time, self.JOB_FAILED, forced + itask, event_time, self.JOB_FAILED, forced, message ): self.spawn_children(itask, TASK_OUTPUT_FAILED) @@ -795,7 +795,7 @@ def process_message( self.workflow_db_mgr.put_update_task_jobs( itask, {"run_signal": signal}) if self._process_message_failed( - itask, event_time, self.JOB_FAILED, forced + itask, event_time, self.JOB_FAILED, forced, message ): self.spawn_children(itask, TASK_OUTPUT_FAILED) @@ -812,7 +812,7 @@ def process_message( self.workflow_db_mgr.put_update_task_jobs( itask, {"run_signal": aborted_with}) if self._process_message_failed( - itask, event_time, aborted_with, forced + itask, event_time, aborted_with, forced, message ): self.spawn_children(itask, TASK_OUTPUT_FAILED) @@ -1297,10 +1297,17 @@ def _retry_task(self, itask, wallclock_time, submit_retry=False): if itask.state_reset(TASK_STATUS_WAITING): self.data_store_mgr.delta_task_state(itask) - def _process_message_failed(self, itask, event_time, message, forced): + def _process_message_failed( + self, itask, event_time, message, forced, full_message + ): """Helper for process_message, handle a failed message. Return True if no retries (hence go to the failed state). + + Args: + full_message: + If we have retries lined up we still tell users what + happened to cause the this attempt to fail. """ no_retries = False if event_time is None: @@ -1332,7 +1339,10 @@ def _process_message_failed(self, itask, event_time, message, forced): timer = itask.try_timers[TimerFlags.EXECUTION_RETRY] self._retry_task(itask, timer.timeout) delay_msg = f"retrying in {timer.delay_timeout_as_str()}" - LOG.warning(f"[{itask}] {delay_msg}") + LOG.warning( + f'[{itask}] {full_message or self.EVENT_FAILED} - ' + f'{delay_msg}' + ) msg = f"{self.JOB_FAILED}, {delay_msg}" self.setup_event_handlers(itask, self.EVENT_RETRY, msg) self._reset_job_timers(itask) diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index 7ac12274d7b..6e996bced6d 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -14,7 +14,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from itertools import product import logging from typing import Any as Fixture @@ -22,9 +21,7 @@ from cylc.flow.scheduler import Scheduler from cylc.flow.data_store_mgr import ( JOBS, - TASK_STATUSES_ORDERED, TASK_STATUS_WAITING, - TASK_STATUS_SUBMIT_FAILED, ) @@ -170,3 +167,27 @@ async def test__always_insert_task_job( '1/broken/01': 'submit-failed', '1/broken2/01': 'submit-failed' } + + +async def test__process_message_failed_with_retry(one, start): + """Log job failure, even if a retry is scheduled. + + See: https://github.com/cylc/cylc-flow/pull/6169 + + """ + + async with start(one) as LOG: + fail_once = one.pool.get_tasks()[0] + # Add retry timers: + one.task_job_mgr._set_retry_timers( + fail_once, { + 'execution retry delays': [1], + 'submission retry delays': [1] + }) + + # Process failed message: + one.task_events_mgr._process_message_failed( + fail_once, None, 'failed', False, 'failed/OOK') + + # Check that failure reported: + assert 'failed/OOK' in LOG.messages[-1] From 7b0a7b1375c6efc4e76811b85319d22fedc13548 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:01:16 +0100 Subject: [PATCH 02/12] move logging to _process_message_ for submit/failure --- changes.d/6169.fix | 2 +- cylc/flow/task_events_mgr.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/changes.d/6169.fix b/changes.d/6169.fix index d116ea1ba75..c7ca6f74da8 100644 --- a/changes.d/6169.fix +++ b/changes.d/6169.fix @@ -1 +1 @@ -Ensure that job failure is logged, even when the presence of retries causes the task not to change state. +Ensure that job submit/failure is logged, even when retries are planned. diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py index 7c3567448ef..d1c81085f42 100644 --- a/cylc/flow/task_events_mgr.py +++ b/cylc/flow/task_events_mgr.py @@ -928,11 +928,16 @@ def _process_message_check( return False severity_lvl: int = LOG_LEVELS.get(severity, INFO) + # Don't log submit/failure messages here: + if flag != self.FLAG_POLLED and message in { + self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR' + }: + return True # Demote log level to DEBUG if this is a message that duplicates what # gets logged by itask state change anyway (and not manual poll) if severity_lvl > DEBUG and flag != self.FLAG_POLLED and message in { self.EVENT_SUBMITTED, self.EVENT_STARTED, self.EVENT_SUCCEEDED, - self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR' + self.EVENT_FAILED }: severity_lvl = DEBUG LOG.log(severity_lvl, f"[{itask}] {flag}{message}{timestamp}") @@ -1334,6 +1339,9 @@ def _process_message_failed( self.data_store_mgr.delta_task_output( itask, TASK_OUTPUT_FAILED) self.data_store_mgr.delta_task_state(itask) + LOG.error( + f'[{itask}] {full_message or self.EVENT_FAILED} - ' + ) else: # There is an execution retry lined up. timer = itask.try_timers[TimerFlags.EXECUTION_RETRY] @@ -1414,7 +1422,6 @@ def _process_message_submit_failed( Return True if no retries (hence go to the submit-failed state). """ no_retries = False - LOG.critical(f"[{itask}] {self.EVENT_SUBMIT_FAILED}") if event_time is None: event_time = get_current_time_string() self.workflow_db_mgr.put_update_task_jobs(itask, { @@ -1440,6 +1447,7 @@ def _process_message_submit_failed( self.data_store_mgr.delta_task_output( itask, TASK_OUTPUT_SUBMIT_FAILED) self.data_store_mgr.delta_task_state(itask) + LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}") else: # There is a submission retry lined up. timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY] From 00230a764de2c41aef23388dc082b9ad14225c9f Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:18:45 +0100 Subject: [PATCH 03/12] update integration tests --- tests/integration/test_task_events_mgr.py | 44 +++++++++++++++++------ 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index 6e996bced6d..a8da192b863 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -76,17 +76,22 @@ async def test__insert_task_job(flow, one_conf, scheduler, start, validate): with correct submit number. """ conf = { - 'scheduling': {'graph': {'R1': 'rhenas'}}, - 'runtime': {'rhenas': {'simulation': { - 'fail cycle points': '1', - 'fail try 1 only': False, - }}}} + "scheduling": {"graph": {"R1": "rhenas"}}, + "runtime": { + "rhenas": { + "simulation": { + "fail cycle points": "1", + "fail try 1 only": False, + } + } + }, + } id_ = flow(conf) schd = scheduler(id_) async with start(schd): # Set task to running: - itask = schd.pool.get_tasks()[0] - itask.state.status = 'running' + itask = schd.pool.get_tasks()[0] + itask.state.status = "running" itask.submit_num += 1 # Not run _insert_task_job yet: @@ -185,9 +190,28 @@ async def test__process_message_failed_with_retry(one, start): 'submission retry delays': [1] }) - # Process failed message: + # Process submit failed message with and without retries: + one.task_events_mgr._process_message_submit_failed( + fail_once, None, 1, False) + last_record = LOG.records[-1] + assert last_record.levelno == logging.WARNING + assert '1/one:waiting(queued)' in last_record.message + + one.task_events_mgr._process_message_submit_failed( + fail_once, None, 2, False) + last_record = LOG.records[-1] + assert last_record.levelno == logging.ERROR + assert 'submission failed' in last_record.message + + # Process failed message with and without retries: one.task_events_mgr._process_message_failed( fail_once, None, 'failed', False, 'failed/OOK') + last_record = LOG.records[-1] + assert last_record.levelno == logging.WARNING + assert 'failed/OOK' in last_record.message - # Check that failure reported: - assert 'failed/OOK' in LOG.messages[-1] + one.task_events_mgr._process_message_failed( + fail_once, None, 'failed', False, 'failed/OOK') + last_record = LOG.records[-1] + assert last_record.levelno == logging.ERROR + assert 'failed/OOK' in last_record.message From 11b83802758af88f255c29fa5170158f2bfd0629 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:08:14 +0000 Subject: [PATCH 04/12] remove EVENT_FAILED from debug demotion set --- cylc/flow/task_events_mgr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py index d1c81085f42..121b4ae4e59 100644 --- a/cylc/flow/task_events_mgr.py +++ b/cylc/flow/task_events_mgr.py @@ -937,7 +937,6 @@ def _process_message_check( # gets logged by itask state change anyway (and not manual poll) if severity_lvl > DEBUG and flag != self.FLAG_POLLED and message in { self.EVENT_SUBMITTED, self.EVENT_STARTED, self.EVENT_SUCCEEDED, - self.EVENT_FAILED }: severity_lvl = DEBUG LOG.log(severity_lvl, f"[{itask}] {flag}{message}{timestamp}") From 5b5c3910edfb68d9535f76c801da4917e4dc6aa2 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:18:39 +0000 Subject: [PATCH 05/12] [skip ci]wip --- tests/functional/cylc-remove/00-simple/flow.cylc | 2 +- tests/functional/cylc-remove/02-cycling/flow.cylc | 4 ++-- .../functional/cylc-trigger/02-filter-failed/flow.cylc | 6 +++--- .../functional/cylc-trigger/04-filter-names/flow.cylc | 10 +++++----- tests/functional/hold-release/11-retrying/flow.cylc | 2 +- tests/functional/reload/10-runahead.t | 2 +- tests/functional/reload/25-xtriggers.t | 2 +- tests/functional/reload/runahead/flow.cylc | 2 +- .../functional/spawn-on-demand/10-retrigger/flow.cylc | 2 +- tests/functional/triggering/19-and-suicide/flow.cylc | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/functional/cylc-remove/00-simple/flow.cylc b/tests/functional/cylc-remove/00-simple/flow.cylc index 84c740ad421..e1c23857b8f 100644 --- a/tests/functional/cylc-remove/00-simple/flow.cylc +++ b/tests/functional/cylc-remove/00-simple/flow.cylc @@ -15,7 +15,7 @@ script = false [[cleaner]] script = """ -cylc__job__poll_grep_workflow_log -E '1/b/01:running.* \(received\)failed' +cylc__job__poll_grep_workflow_log -E '1/b/01.* failed' # Remove the unhandled failed task cylc remove "$CYLC_WORKFLOW_ID//1/b" # Remove waiting 1/c diff --git a/tests/functional/cylc-remove/02-cycling/flow.cylc b/tests/functional/cylc-remove/02-cycling/flow.cylc index 3b6c1051493..5371d577394 100644 --- a/tests/functional/cylc-remove/02-cycling/flow.cylc +++ b/tests/functional/cylc-remove/02-cycling/flow.cylc @@ -17,8 +17,8 @@ [runtime] [[remover]] script = """ - cylc__job__poll_grep_workflow_log -E '2020/bar/01:running.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '2021/baz/01:running.* \(received\)failed' + cylc__job__poll_grep_workflow_log -E '2020/bar/01.* failed' + cylc__job__poll_grep_workflow_log -E '2021/baz/01.* failed' # Remove the two unhandled failed tasks. cylc remove "$CYLC_WORKFLOW_ID//*/ba*:failed" # Remove the two unsatisfied waiting tasks. diff --git a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc index 7416cf5790d..342f1449980 100644 --- a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc +++ b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc @@ -18,9 +18,9 @@ [[fixer]] script = """ cylc__job__wait_cylc_message_started - cylc__job__poll_grep_workflow_log -E '1/fixable1/01:running.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable2/01:running.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable3/01:running.* \(received\)failed' + cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:failed\] failed/ERR' + cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:failed\] failed/ERR' + cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:failed\] failed/ERR' cylc trigger "${CYLC_WORKFLOW_ID}//1/fixable*" """ [[Z]] diff --git a/tests/functional/cylc-trigger/04-filter-names/flow.cylc b/tests/functional/cylc-trigger/04-filter-names/flow.cylc index 31839c1b77f..95caf5ddc4e 100644 --- a/tests/functional/cylc-trigger/04-filter-names/flow.cylc +++ b/tests/functional/cylc-trigger/04-filter-names/flow.cylc @@ -22,11 +22,11 @@ [[fixer]] script = """ cylc__job__wait_cylc_message_started - cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* \(received\)failed' - cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* \(received\)failed' + cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* failed' + cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* failed' + cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* failed' + cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* failed' + cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* failed' cylc trigger "${CYLC_WORKFLOW_ID}//" \ '//1/FIXABLE-1' '//1/fixable-2*' '//1/fixable-3' """ diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc index 0e08699af09..0d747c4c798 100644 --- a/tests/functional/hold-release/11-retrying/flow.cylc +++ b/tests/functional/hold-release/11-retrying/flow.cylc @@ -18,7 +18,7 @@ t-retry-able => t-analyse [[t-hold-release]] script = """ cylc__job__poll_grep_workflow_log -E \ - '1/t-retry-able/01:running.* \(received\)failed' + '1/t-retry-able/01.*failed' cylc__job__poll_grep_workflow_log -E \ '1/t-retry-able/01:running.* => waiting' diff --git a/tests/functional/reload/10-runahead.t b/tests/functional/reload/10-runahead.t index d591997f68b..fa93be9891a 100644 --- a/tests/functional/reload/10-runahead.t +++ b/tests/functional/reload/10-runahead.t @@ -30,7 +30,7 @@ run_fail "${TEST_NAME}" cylc play --debug --no-detach "${WORKFLOW_NAME}" #------------------------------------------------------------------------------- TEST_NAME=${TEST_NAME_BASE}-check-fail DB_FILE="$RUN_DIR/${WORKFLOW_NAME}/log/db" -QUERY='SELECT COUNT(*) FROM task_states WHERE status == "failed"' +QUERY="SELECT COUNT(*) FROM task_states WHERE status == 'failed'" cmp_ok <(sqlite3 "$DB_FILE" "$QUERY") <<< "4" #------------------------------------------------------------------------------- purge diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t index 0269a2e3775..db37d26c793 100644 --- a/tests/functional/reload/25-xtriggers.t +++ b/tests/functional/reload/25-xtriggers.t @@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[reload]] script = """ # wait for "broken" to fail - cylc__job__poll_grep_workflow_log -E '1/broken/01.* \(received\)failed/ERR' + cylc__job__poll_grep_workflow_log -E '1/broken/01.* failed/ERR' # fix "broken" to allow it to pass sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc" # reload the workflow diff --git a/tests/functional/reload/runahead/flow.cylc b/tests/functional/reload/runahead/flow.cylc index c65b5e11d6d..b1fd8646bf3 100644 --- a/tests/functional/reload/runahead/flow.cylc +++ b/tests/functional/reload/runahead/flow.cylc @@ -20,7 +20,7 @@ script = true [[reloader]] script = """ - cylc__job__poll_grep_workflow_log -E "${CYLC_TASK_CYCLE_POINT}/foo/01:running.*\(received\)failed" + cylc__job__poll_grep_workflow_log -E "${CYLC_TASK_CYCLE_POINT}/foo/01:running.*failed" perl -pi -e 's/(runahead limit = )P1( # marker)/\1 P3\2/' $CYLC_WORKFLOW_RUN_DIR/flow.cylc cylc reload $CYLC_WORKFLOW_ID """ diff --git a/tests/functional/spawn-on-demand/10-retrigger/flow.cylc b/tests/functional/spawn-on-demand/10-retrigger/flow.cylc index 7e9149ce3c9..38e9aafaeee 100644 --- a/tests/functional/spawn-on-demand/10-retrigger/flow.cylc +++ b/tests/functional/spawn-on-demand/10-retrigger/flow.cylc @@ -18,7 +18,7 @@ """ [[triggerer]] script = """ - cylc__job__poll_grep_workflow_log -E '1/oops/01:running.* \(received\)failed' + cylc__job__poll_grep_workflow_log -E '1/oops/01.* failed' cylc trigger "${CYLC_WORKFLOW_ID}//1/oops" """ [[foo, bar]] diff --git a/tests/functional/triggering/19-and-suicide/flow.cylc b/tests/functional/triggering/19-and-suicide/flow.cylc index 670c361fc96..063360022a5 100644 --- a/tests/functional/triggering/19-and-suicide/flow.cylc +++ b/tests/functional/triggering/19-and-suicide/flow.cylc @@ -16,7 +16,7 @@ [[t0]] # https://github.com/cylc/cylc-flow/issues/2655 # "1/t2" should not suicide on "1/t1:failed" - script = cylc__job__poll_grep_workflow_log -E '1/t1.* \(received\)failed' + script = cylc__job__poll_grep_workflow_log -E '1/t1.* failed' [[t1]] script = false [[t2]] From 0b26abd5289aa5b8741f649b5c975ba511ea58d2 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:18:45 +0100 Subject: [PATCH 06/12] update integration tests --- .../hold-release/11-retrying/flow.cylc | 2 +- tests/functional/reload/25-xtriggers.t | 4 +- tests/integration/test_task_events_mgr.py | 44 ++++++++++++++----- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc index 0e08699af09..dbd1ec3ed59 100644 --- a/tests/functional/hold-release/11-retrying/flow.cylc +++ b/tests/functional/hold-release/11-retrying/flow.cylc @@ -18,7 +18,7 @@ t-retry-able => t-analyse [[t-hold-release]] script = """ cylc__job__poll_grep_workflow_log -E \ - '1/t-retry-able/01:running.* \(received\)failed' + '\[1/t-retry-able:waiting\] failed/ERR' cylc__job__poll_grep_workflow_log -E \ '1/t-retry-able/01:running.* => waiting' diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t index 0269a2e3775..fdaa4ebb3f1 100644 --- a/tests/functional/reload/25-xtriggers.t +++ b/tests/functional/reload/25-xtriggers.t @@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[reload]] script = """ # wait for "broken" to fail - cylc__job__poll_grep_workflow_log -E '1/broken/01.* \(received\)failed/ERR' + cylc__job__poll_grep_workflow_log -E '1/broken.*failed/ERR' # fix "broken" to allow it to pass sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc" # reload the workflow @@ -63,7 +63,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play "${WORKFLOW_NAME}" --no-detach log_scan "${TEST_NAME_BASE}-scan" \ "$(cylc cat-log -m p "${WORKFLOW_NAME}")" \ 1 1 \ - '1/broken.* (received)failed/ERR' + '1/broken.*failed/ERR' log_scan "${TEST_NAME_BASE}-scan" \ "$(cylc cat-log -m p "${WORKFLOW_NAME}")" 1 1 \ diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index 6e996bced6d..a8da192b863 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -76,17 +76,22 @@ async def test__insert_task_job(flow, one_conf, scheduler, start, validate): with correct submit number. """ conf = { - 'scheduling': {'graph': {'R1': 'rhenas'}}, - 'runtime': {'rhenas': {'simulation': { - 'fail cycle points': '1', - 'fail try 1 only': False, - }}}} + "scheduling": {"graph": {"R1": "rhenas"}}, + "runtime": { + "rhenas": { + "simulation": { + "fail cycle points": "1", + "fail try 1 only": False, + } + } + }, + } id_ = flow(conf) schd = scheduler(id_) async with start(schd): # Set task to running: - itask = schd.pool.get_tasks()[0] - itask.state.status = 'running' + itask = schd.pool.get_tasks()[0] + itask.state.status = "running" itask.submit_num += 1 # Not run _insert_task_job yet: @@ -185,9 +190,28 @@ async def test__process_message_failed_with_retry(one, start): 'submission retry delays': [1] }) - # Process failed message: + # Process submit failed message with and without retries: + one.task_events_mgr._process_message_submit_failed( + fail_once, None, 1, False) + last_record = LOG.records[-1] + assert last_record.levelno == logging.WARNING + assert '1/one:waiting(queued)' in last_record.message + + one.task_events_mgr._process_message_submit_failed( + fail_once, None, 2, False) + last_record = LOG.records[-1] + assert last_record.levelno == logging.ERROR + assert 'submission failed' in last_record.message + + # Process failed message with and without retries: one.task_events_mgr._process_message_failed( fail_once, None, 'failed', False, 'failed/OOK') + last_record = LOG.records[-1] + assert last_record.levelno == logging.WARNING + assert 'failed/OOK' in last_record.message - # Check that failure reported: - assert 'failed/OOK' in LOG.messages[-1] + one.task_events_mgr._process_message_failed( + fail_once, None, 'failed', False, 'failed/OOK') + last_record = LOG.records[-1] + assert last_record.levelno == logging.ERROR + assert 'failed/OOK' in last_record.message From 5453cac7cdfad2560156ddbde87fb847d5af27a0 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 7 Nov 2024 07:46:13 +0000 Subject: [PATCH 07/12] Update cylc/flow/task_events_mgr.py Co-authored-by: Hilary James Oliver response to review --- cylc/flow/task_events_mgr.py | 6 ++---- .../cylc-trigger/02-filter-failed/flow.cylc | 6 +++--- tests/integration/test_task_events_mgr.py | 12 +++++------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py index 121b4ae4e59..821fc09c370 100644 --- a/cylc/flow/task_events_mgr.py +++ b/cylc/flow/task_events_mgr.py @@ -1324,6 +1324,7 @@ def _process_message_failed( "run_status": 1, "time_run_exit": event_time, }) + LOG.error(f'[{itask}] {full_message or self.EVENT_FAILED}') if ( forced or TimerFlags.EXECUTION_RETRY not in itask.try_timers @@ -1338,9 +1339,6 @@ def _process_message_failed( self.data_store_mgr.delta_task_output( itask, TASK_OUTPUT_FAILED) self.data_store_mgr.delta_task_state(itask) - LOG.error( - f'[{itask}] {full_message or self.EVENT_FAILED} - ' - ) else: # There is an execution retry lined up. timer = itask.try_timers[TimerFlags.EXECUTION_RETRY] @@ -1428,6 +1426,7 @@ def _process_message_submit_failed( "submit_status": 1, }) itask.summary['submit_method_id'] = None + LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}") if ( forced or TimerFlags.SUBMISSION_RETRY not in itask.try_timers @@ -1446,7 +1445,6 @@ def _process_message_submit_failed( self.data_store_mgr.delta_task_output( itask, TASK_OUTPUT_SUBMIT_FAILED) self.data_store_mgr.delta_task_state(itask) - LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}") else: # There is a submission retry lined up. timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY] diff --git a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc index 342f1449980..3da59a7d867 100644 --- a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc +++ b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc @@ -18,9 +18,9 @@ [[fixer]] script = """ cylc__job__wait_cylc_message_started - cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:failed\] failed/ERR' - cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:failed\] failed/ERR' - cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:failed\] failed/ERR' + cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:running\] failed/ERR' + cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:running\] failed/ERR' + cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:running\] failed/ERR' cylc trigger "${CYLC_WORKFLOW_ID}//1/fixable*" """ [[Z]] diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index a8da192b863..00557971135 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -174,7 +174,7 @@ async def test__always_insert_task_job( } -async def test__process_message_failed_with_retry(one, start): +async def test__process_message_failed_with_retry(one, start, log_filter): """Log job failure, even if a retry is scheduled. See: https://github.com/cylc/cylc-flow/pull/6169 @@ -199,9 +199,8 @@ async def test__process_message_failed_with_retry(one, start): one.task_events_mgr._process_message_submit_failed( fail_once, None, 2, False) - last_record = LOG.records[-1] - assert last_record.levelno == logging.ERROR - assert 'submission failed' in last_record.message + failed_record = log_filter(LOG, level=logging.ERROR)[-1] + assert 'submission failed' in failed_record[2] # Process failed message with and without retries: one.task_events_mgr._process_message_failed( @@ -212,6 +211,5 @@ async def test__process_message_failed_with_retry(one, start): one.task_events_mgr._process_message_failed( fail_once, None, 'failed', False, 'failed/OOK') - last_record = LOG.records[-1] - assert last_record.levelno == logging.ERROR - assert 'failed/OOK' in last_record.message + failed_record = log_filter(LOG, level=logging.ERROR)[-1] + assert 'failed/OOK' in failed_record[2] From 0def449824d58010943e6bc6e8bb0eb53eee1ba7 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:48:04 +0000 Subject: [PATCH 08/12] Apply suggestions from code review Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- tests/functional/reload/25-xtriggers.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t index fdaa4ebb3f1..292661bbaa1 100644 --- a/tests/functional/reload/25-xtriggers.t +++ b/tests/functional/reload/25-xtriggers.t @@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[reload]] script = """ # wait for "broken" to fail - cylc__job__poll_grep_workflow_log -E '1/broken.*failed/ERR' + cylc__job__poll_grep_workflow_log -E '1/broken/01.*failed/ERR' # fix "broken" to allow it to pass sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc" # reload the workflow @@ -63,7 +63,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play "${WORKFLOW_NAME}" --no-detach log_scan "${TEST_NAME_BASE}-scan" \ "$(cylc cat-log -m p "${WORKFLOW_NAME}")" \ 1 1 \ - '1/broken.*failed/ERR' + '1/broken/01.*failed/ERR' log_scan "${TEST_NAME_BASE}-scan" \ "$(cylc cat-log -m p "${WORKFLOW_NAME}")" 1 1 \ From a06353166a9fa29ed5883583094a2925ff731b3d Mon Sep 17 00:00:00 2001 From: Tim Pillinger Date: Tue, 7 Jan 2025 14:03:09 +0000 Subject: [PATCH 09/12] fixed test wrt to review --- tests/integration/test_task_events_mgr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index 00557971135..cf4cfc6253b 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -195,7 +195,7 @@ async def test__process_message_failed_with_retry(one, start, log_filter): fail_once, None, 1, False) last_record = LOG.records[-1] assert last_record.levelno == logging.WARNING - assert '1/one:waiting(queued)' in last_record.message + assert '1/one:waiting(queued)] retrying in' in last_record.message one.task_events_mgr._process_message_submit_failed( fail_once, None, 2, False) From 42fdd8b1dc215225a01e4076828ffdf64dac4dde Mon Sep 17 00:00:00 2001 From: Tim Pillinger Date: Wed, 8 Jan 2025 09:41:43 +0000 Subject: [PATCH 10/12] removed failure message from retry logging --- cylc/flow/task_events_mgr.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py index 821fc09c370..fa48ad271c5 100644 --- a/cylc/flow/task_events_mgr.py +++ b/cylc/flow/task_events_mgr.py @@ -1344,10 +1344,7 @@ def _process_message_failed( timer = itask.try_timers[TimerFlags.EXECUTION_RETRY] self._retry_task(itask, timer.timeout) delay_msg = f"retrying in {timer.delay_timeout_as_str()}" - LOG.warning( - f'[{itask}] {full_message or self.EVENT_FAILED} - ' - f'{delay_msg}' - ) + LOG.warning(f'[{itask}] - {delay_msg}') msg = f"{self.JOB_FAILED}, {delay_msg}" self.setup_event_handlers(itask, self.EVENT_RETRY, msg) self._reset_job_timers(itask) From 463395f5086c55108269f11c6a02b7bbfd88bc71 Mon Sep 17 00:00:00 2001 From: Tim Pillinger Date: Wed, 8 Jan 2025 09:44:38 +0000 Subject: [PATCH 11/12] modify retry message to remove repeating the error --- tests/integration/test_task_events_mgr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index cf4cfc6253b..eaa3561d371 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -207,7 +207,7 @@ async def test__process_message_failed_with_retry(one, start, log_filter): fail_once, None, 'failed', False, 'failed/OOK') last_record = LOG.records[-1] assert last_record.levelno == logging.WARNING - assert 'failed/OOK' in last_record.message + assert 'retrying in' in last_record.message one.task_events_mgr._process_message_failed( fail_once, None, 'failed', False, 'failed/OOK') From 21973679268e3b8d11e9c4d07ff9ff49e8183855 Mon Sep 17 00:00:00 2001 From: Tim Pillinger Date: Thu, 9 Jan 2025 09:32:51 +0000 Subject: [PATCH 12/12] fix another test --- tests/functional/hold-release/11-retrying/flow.cylc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc index dbd1ec3ed59..6473d87e148 100644 --- a/tests/functional/hold-release/11-retrying/flow.cylc +++ b/tests/functional/hold-release/11-retrying/flow.cylc @@ -18,7 +18,7 @@ t-retry-able => t-analyse [[t-hold-release]] script = """ cylc__job__poll_grep_workflow_log -E \ - '\[1/t-retry-able:waiting\] failed/ERR' + '\[1/t-retry-able:waiting\] - retrying' cylc__job__poll_grep_workflow_log -E \ '1/t-retry-able/01:running.* => waiting'