From 502b63b542e1dfa2b80782c550ed3bd33fb3283b Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Wed, 9 Oct 2024 09:13:11 +0100
Subject: [PATCH 01/12] Log job failure even when it doesn't cause a change in
task state (i.e. when there is a retry set up).
---
changes.d/6169.fix | 1 +
cylc/flow/task_events_mgr.py | 20 ++++++++++++-----
tests/integration/test_task_events_mgr.py | 27 ++++++++++++++++++++---
3 files changed, 40 insertions(+), 8 deletions(-)
create mode 100644 changes.d/6169.fix
diff --git a/changes.d/6169.fix b/changes.d/6169.fix
new file mode 100644
index 00000000000..d116ea1ba75
--- /dev/null
+++ b/changes.d/6169.fix
@@ -0,0 +1 @@
+Ensure that job failure is logged, even when the presence of retries causes the task not to change state.
diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py
index 0a65baea1a9..7c3567448ef 100644
--- a/cylc/flow/task_events_mgr.py
+++ b/cylc/flow/task_events_mgr.py
@@ -744,7 +744,7 @@ def process_message(
# Already failed.
return True
if self._process_message_failed(
- itask, event_time, self.JOB_FAILED, forced
+ itask, event_time, self.JOB_FAILED, forced, message
):
self.spawn_children(itask, TASK_OUTPUT_FAILED)
@@ -795,7 +795,7 @@ def process_message(
self.workflow_db_mgr.put_update_task_jobs(
itask, {"run_signal": signal})
if self._process_message_failed(
- itask, event_time, self.JOB_FAILED, forced
+ itask, event_time, self.JOB_FAILED, forced, message
):
self.spawn_children(itask, TASK_OUTPUT_FAILED)
@@ -812,7 +812,7 @@ def process_message(
self.workflow_db_mgr.put_update_task_jobs(
itask, {"run_signal": aborted_with})
if self._process_message_failed(
- itask, event_time, aborted_with, forced
+ itask, event_time, aborted_with, forced, message
):
self.spawn_children(itask, TASK_OUTPUT_FAILED)
@@ -1297,10 +1297,17 @@ def _retry_task(self, itask, wallclock_time, submit_retry=False):
if itask.state_reset(TASK_STATUS_WAITING):
self.data_store_mgr.delta_task_state(itask)
- def _process_message_failed(self, itask, event_time, message, forced):
+ def _process_message_failed(
+ self, itask, event_time, message, forced, full_message
+ ):
"""Helper for process_message, handle a failed message.
Return True if no retries (hence go to the failed state).
+
+ Args:
+ full_message:
+ If we have retries lined up we still tell users what
+ happened to cause the this attempt to fail.
"""
no_retries = False
if event_time is None:
@@ -1332,7 +1339,10 @@ def _process_message_failed(self, itask, event_time, message, forced):
timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
self._retry_task(itask, timer.timeout)
delay_msg = f"retrying in {timer.delay_timeout_as_str()}"
- LOG.warning(f"[{itask}] {delay_msg}")
+ LOG.warning(
+ f'[{itask}] {full_message or self.EVENT_FAILED} - '
+ f'{delay_msg}'
+ )
msg = f"{self.JOB_FAILED}, {delay_msg}"
self.setup_event_handlers(itask, self.EVENT_RETRY, msg)
self._reset_job_timers(itask)
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index 7ac12274d7b..6e996bced6d 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -14,7 +14,6 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
-from itertools import product
import logging
from typing import Any as Fixture
@@ -22,9 +21,7 @@
from cylc.flow.scheduler import Scheduler
from cylc.flow.data_store_mgr import (
JOBS,
- TASK_STATUSES_ORDERED,
TASK_STATUS_WAITING,
- TASK_STATUS_SUBMIT_FAILED,
)
@@ -170,3 +167,27 @@ async def test__always_insert_task_job(
'1/broken/01': 'submit-failed',
'1/broken2/01': 'submit-failed'
}
+
+
+async def test__process_message_failed_with_retry(one, start):
+ """Log job failure, even if a retry is scheduled.
+
+ See: https://github.com/cylc/cylc-flow/pull/6169
+
+ """
+
+ async with start(one) as LOG:
+ fail_once = one.pool.get_tasks()[0]
+ # Add retry timers:
+ one.task_job_mgr._set_retry_timers(
+ fail_once, {
+ 'execution retry delays': [1],
+ 'submission retry delays': [1]
+ })
+
+ # Process failed message:
+ one.task_events_mgr._process_message_failed(
+ fail_once, None, 'failed', False, 'failed/OOK')
+
+ # Check that failure reported:
+ assert 'failed/OOK' in LOG.messages[-1]
From 7b0a7b1375c6efc4e76811b85319d22fedc13548 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Thu, 24 Oct 2024 13:01:16 +0100
Subject: [PATCH 02/12] move logging to _process_message_ for
submit/failure
---
changes.d/6169.fix | 2 +-
cylc/flow/task_events_mgr.py | 12 ++++++++++--
2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/changes.d/6169.fix b/changes.d/6169.fix
index d116ea1ba75..c7ca6f74da8 100644
--- a/changes.d/6169.fix
+++ b/changes.d/6169.fix
@@ -1 +1 @@
-Ensure that job failure is logged, even when the presence of retries causes the task not to change state.
+Ensure that job submit/failure is logged, even when retries are planned.
diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py
index 7c3567448ef..d1c81085f42 100644
--- a/cylc/flow/task_events_mgr.py
+++ b/cylc/flow/task_events_mgr.py
@@ -928,11 +928,16 @@ def _process_message_check(
return False
severity_lvl: int = LOG_LEVELS.get(severity, INFO)
+ # Don't log submit/failure messages here:
+ if flag != self.FLAG_POLLED and message in {
+ self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR'
+ }:
+ return True
# Demote log level to DEBUG if this is a message that duplicates what
# gets logged by itask state change anyway (and not manual poll)
if severity_lvl > DEBUG and flag != self.FLAG_POLLED and message in {
self.EVENT_SUBMITTED, self.EVENT_STARTED, self.EVENT_SUCCEEDED,
- self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR'
+ self.EVENT_FAILED
}:
severity_lvl = DEBUG
LOG.log(severity_lvl, f"[{itask}] {flag}{message}{timestamp}")
@@ -1334,6 +1339,9 @@ def _process_message_failed(
self.data_store_mgr.delta_task_output(
itask, TASK_OUTPUT_FAILED)
self.data_store_mgr.delta_task_state(itask)
+ LOG.error(
+ f'[{itask}] {full_message or self.EVENT_FAILED} - '
+ )
else:
# There is an execution retry lined up.
timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
@@ -1414,7 +1422,6 @@ def _process_message_submit_failed(
Return True if no retries (hence go to the submit-failed state).
"""
no_retries = False
- LOG.critical(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
if event_time is None:
event_time = get_current_time_string()
self.workflow_db_mgr.put_update_task_jobs(itask, {
@@ -1440,6 +1447,7 @@ def _process_message_submit_failed(
self.data_store_mgr.delta_task_output(
itask, TASK_OUTPUT_SUBMIT_FAILED)
self.data_store_mgr.delta_task_state(itask)
+ LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
else:
# There is a submission retry lined up.
timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY]
From 00230a764de2c41aef23388dc082b9ad14225c9f Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Thu, 24 Oct 2024 13:18:45 +0100
Subject: [PATCH 03/12] update integration tests
---
tests/integration/test_task_events_mgr.py | 44 +++++++++++++++++------
1 file changed, 34 insertions(+), 10 deletions(-)
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index 6e996bced6d..a8da192b863 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -76,17 +76,22 @@ async def test__insert_task_job(flow, one_conf, scheduler, start, validate):
with correct submit number.
"""
conf = {
- 'scheduling': {'graph': {'R1': 'rhenas'}},
- 'runtime': {'rhenas': {'simulation': {
- 'fail cycle points': '1',
- 'fail try 1 only': False,
- }}}}
+ "scheduling": {"graph": {"R1": "rhenas"}},
+ "runtime": {
+ "rhenas": {
+ "simulation": {
+ "fail cycle points": "1",
+ "fail try 1 only": False,
+ }
+ }
+ },
+ }
id_ = flow(conf)
schd = scheduler(id_)
async with start(schd):
# Set task to running:
- itask = schd.pool.get_tasks()[0]
- itask.state.status = 'running'
+ itask = schd.pool.get_tasks()[0]
+ itask.state.status = "running"
itask.submit_num += 1
# Not run _insert_task_job yet:
@@ -185,9 +190,28 @@ async def test__process_message_failed_with_retry(one, start):
'submission retry delays': [1]
})
- # Process failed message:
+ # Process submit failed message with and without retries:
+ one.task_events_mgr._process_message_submit_failed(
+ fail_once, None, 1, False)
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.WARNING
+ assert '1/one:waiting(queued)' in last_record.message
+
+ one.task_events_mgr._process_message_submit_failed(
+ fail_once, None, 2, False)
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.ERROR
+ assert 'submission failed' in last_record.message
+
+ # Process failed message with and without retries:
one.task_events_mgr._process_message_failed(
fail_once, None, 'failed', False, 'failed/OOK')
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.WARNING
+ assert 'failed/OOK' in last_record.message
- # Check that failure reported:
- assert 'failed/OOK' in LOG.messages[-1]
+ one.task_events_mgr._process_message_failed(
+ fail_once, None, 'failed', False, 'failed/OOK')
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.ERROR
+ assert 'failed/OOK' in last_record.message
From 11b83802758af88f255c29fa5170158f2bfd0629 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Wed, 6 Nov 2024 10:08:14 +0000
Subject: [PATCH 04/12] remove EVENT_FAILED from debug demotion set
---
cylc/flow/task_events_mgr.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py
index d1c81085f42..121b4ae4e59 100644
--- a/cylc/flow/task_events_mgr.py
+++ b/cylc/flow/task_events_mgr.py
@@ -937,7 +937,6 @@ def _process_message_check(
# gets logged by itask state change anyway (and not manual poll)
if severity_lvl > DEBUG and flag != self.FLAG_POLLED and message in {
self.EVENT_SUBMITTED, self.EVENT_STARTED, self.EVENT_SUCCEEDED,
- self.EVENT_FAILED
}:
severity_lvl = DEBUG
LOG.log(severity_lvl, f"[{itask}] {flag}{message}{timestamp}")
From 5b5c3910edfb68d9535f76c801da4917e4dc6aa2 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Wed, 6 Nov 2024 13:18:39 +0000
Subject: [PATCH 05/12] [skip ci]wip
---
tests/functional/cylc-remove/00-simple/flow.cylc | 2 +-
tests/functional/cylc-remove/02-cycling/flow.cylc | 4 ++--
.../functional/cylc-trigger/02-filter-failed/flow.cylc | 6 +++---
.../functional/cylc-trigger/04-filter-names/flow.cylc | 10 +++++-----
tests/functional/hold-release/11-retrying/flow.cylc | 2 +-
tests/functional/reload/10-runahead.t | 2 +-
tests/functional/reload/25-xtriggers.t | 2 +-
tests/functional/reload/runahead/flow.cylc | 2 +-
.../functional/spawn-on-demand/10-retrigger/flow.cylc | 2 +-
tests/functional/triggering/19-and-suicide/flow.cylc | 2 +-
10 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/tests/functional/cylc-remove/00-simple/flow.cylc b/tests/functional/cylc-remove/00-simple/flow.cylc
index 84c740ad421..e1c23857b8f 100644
--- a/tests/functional/cylc-remove/00-simple/flow.cylc
+++ b/tests/functional/cylc-remove/00-simple/flow.cylc
@@ -15,7 +15,7 @@
script = false
[[cleaner]]
script = """
-cylc__job__poll_grep_workflow_log -E '1/b/01:running.* \(received\)failed'
+cylc__job__poll_grep_workflow_log -E '1/b/01.* failed'
# Remove the unhandled failed task
cylc remove "$CYLC_WORKFLOW_ID//1/b"
# Remove waiting 1/c
diff --git a/tests/functional/cylc-remove/02-cycling/flow.cylc b/tests/functional/cylc-remove/02-cycling/flow.cylc
index 3b6c1051493..5371d577394 100644
--- a/tests/functional/cylc-remove/02-cycling/flow.cylc
+++ b/tests/functional/cylc-remove/02-cycling/flow.cylc
@@ -17,8 +17,8 @@
[runtime]
[[remover]]
script = """
- cylc__job__poll_grep_workflow_log -E '2020/bar/01:running.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '2021/baz/01:running.* \(received\)failed'
+ cylc__job__poll_grep_workflow_log -E '2020/bar/01.* failed'
+ cylc__job__poll_grep_workflow_log -E '2021/baz/01.* failed'
# Remove the two unhandled failed tasks.
cylc remove "$CYLC_WORKFLOW_ID//*/ba*:failed"
# Remove the two unsatisfied waiting tasks.
diff --git a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
index 7416cf5790d..342f1449980 100644
--- a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
+++ b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
@@ -18,9 +18,9 @@
[[fixer]]
script = """
cylc__job__wait_cylc_message_started
- cylc__job__poll_grep_workflow_log -E '1/fixable1/01:running.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable2/01:running.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable3/01:running.* \(received\)failed'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:failed\] failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:failed\] failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:failed\] failed/ERR'
cylc trigger "${CYLC_WORKFLOW_ID}//1/fixable*"
"""
[[Z]]
diff --git a/tests/functional/cylc-trigger/04-filter-names/flow.cylc b/tests/functional/cylc-trigger/04-filter-names/flow.cylc
index 31839c1b77f..95caf5ddc4e 100644
--- a/tests/functional/cylc-trigger/04-filter-names/flow.cylc
+++ b/tests/functional/cylc-trigger/04-filter-names/flow.cylc
@@ -22,11 +22,11 @@
[[fixer]]
script = """
cylc__job__wait_cylc_message_started
- cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* \(received\)failed'
- cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* \(received\)failed'
+ cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* failed'
+ cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* failed'
+ cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* failed'
+ cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* failed'
+ cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* failed'
cylc trigger "${CYLC_WORKFLOW_ID}//" \
'//1/FIXABLE-1' '//1/fixable-2*' '//1/fixable-3'
"""
diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc
index 0e08699af09..0d747c4c798 100644
--- a/tests/functional/hold-release/11-retrying/flow.cylc
+++ b/tests/functional/hold-release/11-retrying/flow.cylc
@@ -18,7 +18,7 @@ t-retry-able => t-analyse
[[t-hold-release]]
script = """
cylc__job__poll_grep_workflow_log -E \
- '1/t-retry-able/01:running.* \(received\)failed'
+ '1/t-retry-able/01.*failed'
cylc__job__poll_grep_workflow_log -E \
'1/t-retry-able/01:running.* => waiting'
diff --git a/tests/functional/reload/10-runahead.t b/tests/functional/reload/10-runahead.t
index d591997f68b..fa93be9891a 100644
--- a/tests/functional/reload/10-runahead.t
+++ b/tests/functional/reload/10-runahead.t
@@ -30,7 +30,7 @@ run_fail "${TEST_NAME}" cylc play --debug --no-detach "${WORKFLOW_NAME}"
#-------------------------------------------------------------------------------
TEST_NAME=${TEST_NAME_BASE}-check-fail
DB_FILE="$RUN_DIR/${WORKFLOW_NAME}/log/db"
-QUERY='SELECT COUNT(*) FROM task_states WHERE status == "failed"'
+QUERY="SELECT COUNT(*) FROM task_states WHERE status == 'failed'"
cmp_ok <(sqlite3 "$DB_FILE" "$QUERY") <<< "4"
#-------------------------------------------------------------------------------
purge
diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t
index 0269a2e3775..db37d26c793 100644
--- a/tests/functional/reload/25-xtriggers.t
+++ b/tests/functional/reload/25-xtriggers.t
@@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[[reload]]
script = """
# wait for "broken" to fail
- cylc__job__poll_grep_workflow_log -E '1/broken/01.* \(received\)failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '1/broken/01.* failed/ERR'
# fix "broken" to allow it to pass
sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc"
# reload the workflow
diff --git a/tests/functional/reload/runahead/flow.cylc b/tests/functional/reload/runahead/flow.cylc
index c65b5e11d6d..b1fd8646bf3 100644
--- a/tests/functional/reload/runahead/flow.cylc
+++ b/tests/functional/reload/runahead/flow.cylc
@@ -20,7 +20,7 @@
script = true
[[reloader]]
script = """
- cylc__job__poll_grep_workflow_log -E "${CYLC_TASK_CYCLE_POINT}/foo/01:running.*\(received\)failed"
+ cylc__job__poll_grep_workflow_log -E "${CYLC_TASK_CYCLE_POINT}/foo/01:running.*failed"
perl -pi -e 's/(runahead limit = )P1( # marker)/\1 P3\2/' $CYLC_WORKFLOW_RUN_DIR/flow.cylc
cylc reload $CYLC_WORKFLOW_ID
"""
diff --git a/tests/functional/spawn-on-demand/10-retrigger/flow.cylc b/tests/functional/spawn-on-demand/10-retrigger/flow.cylc
index 7e9149ce3c9..38e9aafaeee 100644
--- a/tests/functional/spawn-on-demand/10-retrigger/flow.cylc
+++ b/tests/functional/spawn-on-demand/10-retrigger/flow.cylc
@@ -18,7 +18,7 @@
"""
[[triggerer]]
script = """
- cylc__job__poll_grep_workflow_log -E '1/oops/01:running.* \(received\)failed'
+ cylc__job__poll_grep_workflow_log -E '1/oops/01.* failed'
cylc trigger "${CYLC_WORKFLOW_ID}//1/oops"
"""
[[foo, bar]]
diff --git a/tests/functional/triggering/19-and-suicide/flow.cylc b/tests/functional/triggering/19-and-suicide/flow.cylc
index 670c361fc96..063360022a5 100644
--- a/tests/functional/triggering/19-and-suicide/flow.cylc
+++ b/tests/functional/triggering/19-and-suicide/flow.cylc
@@ -16,7 +16,7 @@
[[t0]]
# https://github.com/cylc/cylc-flow/issues/2655
# "1/t2" should not suicide on "1/t1:failed"
- script = cylc__job__poll_grep_workflow_log -E '1/t1.* \(received\)failed'
+ script = cylc__job__poll_grep_workflow_log -E '1/t1.* failed'
[[t1]]
script = false
[[t2]]
From 0b26abd5289aa5b8741f649b5c975ba511ea58d2 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Thu, 24 Oct 2024 13:18:45 +0100
Subject: [PATCH 06/12] update integration tests
---
.../hold-release/11-retrying/flow.cylc | 2 +-
tests/functional/reload/25-xtriggers.t | 4 +-
tests/integration/test_task_events_mgr.py | 44 ++++++++++++++-----
3 files changed, 37 insertions(+), 13 deletions(-)
diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc
index 0e08699af09..dbd1ec3ed59 100644
--- a/tests/functional/hold-release/11-retrying/flow.cylc
+++ b/tests/functional/hold-release/11-retrying/flow.cylc
@@ -18,7 +18,7 @@ t-retry-able => t-analyse
[[t-hold-release]]
script = """
cylc__job__poll_grep_workflow_log -E \
- '1/t-retry-able/01:running.* \(received\)failed'
+ '\[1/t-retry-able:waiting\] failed/ERR'
cylc__job__poll_grep_workflow_log -E \
'1/t-retry-able/01:running.* => waiting'
diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t
index 0269a2e3775..fdaa4ebb3f1 100644
--- a/tests/functional/reload/25-xtriggers.t
+++ b/tests/functional/reload/25-xtriggers.t
@@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[[reload]]
script = """
# wait for "broken" to fail
- cylc__job__poll_grep_workflow_log -E '1/broken/01.* \(received\)failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '1/broken.*failed/ERR'
# fix "broken" to allow it to pass
sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc"
# reload the workflow
@@ -63,7 +63,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play "${WORKFLOW_NAME}" --no-detach
log_scan "${TEST_NAME_BASE}-scan" \
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" \
1 1 \
- '1/broken.* (received)failed/ERR'
+ '1/broken.*failed/ERR'
log_scan "${TEST_NAME_BASE}-scan" \
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" 1 1 \
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index 6e996bced6d..a8da192b863 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -76,17 +76,22 @@ async def test__insert_task_job(flow, one_conf, scheduler, start, validate):
with correct submit number.
"""
conf = {
- 'scheduling': {'graph': {'R1': 'rhenas'}},
- 'runtime': {'rhenas': {'simulation': {
- 'fail cycle points': '1',
- 'fail try 1 only': False,
- }}}}
+ "scheduling": {"graph": {"R1": "rhenas"}},
+ "runtime": {
+ "rhenas": {
+ "simulation": {
+ "fail cycle points": "1",
+ "fail try 1 only": False,
+ }
+ }
+ },
+ }
id_ = flow(conf)
schd = scheduler(id_)
async with start(schd):
# Set task to running:
- itask = schd.pool.get_tasks()[0]
- itask.state.status = 'running'
+ itask = schd.pool.get_tasks()[0]
+ itask.state.status = "running"
itask.submit_num += 1
# Not run _insert_task_job yet:
@@ -185,9 +190,28 @@ async def test__process_message_failed_with_retry(one, start):
'submission retry delays': [1]
})
- # Process failed message:
+ # Process submit failed message with and without retries:
+ one.task_events_mgr._process_message_submit_failed(
+ fail_once, None, 1, False)
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.WARNING
+ assert '1/one:waiting(queued)' in last_record.message
+
+ one.task_events_mgr._process_message_submit_failed(
+ fail_once, None, 2, False)
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.ERROR
+ assert 'submission failed' in last_record.message
+
+ # Process failed message with and without retries:
one.task_events_mgr._process_message_failed(
fail_once, None, 'failed', False, 'failed/OOK')
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.WARNING
+ assert 'failed/OOK' in last_record.message
- # Check that failure reported:
- assert 'failed/OOK' in LOG.messages[-1]
+ one.task_events_mgr._process_message_failed(
+ fail_once, None, 'failed', False, 'failed/OOK')
+ last_record = LOG.records[-1]
+ assert last_record.levelno == logging.ERROR
+ assert 'failed/OOK' in last_record.message
From 5453cac7cdfad2560156ddbde87fb847d5af27a0 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Thu, 7 Nov 2024 07:46:13 +0000
Subject: [PATCH 07/12] Update cylc/flow/task_events_mgr.py
Co-authored-by: Hilary James Oliver
response to review
---
cylc/flow/task_events_mgr.py | 6 ++----
.../cylc-trigger/02-filter-failed/flow.cylc | 6 +++---
tests/integration/test_task_events_mgr.py | 12 +++++-------
3 files changed, 10 insertions(+), 14 deletions(-)
diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py
index 121b4ae4e59..821fc09c370 100644
--- a/cylc/flow/task_events_mgr.py
+++ b/cylc/flow/task_events_mgr.py
@@ -1324,6 +1324,7 @@ def _process_message_failed(
"run_status": 1,
"time_run_exit": event_time,
})
+ LOG.error(f'[{itask}] {full_message or self.EVENT_FAILED}')
if (
forced
or TimerFlags.EXECUTION_RETRY not in itask.try_timers
@@ -1338,9 +1339,6 @@ def _process_message_failed(
self.data_store_mgr.delta_task_output(
itask, TASK_OUTPUT_FAILED)
self.data_store_mgr.delta_task_state(itask)
- LOG.error(
- f'[{itask}] {full_message or self.EVENT_FAILED} - '
- )
else:
# There is an execution retry lined up.
timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
@@ -1428,6 +1426,7 @@ def _process_message_submit_failed(
"submit_status": 1,
})
itask.summary['submit_method_id'] = None
+ LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
if (
forced
or TimerFlags.SUBMISSION_RETRY not in itask.try_timers
@@ -1446,7 +1445,6 @@ def _process_message_submit_failed(
self.data_store_mgr.delta_task_output(
itask, TASK_OUTPUT_SUBMIT_FAILED)
self.data_store_mgr.delta_task_state(itask)
- LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
else:
# There is a submission retry lined up.
timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY]
diff --git a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
index 342f1449980..3da59a7d867 100644
--- a/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
+++ b/tests/functional/cylc-trigger/02-filter-failed/flow.cylc
@@ -18,9 +18,9 @@
[[fixer]]
script = """
cylc__job__wait_cylc_message_started
- cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:failed\] failed/ERR'
- cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:failed\] failed/ERR'
- cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:failed\] failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:running\] failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:running\] failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:running\] failed/ERR'
cylc trigger "${CYLC_WORKFLOW_ID}//1/fixable*"
"""
[[Z]]
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index a8da192b863..00557971135 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -174,7 +174,7 @@ async def test__always_insert_task_job(
}
-async def test__process_message_failed_with_retry(one, start):
+async def test__process_message_failed_with_retry(one, start, log_filter):
"""Log job failure, even if a retry is scheduled.
See: https://github.com/cylc/cylc-flow/pull/6169
@@ -199,9 +199,8 @@ async def test__process_message_failed_with_retry(one, start):
one.task_events_mgr._process_message_submit_failed(
fail_once, None, 2, False)
- last_record = LOG.records[-1]
- assert last_record.levelno == logging.ERROR
- assert 'submission failed' in last_record.message
+ failed_record = log_filter(LOG, level=logging.ERROR)[-1]
+ assert 'submission failed' in failed_record[2]
# Process failed message with and without retries:
one.task_events_mgr._process_message_failed(
@@ -212,6 +211,5 @@ async def test__process_message_failed_with_retry(one, start):
one.task_events_mgr._process_message_failed(
fail_once, None, 'failed', False, 'failed/OOK')
- last_record = LOG.records[-1]
- assert last_record.levelno == logging.ERROR
- assert 'failed/OOK' in last_record.message
+ failed_record = log_filter(LOG, level=logging.ERROR)[-1]
+ assert 'failed/OOK' in failed_record[2]
From 0def449824d58010943e6bc6e8bb0eb53eee1ba7 Mon Sep 17 00:00:00 2001
From: Tim Pillinger <26465611+wxtim@users.noreply.github.com>
Date: Tue, 7 Jan 2025 13:48:04 +0000
Subject: [PATCH 08/12] Apply suggestions from code review
Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com>
---
tests/functional/reload/25-xtriggers.t | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/functional/reload/25-xtriggers.t b/tests/functional/reload/25-xtriggers.t
index fdaa4ebb3f1..292661bbaa1 100644
--- a/tests/functional/reload/25-xtriggers.t
+++ b/tests/functional/reload/25-xtriggers.t
@@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[[reload]]
script = """
# wait for "broken" to fail
- cylc__job__poll_grep_workflow_log -E '1/broken.*failed/ERR'
+ cylc__job__poll_grep_workflow_log -E '1/broken/01.*failed/ERR'
# fix "broken" to allow it to pass
sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc"
# reload the workflow
@@ -63,7 +63,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play "${WORKFLOW_NAME}" --no-detach
log_scan "${TEST_NAME_BASE}-scan" \
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" \
1 1 \
- '1/broken.*failed/ERR'
+ '1/broken/01.*failed/ERR'
log_scan "${TEST_NAME_BASE}-scan" \
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" 1 1 \
From a06353166a9fa29ed5883583094a2925ff731b3d Mon Sep 17 00:00:00 2001
From: Tim Pillinger
Date: Tue, 7 Jan 2025 14:03:09 +0000
Subject: [PATCH 09/12] fixed test wrt to review
---
tests/integration/test_task_events_mgr.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index 00557971135..cf4cfc6253b 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -195,7 +195,7 @@ async def test__process_message_failed_with_retry(one, start, log_filter):
fail_once, None, 1, False)
last_record = LOG.records[-1]
assert last_record.levelno == logging.WARNING
- assert '1/one:waiting(queued)' in last_record.message
+ assert '1/one:waiting(queued)] retrying in' in last_record.message
one.task_events_mgr._process_message_submit_failed(
fail_once, None, 2, False)
From 42fdd8b1dc215225a01e4076828ffdf64dac4dde Mon Sep 17 00:00:00 2001
From: Tim Pillinger
Date: Wed, 8 Jan 2025 09:41:43 +0000
Subject: [PATCH 10/12] removed failure message from retry logging
---
cylc/flow/task_events_mgr.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/cylc/flow/task_events_mgr.py b/cylc/flow/task_events_mgr.py
index 821fc09c370..fa48ad271c5 100644
--- a/cylc/flow/task_events_mgr.py
+++ b/cylc/flow/task_events_mgr.py
@@ -1344,10 +1344,7 @@ def _process_message_failed(
timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
self._retry_task(itask, timer.timeout)
delay_msg = f"retrying in {timer.delay_timeout_as_str()}"
- LOG.warning(
- f'[{itask}] {full_message or self.EVENT_FAILED} - '
- f'{delay_msg}'
- )
+ LOG.warning(f'[{itask}] - {delay_msg}')
msg = f"{self.JOB_FAILED}, {delay_msg}"
self.setup_event_handlers(itask, self.EVENT_RETRY, msg)
self._reset_job_timers(itask)
From 463395f5086c55108269f11c6a02b7bbfd88bc71 Mon Sep 17 00:00:00 2001
From: Tim Pillinger
Date: Wed, 8 Jan 2025 09:44:38 +0000
Subject: [PATCH 11/12] modify retry message to remove repeating the error
---
tests/integration/test_task_events_mgr.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py
index cf4cfc6253b..eaa3561d371 100644
--- a/tests/integration/test_task_events_mgr.py
+++ b/tests/integration/test_task_events_mgr.py
@@ -207,7 +207,7 @@ async def test__process_message_failed_with_retry(one, start, log_filter):
fail_once, None, 'failed', False, 'failed/OOK')
last_record = LOG.records[-1]
assert last_record.levelno == logging.WARNING
- assert 'failed/OOK' in last_record.message
+ assert 'retrying in' in last_record.message
one.task_events_mgr._process_message_failed(
fail_once, None, 'failed', False, 'failed/OOK')
From 21973679268e3b8d11e9c4d07ff9ff49e8183855 Mon Sep 17 00:00:00 2001
From: Tim Pillinger
Date: Thu, 9 Jan 2025 09:32:51 +0000
Subject: [PATCH 12/12] fix another test
---
tests/functional/hold-release/11-retrying/flow.cylc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/functional/hold-release/11-retrying/flow.cylc b/tests/functional/hold-release/11-retrying/flow.cylc
index dbd1ec3ed59..6473d87e148 100644
--- a/tests/functional/hold-release/11-retrying/flow.cylc
+++ b/tests/functional/hold-release/11-retrying/flow.cylc
@@ -18,7 +18,7 @@ t-retry-able => t-analyse
[[t-hold-release]]
script = """
cylc__job__poll_grep_workflow_log -E \
- '\[1/t-retry-able:waiting\] failed/ERR'
+ '\[1/t-retry-able:waiting\] - retrying'
cylc__job__poll_grep_workflow_log -E \
'1/t-retry-able/01:running.* => waiting'