From f24ecef8b3c527acd83a97837502ccc7d417577d Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Sun, 5 May 2024 04:28:07 -0600 Subject: [PATCH] fix: Warn when signal is handled (#5186) Cloud-init is expected to run to completion. It does not expect to be stopped or restarted. Therefore, cloud-init should loudly warn when this happens so that the user is aware that the unexpected has happened. Problems: - When systemd stops cloud-init, no warning is logged. - When systemd restarts cloud-init, status info and return code from the current stage are lost. The net effect of these problems is that when cloud-init is restarted no warning is logged, and cloud-init status indicates no problem. Communicate unexpected state by: - Increase signal handling log level to ERROR, add log to stderr. - Write status.json before exiting when sys.exit() is called. - Warn when status.json contains existing data when the stage starts[1]. - Append existing errors and recoverable status.json errors rather than overwriting[2]. This should make cloud-init properly warn in status output and logs that something is amiss. [1] This should never happen and indicates that something restarted the service. When cloud-init doesn't have the opportunity to handle a signal (SIGKILL), other mitigations will fail. [2] Pre-existing warnings were previously silently lost from `cloud-init status`'s output. 
Fixes GH-5190 --- cloudinit/cmd/main.py | 117 +++++++++++++++++--------- cloudinit/signal_handler.py | 2 +- tests/integration_tests/util.py | 4 + tests/unittests/test_cli.py | 141 ++++++++++++++++++++++++++++++-- 4 files changed, 220 insertions(+), 44 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 92a78504b9e..96f6a750f63 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -56,6 +56,14 @@ "once": PER_ONCE, } +# https://cloudinit.readthedocs.io/en/latest/explanation/boot.html +STAGE_NAME = { + "init-local": "Local Stage", + "init": "Network Stage", + "modules-config": "Config Stage", + "modules-final": "Final Stage", +} + LOG = logging.getLogger(__name__) @@ -737,18 +745,26 @@ def status_wrapper(name, args): else: raise ValueError("unknown name: %s" % name) - modes = ( - "init", - "init-local", - "modules-config", - "modules-final", - ) - if mode not in modes: + if mode not in STAGE_NAME: raise ValueError( "Invalid cloud init mode specified '{0}'".format(mode) ) - status = None + nullstatus = { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + } + status = { + "v1": { + "datasource": None, + "init": nullstatus.copy(), + "init-local": nullstatus.copy(), + "modules-config": nullstatus.copy(), + "modules-final": nullstatus.copy(), + } + } if mode == "init-local": for f in (status_link, result_link, status_path, result_path): util.del_file(f) @@ -758,25 +774,21 @@ def status_wrapper(name, args): except Exception: pass - nullstatus = { - "errors": [], - "start": None, - "finished": None, - } - - if status is None: - status = {"v1": {}} - status["v1"]["datasource"] = None - - for m in modes: - if m not in status["v1"]: - status["v1"][m] = nullstatus.copy() + if mode not in status["v1"]: + # this should never happen, but leave it just to be safe + status["v1"][mode] = nullstatus.copy() v1 = status["v1"] v1["stage"] = mode - uptime = util.uptime() - v1[mode]["start"] = float(uptime) - 
v1[mode]["recoverable_errors"] = next( + if v1[mode]["start"] and not v1[mode]["finished"]: + # This stage was restarted, which isn't expected. + LOG.warning( + "Unexpected start time found for %s. Was this stage restarted?", + STAGE_NAME[mode], + ) + + v1[mode]["start"] = float(util.uptime()) + preexisting_recoverable_errors = next( filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) ).export_logs() @@ -795,27 +807,56 @@ def status_wrapper(name, args): else: errors = ret - v1[mode]["errors"] = [str(e) for e in errors] - + v1[mode]["errors"].extend([str(e) for e in errors]) except Exception as e: - util.logexc(LOG, "failed stage %s", mode) + LOG.exception("failed stage %s", mode) print_exc("failed run of stage %s" % mode) - v1[mode]["errors"] = [str(e)] - - v1[mode]["finished"] = float(util.uptime()) - v1["stage"] = None + v1[mode]["errors"].append(str(e)) + except SystemExit as e: + # All calls to sys.exit() resume running here. + # silence a pylint false positive + # https://github.com/pylint-dev/pylint/issues/9556 + if e.code: # pylint: disable=using-constant-test + # Only log errors when sys.exit() is called with a non-zero + # exit code + LOG.exception("failed stage %s", mode) + print_exc("failed run of stage %s" % mode) + v1[mode]["errors"].append(f"sys.exit({str(e.code)}) called") + finally: + # Before it exits, cloud-init will: + # 1) Write status.json (and result.json if in Final stage). + # 2) Write the final log message containing module run time. + # 3) Flush any queued reporting event handlers. 
+ v1[mode]["finished"] = float(util.uptime()) + v1["stage"] = None + + # merge new recoverable errors into existing recoverable error list + new_recoverable_errors = next( + filter( + lambda h: isinstance(h, log.LogExporter), root_logger.handlers + ) + ).export_logs() + for key in new_recoverable_errors.keys(): + if key in preexisting_recoverable_errors: + v1[mode]["recoverable_errors"][key] = list( + set( + preexisting_recoverable_errors[key] + + new_recoverable_errors[key] + ) + ) + else: + v1[mode]["recoverable_errors"][key] = new_recoverable_errors[ + key + ] - # Write status.json after running init / module code - v1[mode]["recoverable_errors"] = next( - filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) - ).export_logs() - atomic_helper.write_json(status_path, status) + # Write status.json after running init / module code + atomic_helper.write_json(status_path, status) if mode == "modules-final": # write the 'finished' file errors = [] - for m in modes: - if v1[m]["errors"]: + for m in v1.keys(): + if isinstance(v1[m], dict) and v1[m].get("errors"): errors.extend(v1[m].get("errors", [])) atomic_helper.write_json( diff --git a/cloudinit/signal_handler.py b/cloudinit/signal_handler.py index 63744178f3f..00eb4f4af05 100644 --- a/cloudinit/signal_handler.py +++ b/cloudinit/signal_handler.py @@ -44,7 +44,7 @@ def _handle_exit(signum, frame): contents = StringIO() contents.write("%s\n" % (msg)) _pprint_frame(frame, 1, BACK_FRAME_TRACE_DEPTH, contents) - util.multi_log(contents.getvalue(), console=True, stderr=False, log=LOG) + util.multi_log(contents.getvalue(), log=LOG, log_level=logging.ERROR) sys.exit(rc) diff --git a/tests/integration_tests/util.py b/tests/integration_tests/util.py index 1bcc26bccfa..71cdf631b59 100644 --- a/tests/integration_tests/util.py +++ b/tests/integration_tests/util.py @@ -184,6 +184,10 @@ def verify_clean_log(log: str, ignore_deprecations: bool = True): raise AssertionError( "Found unexpected errors: %s" % 
"\n".join(error_logs) ) + if re.findall("Cloud-init.*received SIG", log): + raise AssertionError( + "Found unexpected signal termination: %s" % "\n".join(error_logs) + ) warning_count = log.count("[WARNING]") expected_warnings = 0 diff --git a/tests/unittests/test_cli.py b/tests/unittests/test_cli.py index b17d719fa20..b3851732994 100644 --- a/tests/unittests/test_cli.py +++ b/tests/unittests/test_cli.py @@ -2,8 +2,10 @@ import contextlib import io +import json import logging import os +import sys from collections import namedtuple import pytest @@ -16,6 +18,7 @@ M_PATH = "cloudinit.cmd.main." Tmpdir = namedtuple("Tmpdir", ["tmpdir", "link_d", "data_d"]) +FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) @pytest.fixture() @@ -76,7 +79,6 @@ def _call_main(self, sysv_args=None): def test_status_wrapper_errors( self, action, name, match, caplog, mock_status_wrapper ): - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) my_action = mock.Mock() myargs = FakeArgs((action, my_action), False, "bogusmode") @@ -102,8 +104,6 @@ def test_status_wrapper_init_local_writes_fresh_status_info( str(_dir), {"status.json": "old", "result.json": "old"} ) - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) - def myaction(name, args): # Return an error to watch status capture them return "SomeDatasource", ["an error"] @@ -140,8 +140,6 @@ def test_status_wrapper_init_local_honor_cloud_dir( data_d = mock_status_wrapper.data_d link_d = mock_status_wrapper.link_d - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) - def myaction(name, args): # Return an error to watch status capture them return "SomeDatasource", ["an_error"] @@ -414,3 +412,136 @@ def test_features_hook_subcommand(self, m_features): assert "features" == parseargs.action[0] assert False is parseargs.debug assert False is parseargs.force + + +class TestSignalHandling: + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_signal_sys_exit( + 
self, + m_json, + mocker, + mock_status_wrapper, + ): + """make sure that when sys.exit(N) is called, the correct code is + returned + """ + for code in [1, 2, 3, 4]: + rc = cli.status_wrapper( + "init", + FakeArgs( + ( + None, + # silence pylint false positive + # https://github.com/pylint-dev/pylint/issues/9557 + lambda *_: sys.exit(code), # pylint: disable=W0640 + ), + False, + "bogusmode", + ), + ) + assert 1 == rc + + # assert that the status shows errors + assert ( + f"sys.exit({code}) called" + in m_json.call_args[0][1]["v1"]["init"]["errors"] + ) + + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_no_signal_sys_exit( + self, + m_json, + mock_status_wrapper, + ): + """if sys.exit(0) is called, make sure that cloud-init doesn't log a + warning""" + # call status_wrapper() with the required args + rc = cli.status_wrapper( + "init", + FakeArgs( + ( + None, + lambda *_: sys.exit(0), + ), + False, + "bogusmode", + ), + ) + assert 0 == rc + assert not m_json.call_args[0][1]["v1"]["init"]["errors"] + + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_signal_warnings( + self, + m_json, + mock_status_wrapper, + ): + """If a stage is started and status.json already has a start time but + no end time for that stage, this is an unknown state - make sure that + a warning is logged. 
+ """ + + # Write a status.json to the mocked temporary directory + for dir in mock_status_wrapper.data_d, mock_status_wrapper.link_d: + test_helpers.populate_dir( + str(dir), + { + "status.json": json.dumps( + { + "v1": { + "stage": "init", + "datasource": ( + "DataSourceNoCloud " + "[seed=/var/.../seed/nocloud-net]" + "[dsmode=net]" + ), + "init": { + "errors": [], + "recoverable_errors": {}, + "start": 124.567, + "finished": None, + }, + "init-local": { + "errors": [], + "recoverable_errors": {}, + "start": 100.0, + "finished": 100.00001, + }, + "modules-config": { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + }, + "modules-final": { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + }, + } + } + ) + }, + ) + # call status_wrapper() with the required args + cli.status_wrapper( + "init", + FakeArgs( + ( + None, + lambda *_: ("SomeDataSource", []), + ), + False, + "bogusmode", + ), + ) + + # assert that the status shows recoverable errors + assert ( + "Unexpected start time found for Network Stage. " + "Was this stage restarted?" + in m_json.call_args[0][1]["v1"]["init"]["recoverable_errors"][ + "WARNING" + ] + )