From f24ecef8b3c527acd83a97837502ccc7d417577d Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Sun, 5 May 2024 04:28:07 -0600 Subject: [PATCH] fix: Warn when signal is handled (#5186) Cloud-init is expected to run to completion. It does not expect to be stopped or restarted. Therefore, cloud-init should loudly warn when this happens so that the user is aware that the unexpected has happened. Problems: - When systemd stops cloud-init, no warning is logged. - When systemd restarts cloud-init, status info and return code from the current stage are lost. The net effect of these problems is that when cloud-init is restarted no warning is logged, and cloud-init status indicates no problem. Communicate unexpected state by: - Increase signal handling log level to ERROR, add log to stderr. - Write status.json before exiting when sys.exit() is called. - Warn when status.json contains existing data when the stage starts[1]. - Append existing errors and recoverable status.json errors rather than overwriting[2]. This should make cloud-init properly warn in status output and logs that something is amiss. [1] This should never happen and indicates that something restarted the service. When cloud-init doesn't have the opportunity to handle a signal (SIGKILL), other mitigations will fail. [2] Pre-existing warnings were previously silently lost from `cloud-init status`'s output. 
Fixes GH-5190 --- cloudinit/cmd/main.py | 117 +++++++++++++++++--------- cloudinit/signal_handler.py | 2 +- tests/integration_tests/util.py | 4 + tests/unittests/test_cli.py | 141 ++++++++++++++++++++++++++++++-- 4 files changed, 220 insertions(+), 44 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 92a78504b9e..96f6a750f63 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -56,6 +56,14 @@ "once": PER_ONCE, } +# https://cloudinit.readthedocs.io/en/latest/explanation/boot.html +STAGE_NAME = { + "init-local": "Local Stage", + "init": "Network Stage", + "modules-config": "Config Stage", + "modules-final": "Final Stage", +} + LOG = logging.getLogger(__name__) @@ -737,18 +745,26 @@ def status_wrapper(name, args): else: raise ValueError("unknown name: %s" % name) - modes = ( - "init", - "init-local", - "modules-config", - "modules-final", - ) - if mode not in modes: + if mode not in STAGE_NAME: raise ValueError( "Invalid cloud init mode specified '{0}'".format(mode) ) - status = None + nullstatus = { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + } + status = { + "v1": { + "datasource": None, + "init": nullstatus.copy(), + "init-local": nullstatus.copy(), + "modules-config": nullstatus.copy(), + "modules-final": nullstatus.copy(), + } + } if mode == "init-local": for f in (status_link, result_link, status_path, result_path): util.del_file(f) @@ -758,25 +774,21 @@ def status_wrapper(name, args): except Exception: pass - nullstatus = { - "errors": [], - "start": None, - "finished": None, - } - - if status is None: - status = {"v1": {}} - status["v1"]["datasource"] = None - - for m in modes: - if m not in status["v1"]: - status["v1"][m] = nullstatus.copy() + if mode not in status["v1"]: + # this should never happen, but leave it just to be safe + status["v1"][mode] = nullstatus.copy() v1 = status["v1"] v1["stage"] = mode - uptime = util.uptime() - v1[mode]["start"] = float(uptime) - 
v1[mode]["recoverable_errors"] = next( + if v1[mode]["start"] and not v1[mode]["finished"]: + # This stage was restarted, which isn't expected. + LOG.warning( + "Unexpected start time found for %s. Was this stage restarted?", + STAGE_NAME[mode], + ) + + v1[mode]["start"] = float(util.uptime()) + preexisting_recoverable_errors = next( filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) ).export_logs() @@ -795,27 +807,56 @@ def status_wrapper(name, args): else: errors = ret - v1[mode]["errors"] = [str(e) for e in errors] - + v1[mode]["errors"].extend([str(e) for e in errors]) except Exception as e: - util.logexc(LOG, "failed stage %s", mode) + LOG.exception("failed stage %s", mode) print_exc("failed run of stage %s" % mode) - v1[mode]["errors"] = [str(e)] - - v1[mode]["finished"] = float(util.uptime()) - v1["stage"] = None + v1[mode]["errors"].append(str(e)) + except SystemExit as e: + # All calls to sys.exit() resume running here. + # silence a pylint false positive + # https://github.com/pylint-dev/pylint/issues/9556 + if e.code: # pylint: disable=using-constant-test + # Only log errors when sys.exit() is called with a non-zero + # exit code + LOG.exception("failed stage %s", mode) + print_exc("failed run of stage %s" % mode) + v1[mode]["errors"].append(f"sys.exit({str(e.code)}) called") + finally: + # Before it exits, cloud-init will: + # 1) Write status.json (and result.json if in Final stage). + # 2) Write the final log message containing module run time. + # 3) Flush any queued reporting event handlers. 
+ v1[mode]["finished"] = float(util.uptime()) + v1["stage"] = None + + # merge new recoverable errors into existing recoverable error list + new_recoverable_errors = next( + filter( + lambda h: isinstance(h, log.LogExporter), root_logger.handlers + ) + ).export_logs() + for key in new_recoverable_errors.keys(): + if key in preexisting_recoverable_errors: + v1[mode]["recoverable_errors"][key] = list( + set( + preexisting_recoverable_errors[key] + + new_recoverable_errors[key] + ) + ) + else: + v1[mode]["recoverable_errors"][key] = new_recoverable_errors[ + key + ] - # Write status.json after running init / module code - v1[mode]["recoverable_errors"] = next( - filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) - ).export_logs() - atomic_helper.write_json(status_path, status) + # Write status.json after running init / module code + atomic_helper.write_json(status_path, status) if mode == "modules-final": # write the 'finished' file errors = [] - for m in modes: - if v1[m]["errors"]: + for m in v1.keys(): + if isinstance(v1[m], dict) and v1[m].get("errors"): errors.extend(v1[m].get("errors", [])) atomic_helper.write_json( diff --git a/cloudinit/signal_handler.py b/cloudinit/signal_handler.py index 63744178f3f..00eb4f4af05 100644 --- a/cloudinit/signal_handler.py +++ b/cloudinit/signal_handler.py @@ -44,7 +44,7 @@ def _handle_exit(signum, frame): contents = StringIO() contents.write("%s\n" % (msg)) _pprint_frame(frame, 1, BACK_FRAME_TRACE_DEPTH, contents) - util.multi_log(contents.getvalue(), console=True, stderr=False, log=LOG) + util.multi_log(contents.getvalue(), log=LOG, log_level=logging.ERROR) sys.exit(rc) diff --git a/tests/integration_tests/util.py b/tests/integration_tests/util.py index 1bcc26bccfa..71cdf631b59 100644 --- a/tests/integration_tests/util.py +++ b/tests/integration_tests/util.py @@ -184,6 +184,10 @@ def verify_clean_log(log: str, ignore_deprecations: bool = True): raise AssertionError( "Found unexpected errors: %s" % 
"\n".join(error_logs) ) + if re.findall("Cloud-init.*received SIG", log): + raise AssertionError( + "Found unexpected signal termination: %s" % "\n".join(error_logs) + ) warning_count = log.count("[WARNING]") expected_warnings = 0 diff --git a/tests/unittests/test_cli.py b/tests/unittests/test_cli.py index b17d719fa20..b3851732994 100644 --- a/tests/unittests/test_cli.py +++ b/tests/unittests/test_cli.py @@ -2,8 +2,10 @@ import contextlib import io +import json import logging import os +import sys from collections import namedtuple import pytest @@ -16,6 +18,7 @@ M_PATH = "cloudinit.cmd.main." Tmpdir = namedtuple("Tmpdir", ["tmpdir", "link_d", "data_d"]) +FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) @pytest.fixture() @@ -76,7 +79,6 @@ def _call_main(self, sysv_args=None): def test_status_wrapper_errors( self, action, name, match, caplog, mock_status_wrapper ): - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) my_action = mock.Mock() myargs = FakeArgs((action, my_action), False, "bogusmode") @@ -102,8 +104,6 @@ def test_status_wrapper_init_local_writes_fresh_status_info( str(_dir), {"status.json": "old", "result.json": "old"} ) - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) - def myaction(name, args): # Return an error to watch status capture them return "SomeDatasource", ["an error"] @@ -140,8 +140,6 @@ def test_status_wrapper_init_local_honor_cloud_dir( data_d = mock_status_wrapper.data_d link_d = mock_status_wrapper.link_d - FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"]) - def myaction(name, args): # Return an error to watch status capture them return "SomeDatasource", ["an_error"] @@ -414,3 +412,136 @@ def test_features_hook_subcommand(self, m_features): assert "features" == parseargs.action[0] assert False is parseargs.debug assert False is parseargs.force + + +class TestSignalHandling: + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_signal_sys_exit( + 
self, + m_json, + mocker, + mock_status_wrapper, + ): + """make sure that when sys.exit(N) is called, the correct code is + returned + """ + for code in [1, 2, 3, 4]: + rc = cli.status_wrapper( + "init", + FakeArgs( + ( + None, + # silence pylint false positive + # https://github.com/pylint-dev/pylint/issues/9557 + lambda *_: sys.exit(code), # pylint: disable=W0640 + ), + False, + "bogusmode", + ), + ) + assert 1 == rc + + # assert that the status shows errors + assert ( + f"sys.exit({code}) called" + in m_json.call_args[0][1]["v1"]["init"]["errors"] + ) + + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_no_signal_sys_exit( + self, + m_json, + mock_status_wrapper, + ): + """if sys.exit(0) is called, make sure that cloud-init doesn't log a + warning""" + # call status_wrapper() with the required args + rc = cli.status_wrapper( + "init", + FakeArgs( + ( + None, + lambda *_: sys.exit(0), + ), + False, + "bogusmode", + ), + ) + assert 0 == rc + assert not m_json.call_args[0][1]["v1"]["init"]["errors"] + + @mock.patch("cloudinit.cmd.main.atomic_helper.write_json") + def test_status_wrapper_signal_warnings( + self, + m_json, + mock_status_wrapper, + ): + """If a stage is started and status.json already has a start time but + no end time for that stage, this is an unknown state - make sure that + a warning is logged. 
+ """ + + # Write a status.json to the mocked temporary directory + for dir in mock_status_wrapper.data_d, mock_status_wrapper.link_d: + test_helpers.populate_dir( + str(dir), + { + "status.json": json.dumps( + { + "v1": { + "stage": "init", + "datasource": ( + "DataSourceNoCloud " + "[seed=/var/.../seed/nocloud-net]" + "[dsmode=net]" + ), + "init": { + "errors": [], + "recoverable_errors": {}, + "start": 124.567, + "finished": None, + }, + "init-local": { + "errors": [], + "recoverable_errors": {}, + "start": 100.0, + "finished": 100.00001, + }, + "modules-config": { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + }, + "modules-final": { + "errors": [], + "recoverable_errors": {}, + "start": None, + "finished": None, + }, + } + } + ) + }, + ) + # call status_wrapper() with the required args + cli.status_wrapper( + "init", + FakeArgs( + ( + None, + lambda *_: ("SomeDataSource", []), + ), + False, + "bogusmode", + ), + ) + + # assert that the status shows recoverable errors + assert ( + "Unexpected start time found for Network Stage. " + "Was this stage restarted?" + in m_json.call_args[0][1]["v1"]["init"]["recoverable_errors"][ + "WARNING" + ] + )