From f7a2c48f23d924bc9a12742983f866387a248e18 Mon Sep 17 00:00:00 2001 From: Chad Smith Date: Wed, 23 Aug 2023 14:12:22 -0600 Subject: [PATCH] status: treat SubState=running and MainPID=0 as service exited status: treat SubState=running and MainPID=0 as service exited Avoid cloud-init status --wait blocking indefinitely on systemd status SubState=running and MainPID=0 condition for cloud-init systemd services. When daemons are spawned by a systemd unit in the unified hierarchy, long-lived processes or daemons launched by the systemd unit will be tracked in the cgroup/slice. As long as spawned daemons of processes live, the systemd tooling will report the parent systemd unit as SubState=running instead of SubState=exited. This affected Ubuntu across the 23.04 -> 23.10 release boundary because a long-term patch to ignore cgroup status was dropped at systemd 252.5 which last existed in Ubuntu 23.04 (Lunar). Cloud-init status --wait checks MainPID == 0 when SubState=running as a secondary indicator the the executing process of the boot stage has completed despite any forked daemons. Additionally fix typo in puppet integration test name. Fixes GH-4373 --- cloudinit/cmd/status.py | 19 +++++---- .../integration_tests/modules/test_puppet.py | 2 +- tests/unittests/cmd/test_status.py | 41 ++++++++++++------- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/cloudinit/cmd/status.py b/cloudinit/cmd/status.py index a528f5da639..e0db81c8fd9 100644 --- a/cloudinit/cmd/status.py +++ b/cloudinit/cmd/status.py @@ -213,7 +213,7 @@ def _get_systemd_status() -> Optional[UXAppStatus]: [ "systemctl", "show", - "--property=ActiveState,UnitFileState,SubState", + "--property=ActiveState,UnitFileState,SubState,MainPID", service, ], ).stdout @@ -226,12 +226,17 @@ def _get_systemd_status() -> Optional[UXAppStatus]: ): # Individual services should not get disabled return UXAppStatus.ERROR - if ( - states["ActiveState"] == "active" - and states["SubState"] == "exited" - ): - # Service exited normally, nothing interesting from systemd - continue + if states["ActiveState"] == "active": + if states["SubState"] == "exited": + # Service exited normally, nothing interesting from systemd + continue + elif states["SubState"] == "running" and states["MainPID"] == "0": + # Service is active, substate still reports running due to + # daemon or backgroud process spawned by CGroup/slice still + # running. MainPID being set back to 0 means control of the + # service/unit has exited in this case and + # "the process is no longer around". + continue if states["ActiveState"] == "failed" or states["SubState"] == "failed": # We have an error return UXAppStatus.ERROR diff --git a/tests/integration_tests/modules/test_puppet.py b/tests/integration_tests/modules/test_puppet.py index b86138664a5..796f316a711 100644 --- a/tests/integration_tests/modules/test_puppet.py +++ b/tests/integration_tests/modules/test_puppet.py @@ -38,7 +38,7 @@ def test_puppet_service(client: IntegrationInstance): @pytest.mark.user_data @pytest.mark.user_data(EXEC_DATA) -def test_pupet_exec(client: IntegrationInstance): +def test_puppet_exec(client: IntegrationInstance): """Basic test that puppet gets installed and runs.""" log = client.read_from_file("/var/log/cloud-init.log") assert "Running command ['puppet', 'agent', '--noop']" in log diff --git a/tests/unittests/cmd/test_status.py b/tests/unittests/cmd/test_status.py index 7d4502e70b3..994209a1b9a 100644 --- a/tests/unittests/cmd/test_status.py +++ b/tests/unittests/cmd/test_status.py @@ -666,7 +666,7 @@ def test_status_main( class TestSystemdStatusDetails: @pytest.mark.parametrize( - ["active_state", "unit_file_state", "sub_state", "status"], + ["active_state", "unit_file_state", "sub_state", "main_pid", "status"], [ # To cut down on the combination of states, I'm grouping # enabled, enabled-runtime, and static into an "enabled" state @@ -675,31 +675,42 @@ class TestSystemdStatusDetails: # different depending on the ActiveState they are mapped too. # Because of this I'm only testing SubState combinations seen # in real-world testing (or using "any" string if we dont care). - ("activating", "enabled", "start", UXAppStatus.RUNNING), - ("active", "enabled-runtime", "exited", None), + ("activating", "enabled", "start", "123", UXAppStatus.RUNNING), + ("activating", "enabled", "start", "123", UXAppStatus.RUNNING), + ("active", "enabled-runtime", "exited", "0", None), + ("active", "enabled", "exited", "0", None), + ("active", "enabled", "running", "345", UXAppStatus.RUNNING), + ("active", "enabled", "running", "0", None), # Dead doesn't mean exited here. It means not run yet. - ("inactive", "static", "dead", UXAppStatus.RUNNING), - ("reloading", "enabled", "start", UXAppStatus.RUNNING), - ("deactivating", "enabled-runtime", "any", UXAppStatus.RUNNING), - ("failed", "static", "failed", UXAppStatus.ERROR), + ("inactive", "static", "dead", "123", UXAppStatus.RUNNING), + ("reloading", "enabled", "start", "123", UXAppStatus.RUNNING), + ( + "deactivating", + "enabled-runtime", + "any", + "123", + UXAppStatus.RUNNING, + ), + ("failed", "static", "failed", "0", UXAppStatus.ERROR), # Try previous combinations again with "not enabled" states - ("activating", "linked", "start", UXAppStatus.ERROR), - ("active", "linked-runtime", "exited", UXAppStatus.ERROR), - ("inactive", "masked", "dead", UXAppStatus.ERROR), - ("reloading", "masked-runtime", "start", UXAppStatus.ERROR), - ("deactivating", "disabled", "any", UXAppStatus.ERROR), - ("failed", "invalid", "failed", UXAppStatus.ERROR), + ("activating", "linked", "start", "0", UXAppStatus.ERROR), + ("active", "linked-runtime", "exited", "0", UXAppStatus.ERROR), + ("inactive", "masked", "dead", "0", UXAppStatus.ERROR), + ("reloading", "masked-runtime", "start", "0", UXAppStatus.ERROR), + ("deactivating", "disabled", "any", "0", UXAppStatus.ERROR), + ("failed", "invalid", "failed", "0", UXAppStatus.ERROR), ], ) def test_get_systemd_status( - self, active_state, unit_file_state, sub_state, status + self, active_state, unit_file_state, sub_state, main_pid, status ): with mock.patch( f"{M_PATH}subp.subp", return_value=SubpResult( f"ActiveState={active_state}\n" f"UnitFileState={unit_file_state}\n" - f"SubState={sub_state}", + f"SubState={sub_state}\n" + f"MainPID={main_pid}\n", stderr=None, ), ):