Skip to content

Commit

Permalink
status: treat SubState=running and MainPID=0 as service exited
Browse files Browse the repository at this point in the history
status: treat SubState=running and MainPID=0 as service exited

Avoid cloud-init status --wait blocking indefinitely on systemd
status SubState=running and MainPID=0 condition for cloud-init
systemd services.

When daemons are spawned by a systemd unit
in the unified hierarchy, long-lived processes or daemons launched by
the systemd unit will be tracked in the cgroup/slice.
As long as spawned daemons of processes live, the systemd tooling will
report the parent systemd unit as SubState=running instead of
SubState=exited.

This affected Ubuntu across the 23.04 -> 23.10 release boundary
because a long-term patch to ignore cgroup status was dropped at
systemd 252.5 which last existed in Ubuntu 23.04 (Lunar).

Cloud-init status --wait checks MainPID == 0 when SubState=running
as a secondary indicator the the executing process of the boot
stage has completed despite any forked daemons.

Additionally fix typo in puppet integration test name.

Fixes GH-4373
  • Loading branch information
blackboxsw authored Aug 23, 2023
1 parent b669f31 commit f7a2c48
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 23 deletions.
19 changes: 12 additions & 7 deletions cloudinit/cmd/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def _get_systemd_status() -> Optional[UXAppStatus]:
[
"systemctl",
"show",
"--property=ActiveState,UnitFileState,SubState",
"--property=ActiveState,UnitFileState,SubState,MainPID",
service,
],
).stdout
Expand All @@ -226,12 +226,17 @@ def _get_systemd_status() -> Optional[UXAppStatus]:
):
# Individual services should not get disabled
return UXAppStatus.ERROR
if (
states["ActiveState"] == "active"
and states["SubState"] == "exited"
):
# Service exited normally, nothing interesting from systemd
continue
if states["ActiveState"] == "active":
if states["SubState"] == "exited":
# Service exited normally, nothing interesting from systemd
continue
elif states["SubState"] == "running" and states["MainPID"] == "0":
# Service is active, substate still reports running due to
# daemon or backgroud process spawned by CGroup/slice still
# running. MainPID being set back to 0 means control of the
# service/unit has exited in this case and
# "the process is no longer around".
continue
if states["ActiveState"] == "failed" or states["SubState"] == "failed":
# We have an error
return UXAppStatus.ERROR
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_tests/modules/test_puppet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_puppet_service(client: IntegrationInstance):

@pytest.mark.user_data
@pytest.mark.user_data(EXEC_DATA)
def test_pupet_exec(client: IntegrationInstance):
def test_puppet_exec(client: IntegrationInstance):
"""Basic test that puppet gets installed and runs."""
log = client.read_from_file("/var/log/cloud-init.log")
assert "Running command ['puppet', 'agent', '--noop']" in log
41 changes: 26 additions & 15 deletions tests/unittests/cmd/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ def test_status_main(

class TestSystemdStatusDetails:
@pytest.mark.parametrize(
["active_state", "unit_file_state", "sub_state", "status"],
["active_state", "unit_file_state", "sub_state", "main_pid", "status"],
[
# To cut down on the combination of states, I'm grouping
# enabled, enabled-runtime, and static into an "enabled" state
Expand All @@ -675,31 +675,42 @@ class TestSystemdStatusDetails:
# different depending on the ActiveState they are mapped too.
# Because of this I'm only testing SubState combinations seen
# in real-world testing (or using "any" string if we dont care).
("activating", "enabled", "start", UXAppStatus.RUNNING),
("active", "enabled-runtime", "exited", None),
("activating", "enabled", "start", "123", UXAppStatus.RUNNING),
("activating", "enabled", "start", "123", UXAppStatus.RUNNING),
("active", "enabled-runtime", "exited", "0", None),
("active", "enabled", "exited", "0", None),
("active", "enabled", "running", "345", UXAppStatus.RUNNING),
("active", "enabled", "running", "0", None),
# Dead doesn't mean exited here. It means not run yet.
("inactive", "static", "dead", UXAppStatus.RUNNING),
("reloading", "enabled", "start", UXAppStatus.RUNNING),
("deactivating", "enabled-runtime", "any", UXAppStatus.RUNNING),
("failed", "static", "failed", UXAppStatus.ERROR),
("inactive", "static", "dead", "123", UXAppStatus.RUNNING),
("reloading", "enabled", "start", "123", UXAppStatus.RUNNING),
(
"deactivating",
"enabled-runtime",
"any",
"123",
UXAppStatus.RUNNING,
),
("failed", "static", "failed", "0", UXAppStatus.ERROR),
# Try previous combinations again with "not enabled" states
("activating", "linked", "start", UXAppStatus.ERROR),
("active", "linked-runtime", "exited", UXAppStatus.ERROR),
("inactive", "masked", "dead", UXAppStatus.ERROR),
("reloading", "masked-runtime", "start", UXAppStatus.ERROR),
("deactivating", "disabled", "any", UXAppStatus.ERROR),
("failed", "invalid", "failed", UXAppStatus.ERROR),
("activating", "linked", "start", "0", UXAppStatus.ERROR),
("active", "linked-runtime", "exited", "0", UXAppStatus.ERROR),
("inactive", "masked", "dead", "0", UXAppStatus.ERROR),
("reloading", "masked-runtime", "start", "0", UXAppStatus.ERROR),
("deactivating", "disabled", "any", "0", UXAppStatus.ERROR),
("failed", "invalid", "failed", "0", UXAppStatus.ERROR),
],
)
def test_get_systemd_status(
self, active_state, unit_file_state, sub_state, status
self, active_state, unit_file_state, sub_state, main_pid, status
):
with mock.patch(
f"{M_PATH}subp.subp",
return_value=SubpResult(
f"ActiveState={active_state}\n"
f"UnitFileState={unit_file_state}\n"
f"SubState={sub_state}",
f"SubState={sub_state}\n"
f"MainPID={main_pid}\n",
stderr=None,
),
):
Expand Down

0 comments on commit f7a2c48

Please sign in to comment.