From f7ccda9c765f83de140ffa60d70a7c35a5cca145 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 2 Apr 2024 06:18:14 -0600 Subject: [PATCH 01/22] feat: Single process optimization Python interpreter initialization and module import time is a significant portion of cloud-init's total runtime when the default configuration is used, and in all cases it contributes a significant amount of wall clock time to cloud-init's runtime. This commit significantly improves cloud-init time to completion by eliminating redundant interpreter starts and module loads. Since multiple cloud-init processes sit in the critical chain of the boot order, this significantly reduces cloud-init's time to ssh and time to completion. Cloud-init has four stages. Each stage starts its own Python interpreter and loads the same libraries. To eliminate the redundant work of starting an interpreter and loading libraries, this changes cloud-init to run as a single process. Systemd service ordering is retained by using the existing cloud-init services as shims which use a synchronization protocol to start each cloud-init stage and to communicate that each stage is complete to the init system. Currently only systemd is supported, but the synchronization protocol should be capable of supporting other init systems as well with minor changes. Note: This makes possible many additional improvements that eliminate redundant work. However, these potential improvements are temporarily ignored. This commit has been structured to minimize the changes required to capture the majority of primary performance savings while preserving correctness and the ability to preserve backwards compatibility. Many additional improvements will be possible once this merges. Synchronization protocol ======================== - create one Unix socket for each systemd service stage - send sd_notify() - For each of the four stages (local, network, config, final): - when init system sends "start" to the Unix socket, start the stage - when running stage is complete, send "done" to Unix socket socket.py (new) --------------- - define a systemd-notify helper function - define a context manager which implements a multi-socket synchronization protocol cloud-init-single.service (new) ------------------------------- - use service type to 'notify' - invoke cloud-init in single process mode - adopt systemd ordering requirements from cloud-init-local.service - adopt KillMode from cloud-final.service main.py ------- - Add command line flag to indicate single process mode - In this mode run each stage followed by an IPC synchronization protocol step cloud-{local,init,config,final}.services ---------------------------------- - change ExecStart to use netcat to connect to Unix socket and: - send a start message - wait for completion response - note: a pure Python equivalent is possible for any downstreams which do not package openbsd's netcat cloud-final.services -------------------- - drop KillMode cloud-init-local.services -------------------- - drop dependencies made redundant by ordering after cloud-init-single.service Performance Results =================== An integration test [1] on a Noble lxd container comparing POC to current release showed significant improvement. In the POC, cloud-config.service didn't register in the critical-chain (~340ms improvement), cloud-init.service added ~378ms to total boot time (~400ms improvement), and cloud-init-local.service had a marginal improvement (~90ms) which was likely in the threshold of noise. The total improvement in this (early stage) test showed a 0.83s improvement to total boot time with 0.66s of boot time remaining due to cloud-init. This 0.83s second improvement roughly corresponds to the total boot time, with systemd-analyze critical-chain reporting 2.267s to reach graphical.target, which is a 0.8s improvement over the current release time. Note: The numbers quoted above gathered from only one series (Noble), one platform (lxc), one host machine (Ryzen 7840U), and run environment was not controlled. I ran the test multiple times to ensure that the results were repeatable, but not enough times to be considered statistically significant. I verified that cloud-init worked as expected, but given the limited scope of this integration test, this is still very much a proof of concept. [1] test_logging.py BREAKING_CHANGE: Run all four cloud-init services as a single systemd service. --- cloudinit/cmd/main.py | 54 +++++++- cloudinit/socket.py | 117 ++++++++++++++++++ systemd/cloud-config.service.tmpl | 3 +- systemd/cloud-final.service.tmpl | 4 +- systemd/cloud-init-local.service.tmpl | 4 +- systemd/cloud-init-single.service | 27 ++++ systemd/cloud-init.service.tmpl | 3 +- tests/unittests/test_cli.py | 4 +- tests/unittests/test_single_process.py | 164 +++++++++++++++++++++++++ 9 files changed, 369 insertions(+), 11 deletions(-) create mode 100644 cloudinit/socket.py create mode 100644 systemd/cloud-init-single.service create mode 100644 tests/unittests/test_single_process.py diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 54ba79e1bbc..3a78d2f0805 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -24,6 +24,7 @@ from cloudinit import netinfo from cloudinit import signal_handler from cloudinit import sources +from cloudinit import socket from cloudinit import stages from cloudinit import url_helper from cloudinit import util @@ -38,7 +39,12 @@ from cloudinit.config.schema import validate_cloudconfig_schema from cloudinit import log from cloudinit.reporting import events -from cloudinit.settings import PER_INSTANCE, PER_ALWAYS, PER_ONCE, CLOUD_CONFIG +from cloudinit.settings import ( + PER_INSTANCE, + PER_ALWAYS, + PER_ONCE, + CLOUD_CONFIG, +) # Welcome message template WELCOME_MSG_TPL = ( @@ -953,9 +959,19 @@ def main(sysv_args=None): default=False, ) + parser.add_argument( + "--single-process", + dest="single_process", + action="store_true", + help=( + "Run run the four stages as a single process as an optimization." + "Requires init system integration." + ), + default=False, + ) + parser.set_defaults(reporter=None) subparsers = parser.add_subparsers(title="Subcommands", dest="subcommand") - subparsers.required = True # Each action and its sub-options (if any) parser_init = subparsers.add_parser( @@ -1143,8 +1159,42 @@ def main(sysv_args=None): status_parser(parser_status) parser_status.set_defaults(action=("status", handle_status_args)) + else: + parser.error("a subcommand is required") args = parser.parse_args(args=sysv_args) + if not args.single_process: + return sub_main(args) + LOG.info("Running cloud-init in single process mode.") + + # this _must_ be called before sd_notify is called otherwise netcat may + # attempt to send "start" before a socket exists + sync = socket.SocketSync("local", "network", "config", "final") + + # notify cloud-init-local.service that this stage has completed + socket.sd_notify(b"READY=1") + + # wait for cloud-init-local.service to start + with sync("local"): + sub_main(parser.parse_args(args=["init", "--local"])) + + # wait for cloud-init.service to start + with sync("network"): + # init stage + sub_main(parser.parse_args(args=["init"])) + + # wait for cloud-config.service to start + with sync("config"): + # config stage + sub_main(parser.parse_args(args=["modules", "--mode=config"])) + + with sync("final"): + # final stage + sub_main(parser.parse_args(args=["modules", "--mode=final"])) + socket.sd_notify(b"STOPPING=1") + + +def sub_main(args): # Subparsers.required = True and each subparser sets action=(name, functor) (name, functor) = args.action diff --git a/cloudinit/socket.py b/cloudinit/socket.py new file mode 100644 index 00000000000..685f43f28f9 --- /dev/null +++ b/cloudinit/socket.py @@ -0,0 +1,117 @@ +# This file is part of cloud-init. See LICENSE file for license information. +"""A module for common socket helpers.""" +import logging +import os +import socket +from contextlib import suppress + +from cloudinit.settings import DEFAULT_RUN_DIR + +LOG = logging.getLogger(__name__) + + +def sd_notify(message: bytes): + """Send a sd_notify message.""" + LOG.info("Sending sd_notify(%s)", str(message)) + socket_path = os.environ.get("NOTIFY_SOCKET", "") + + # abstract + if socket_path[0] == "@": + socket_path.replace("@", "\0", 1) + + # unix domain + elif not socket_path[0] == "/": + raise OSError("Unsupported socket type") + + with socket.socket( + socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC + ) as sock: + sock.connect(socket_path) + sock.sendall(message) + + +class SocketSync: + """A two way synchronization protocol over Unix domain sockets.""" + + def __init__(self, *names: str): + """Initialize a synchronization context. + + 1) Ensure that the socket directory exists. + 2) Bind a socket for each stage. + + Binding the sockets on initialization allows receipt of stage + "start" notifications prior to the cloud-init stage being ready to + start. + + :param names: stage names, used as a unique identifiers + """ + self.stage = "" + self.remote = "" + self.sockets = { + name: socket.socket( + socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC + ) + for name in names + } + # ensure the directory exists + os.makedirs(f"{DEFAULT_RUN_DIR}/share", mode=0o700, exist_ok=True) + # removing stale sockets and bind + for name, sock in self.sockets.items(): + socket_path = f"{DEFAULT_RUN_DIR}/share/{name}.sock" + with suppress(FileNotFoundError): + os.remove(socket_path) + sock.bind(socket_path) + + def __call__(self, stage: str): + """Set the stage before entering context. + + This enables the context manager to be initialized separately from + each stage synchronization. + + :param stage: the name of a stage to synchronize + + Example: + sync = SocketSync("stage 1", "stage 2"): + with sync("stage 1"): + pass + with sync("stage 2"): + pass + """ + self.stage = stage + return self + + def __enter__(self): + """Wait until a message has been received on this stage's socket. + + Once the message has been received, enter the context. + """ + LOG.debug("sync(%s): initial synchronization starting", self.stage) + # block until init system sends us data + # the first value returned contains a message from the init system + # (should be "start") + # the second value contains the path to a unix socket on which to + # reply, which is expected to be /path/to/{self.stage}-return.sock + sock = self.sockets[self.stage] + chunk, self.remote = sock.recvfrom(5) + + if b"start" != chunk: + # The protocol expects to receive a command "start" + self.__exit__(None, None, None) + raise ValueError(f"Received invalid message: [{str(chunk)}]") + elif f"{DEFAULT_RUN_DIR}/share/{self.stage}-return.sock" != str( + self.remote + ): + # assert that the return path is in a directory with appropriate + # permissions + self.__exit__(None, None, None) + raise ValueError(f"Unexpected path to unix socket: {self.remote}") + + LOG.debug("sync(%s): initial synchronization complete", self.stage) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Notify the socket that this stage is complete.""" + sock = self.sockets[self.stage] + sock.connect(self.remote) + sock.sendall(b"done") + sock.close() diff --git a/systemd/cloud-config.service.tmpl b/systemd/cloud-config.service.tmpl index 79c75c71ae6..cefe2c0613c 100644 --- a/systemd/cloud-config.service.tmpl +++ b/systemd/cloud-config.service.tmpl @@ -10,12 +10,13 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init modules --mode=config +ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/config.sock -s /run/cloud-init/share/config-return.sock RemainAfterExit=yes TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console +StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/systemd/cloud-final.service.tmpl b/systemd/cloud-final.service.tmpl index b66533643d3..26d33192e12 100644 --- a/systemd/cloud-final.service.tmpl +++ b/systemd/cloud-final.service.tmpl @@ -15,10 +15,9 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init modules --mode=final +ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/final.sock -s /run/cloud-init/share/final-return.sock RemainAfterExit=yes TimeoutSec=0 -KillMode=process {% if variant in ["almalinux", "cloudlinux", "rhel"] %} # Restart NetworkManager if it is present and running. ExecStartPost=/bin/sh -c 'u=NetworkManager.service; \ @@ -32,6 +31,7 @@ TasksMax=infinity # Output needs to appear in instance console output StandardOutput=journal+console +StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/systemd/cloud-init-local.service.tmpl b/systemd/cloud-init-local.service.tmpl index 0da2d8337e9..418a753e93f 100644 --- a/systemd/cloud-init-local.service.tmpl +++ b/systemd/cloud-init-local.service.tmpl @@ -7,7 +7,6 @@ DefaultDependencies=no {% endif %} Wants=network-pre.target After=hv_kvp_daemon.service -After=systemd-remount-fs.service {% if variant in ["almalinux", "cloudlinux", "rhel"] %} Requires=dbus.socket After=dbus.socket @@ -38,12 +37,13 @@ ExecStartPre=/bin/mkdir -p /run/cloud-init ExecStartPre=/sbin/restorecon /run/cloud-init ExecStartPre=/usr/bin/touch /run/cloud-init/enabled {% endif %} -ExecStart=/usr/bin/cloud-init init --local +ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/local.sock -s /run/cloud-init/share/local-return.sock RemainAfterExit=yes TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console +StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/systemd/cloud-init-single.service b/systemd/cloud-init-single.service new file mode 100644 index 00000000000..17bbcfc1bb2 --- /dev/null +++ b/systemd/cloud-init-single.service @@ -0,0 +1,27 @@ +[Unit] +Description=Cloud-init: Single Process +DefaultDependencies=no +Wants=network-pre.target +After=systemd-remount-fs.service +Before=NetworkManager.service +Before=network-pre.target +Before=shutdown.target +Before=sysinit.target +Before=cloud-init-local.service +Conflicts=shutdown.target +RequiresMountsFor=/var/lib/cloud +ConditionPathExists=!/etc/cloud/cloud-init.disabled +ConditionKernelCommandLine=!cloud-init=disabled +ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled + +[Service] +Type=notify +ExecStart=/usr/bin/cloud-init --single-process +KillMode=process +TimeoutStartSec=infinity + +# Output needs to appear in instance console output +StandardOutput=journal+console + +[Install] +WantedBy=cloud-init.target diff --git a/systemd/cloud-init.service.tmpl b/systemd/cloud-init.service.tmpl index 58031cc4331..32469fe07b3 100644 --- a/systemd/cloud-init.service.tmpl +++ b/systemd/cloud-init.service.tmpl @@ -46,12 +46,13 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init init +ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/network.sock -s /run/cloud-init/share/network-return.sock RemainAfterExit=yes TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console +StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/tests/unittests/test_cli.py b/tests/unittests/test_cli.py index 6ab6d496b16..a7c3b1ba38b 100644 --- a/tests/unittests/test_cli.py +++ b/tests/unittests/test_cli.py @@ -160,9 +160,7 @@ def test_no_arguments_shows_usage(self, capsys): def test_no_arguments_shows_error_message(self, capsys): exit_code = self._call_main() - missing_subcommand_message = ( - "the following arguments are required: subcommand" - ) + missing_subcommand_message = "a subcommand is required" _out, err = capsys.readouterr() assert ( missing_subcommand_message in err diff --git a/tests/unittests/test_single_process.py b/tests/unittests/test_single_process.py new file mode 100644 index 00000000000..9df2d122f15 --- /dev/null +++ b/tests/unittests/test_single_process.py @@ -0,0 +1,164 @@ +import random +import signal +import socket +import time +from threading import Thread +from unittest import mock + +from cloudinit import socket as ci_socket + + +class Sync: + """A device to send and receive synchronization messages + + Creating an instance of the device sends a b"start" + """ + + def __init__(self, name: str, path: str): + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + self.sock.connect(f"{path}/share/{name}.sock") + self.sock.bind(f"{path}/share/{name}-return.sock") + self.sock.sendall(b"start") + + def receive(self): + """receive 5 bytes from the socket""" + received = self.sock.recv(5) + self.sock.close() + return received + + +class Timeout: + """A utility which may be used to verify that a timeout occurs + + TimeoutError is raised on successful timeout. + + Create a signal handler and use signal.alarm to verify that the + timeout occured. + """ + + def handle_timeout(self, *_): + raise TimeoutError() + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + # 1 second is, unfortunately, the minimum + signal.alarm(1) + + def __exit__(self, *_): + signal.alarm(0) + + +def test_single_process_times_out(tmp_path): + """Verify that no "start" makes the protocol block""" + with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + sync = ci_socket.SocketSync("first") + + try: + with Timeout(): + # this should block for 1 second + with sync("first"): + pass + except TimeoutError: + # success is a timeout + pass + else: + raise AssertionError("Expected the thing to timeout!") + + +def test_single_process(tmp_path): + """Verify that a socket can store "start" messages + + After a socket has been been bound but before it has started listening + """ + expected = b"done" + with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + sync = ci_socket.SocketSync("first", "second", "third") + + # send all three syncs to the sockets + first = Sync("first", tmp_path) + second = Sync("second", tmp_path) + third = Sync("third", tmp_path) + + # "wait" on the first sync event + with sync("first"): + pass + + # check that the first sync returned + assert expected == first.receive() + # "wait" on the second sync event + with sync("second"): + pass + # check that the second sync returned + assert expected == second.receive() + # "wait" on the third sync event + with sync("third"): + pass + # check that the third sync returned + assert expected == third.receive() + + +def test_single_process_threaded(tmp_path): + """Verify that arbitrary "start" order works""" + + # in milliseconds + max_sleep = 100 + # initialize random number generator + random.seed(time.time()) + expected = b"done" + sync_storage = {} + + def syncer(index: int, name: str): + """sleep for 0-100ms then send a sync notification + + this allows sync order to be arbitrary + """ + time.sleep(0.001 * random.randint(0, max_sleep)) + sync_storage[index] = Sync(name, tmp_path) + + with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + + sync = ci_socket.SocketSync( + "first", "second", "third", "fourth", "fifth" + ) + + for i, name in { + 1: "first", + 2: "second", + 3: "third", + 4: "fourth", + 5: "fifth", + }.items(): + t = Thread(target=syncer, args=(i, name)) + t.run() + + # wait on the first sync event + with sync("first"): + pass + + # check that the first sync returned + assert expected == sync_storage[1].receive() + + # wait on the second sync event + with sync("second"): + pass + + # check that the second sync returned + assert expected == sync_storage[2].receive() + + # wait on the third sync event + with sync("third"): + pass + + # check that the third sync returned + assert expected == sync_storage[3].receive() + with sync("fourth"): + pass + + # check that the fourth sync returned + assert expected == sync_storage[4].receive() + + with sync("fifth"): + pass + + # check that the fifth sync returned + assert expected == sync_storage[5].receive() From 79e191fbe2f69ba068baade1f3102fe32d569592 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Mon, 15 Jul 2024 12:16:52 -0600 Subject: [PATCH 02/22] comments --- cloudinit/cmd/main.py | 6 +++--- cloudinit/socket.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 3a78d2f0805..a50c629368f 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -964,7 +964,7 @@ def main(sysv_args=None): dest="single_process", action="store_true", help=( - "Run run the four stages as a single process as an optimization." + "Run cloud-init's stages as a single process as an optimization." "Requires init system integration." ), default=False, @@ -1171,8 +1171,8 @@ def main(sysv_args=None): # attempt to send "start" before a socket exists sync = socket.SocketSync("local", "network", "config", "final") - # notify cloud-init-local.service that this stage has completed - socket.sd_notify(b"READY=1") + # notify systemd that this stage has completed + socket.sd_notify("READY=1") # wait for cloud-init-local.service to start with sync("local"): diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 685f43f28f9..184637e4ef7 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -10,8 +10,11 @@ LOG = logging.getLogger(__name__) -def sd_notify(message: bytes): - """Send a sd_notify message.""" +def sd_notify(message: str): + """Send a sd_notify message. + + :param message: sd-notify message (must be valid ascii) + """ LOG.info("Sending sd_notify(%s)", str(message)) socket_path = os.environ.get("NOTIFY_SOCKET", "") @@ -20,14 +23,14 @@ def sd_notify(message: bytes): socket_path.replace("@", "\0", 1) # unix domain - elif not socket_path[0] == "/": + elif socket_path[0] != "/": raise OSError("Unsupported socket type") with socket.socket( socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC ) as sock: sock.connect(socket_path) - sock.sendall(message) + sock.sendall(message.encode("ascii")) class SocketSync: @@ -77,6 +80,8 @@ def __call__(self, stage: str): with sync("stage 2"): pass """ + if stage not in self.sockets: + raise ValueError(f"Invalid stage name: {stage}") self.stage = stage return self From 7a6289733123ec8f870d139649822bd4ae9356f6 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Mon, 15 Jul 2024 13:44:36 -0600 Subject: [PATCH 03/22] remove redundant services, add TasksMax= from cloud-final --- systemd/cloud-init-single.service | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/systemd/cloud-init-single.service b/systemd/cloud-init-single.service index 17bbcfc1bb2..e0c75f00278 100644 --- a/systemd/cloud-init-single.service +++ b/systemd/cloud-init-single.service @@ -1,11 +1,15 @@ +# systemd ordering resources +# ========================== +# https://systemd.io/NETWORK_ONLINE/ +# https://docs.cloud-init.io/en/latest/explanation/boot.html +# https://www.freedesktop.org/wiki/Software/systemd/NetworkTarget/ +# https://www.freedesktop.org/software/systemd/man/latest/systemd.special.html +# https://www.freedesktop.org/software/systemd/man/latest/systemd-remount-fs.service.html [Unit] Description=Cloud-init: Single Process DefaultDependencies=no Wants=network-pre.target After=systemd-remount-fs.service -Before=NetworkManager.service -Before=network-pre.target -Before=shutdown.target Before=sysinit.target Before=cloud-init-local.service Conflicts=shutdown.target @@ -18,6 +22,7 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled Type=notify ExecStart=/usr/bin/cloud-init --single-process KillMode=process +TasksMax=infinity TimeoutStartSec=infinity # Output needs to appear in instance console output From c127fcac988c417423a6aa6dcd2429c86e59d455 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Mon, 15 Jul 2024 16:46:37 -0600 Subject: [PATCH 04/22] use sd_notify to update systemd in case of fatal exception and of current progress --- cloudinit/cmd/main.py | 61 +++++++++++++++----------- cloudinit/socket.py | 12 ++++- tests/unittests/test_single_process.py | 12 +++-- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index a50c629368f..d01e118db18 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -1167,31 +1167,42 @@ def main(sysv_args=None): return sub_main(args) LOG.info("Running cloud-init in single process mode.") - # this _must_ be called before sd_notify is called otherwise netcat may - # attempt to send "start" before a socket exists - sync = socket.SocketSync("local", "network", "config", "final") - - # notify systemd that this stage has completed - socket.sd_notify("READY=1") - - # wait for cloud-init-local.service to start - with sync("local"): - sub_main(parser.parse_args(args=["init", "--local"])) - - # wait for cloud-init.service to start - with sync("network"): - # init stage - sub_main(parser.parse_args(args=["init"])) - - # wait for cloud-config.service to start - with sync("config"): - # config stage - sub_main(parser.parse_args(args=["modules", "--mode=config"])) - - with sync("final"): - # final stage - sub_main(parser.parse_args(args=["modules", "--mode=final"])) - socket.sd_notify(b"STOPPING=1") + try: + # this _must_ be called before sd_notify is called otherwise netcat may + # attempt to send "start" before a socket exists + sync = socket.SocketSync("local", "network", "config", "final") + + # notify systemd that this stage has completed + socket.sd_notify("READY=1") + + # wait for cloud-init-local.service to start + with sync("local"): + # local stage + sub_main(parser.parse_args(args=["init", "--local"])) + + # wait for cloud-init.service to start + with sync("network"): + # init stage + sub_main(parser.parse_args(args=["init"])) + + # wait for cloud-config.service to start + with sync("config"): + # config stage + sub_main(parser.parse_args(args=["modules", "--mode=config"])) + + with sync("final"): + # final stage + return_code = sub_main( + parser.parse_args(args=["modules", "--mode=final"]) + ) + except Exception as e: + LOG.fatal("Fatal exception: %s", e, exc_info=True) + status = traceback.format_exc().replace("\n", " ") + socket.sd_notify(f"STATUS={status}") + return_code = 1 + socket.sd_notify("STATUS=Completed") + socket.sd_notify("STOPPING=1") + return return_code def sub_main(args): diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 184637e4ef7..0111eaef191 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -3,6 +3,7 @@ import logging import os import socket +import time from contextlib import suppress from cloudinit.settings import DEFAULT_RUN_DIR @@ -90,7 +91,11 @@ def __enter__(self): Once the message has been received, enter the context. """ - LOG.debug("sync(%s): initial synchronization starting", self.stage) + sd_notify( + "STATUS=Waiting on external services to " + f"complete ({self.stage} stage)" + ) + start_time = time.monotonic() # block until init system sends us data # the first value returned contains a message from the init system # (should be "start") @@ -111,7 +116,10 @@ def __enter__(self): self.__exit__(None, None, None) raise ValueError(f"Unexpected path to unix socket: {self.remote}") - LOG.debug("sync(%s): initial synchronization complete", self.stage) + total = time.monotonic() - start_time + time_msg = f"took {total: .3f}s " if total > 0.1 else "" + sd_notify(f"STATUS=Running ({self.stage} stage)") + LOG.debug("sync(%s): synchronization %scomplete", self.stage, time_msg) return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/tests/unittests/test_single_process.py b/tests/unittests/test_single_process.py index 9df2d122f15..ef3d13294cd 100644 --- a/tests/unittests/test_single_process.py +++ b/tests/unittests/test_single_process.py @@ -50,7 +50,9 @@ def __exit__(self, *_): def test_single_process_times_out(tmp_path): """Verify that no "start" makes the protocol block""" - with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"): sync = ci_socket.SocketSync("first") try: @@ -71,7 +73,9 @@ def test_single_process(tmp_path): After a socket has been been bound but before it has started listening """ expected = b"done" - with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"): sync = ci_socket.SocketSync("first", "second", "third") # send all three syncs to the sockets @@ -115,7 +119,9 @@ def syncer(index: int, name: str): time.sleep(0.001 * random.randint(0, max_sleep)) sync_storage[index] = Sync(name, tmp_path) - with mock.patch.object(ci_socket, "DEFAULT_RUN_DIR", tmp_path): + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"): sync = ci_socket.SocketSync( "first", "second", "third", "fourth", "fifth" From 3247c115d158d6cf754e2fa181ac49b5a57427f6 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 19 Jul 2024 17:29:50 -0600 Subject: [PATCH 05/22] Rename cloud-init services to be more intuitive. Make cloud-network.service map to the cloud-init network stage. Make cloud-init.service map to all of cloud-init. BREAKING CHANGE: Changes the semantics of the cloud-init.service files --- systemd/{cloud-init-single.service => cloud-init.service} | 0 systemd/{cloud-init.service.tmpl => cloud-network.service.tmpl} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename systemd/{cloud-init-single.service => cloud-init.service} (100%) rename systemd/{cloud-init.service.tmpl => cloud-network.service.tmpl} (100%) diff --git a/systemd/cloud-init-single.service b/systemd/cloud-init.service similarity index 100% rename from systemd/cloud-init-single.service rename to systemd/cloud-init.service diff --git a/systemd/cloud-init.service.tmpl b/systemd/cloud-network.service.tmpl similarity index 100% rename from systemd/cloud-init.service.tmpl rename to systemd/cloud-network.service.tmpl From d87be7cfcb64d4f6997429e0e3b87d8c66cc40bd Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 19 Jul 2024 17:50:15 -0600 Subject: [PATCH 06/22] add templated order to service from cloud-init-local.service --- ...d-init.service => cloud-init.service.tmpl} | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) rename systemd/{cloud-init.service => cloud-init.service.tmpl} (62%) diff --git a/systemd/cloud-init.service b/systemd/cloud-init.service.tmpl similarity index 62% rename from systemd/cloud-init.service rename to systemd/cloud-init.service.tmpl index e0c75f00278..46be3d59e73 100644 --- a/systemd/cloud-init.service +++ b/systemd/cloud-init.service.tmpl @@ -1,3 +1,4 @@ +## template:jinja # systemd ordering resources # ========================== # https://systemd.io/NETWORK_ONLINE/ @@ -7,8 +8,22 @@ # https://www.freedesktop.org/software/systemd/man/latest/systemd-remount-fs.service.html [Unit] Description=Cloud-init: Single Process -DefaultDependencies=no Wants=network-pre.target +{% if variant in ["almalinux", "cloudlinux", "ubuntu", "unknown", "debian", "rhel"] %} +DefaultDependencies=no +{% endif %} +{% if variant in ["almalinux", "cloudlinux", "rhel"] %} +Requires=dbus.socket +After=dbus.socket +Before=network.service +Before=firewalld.target +Conflicts=shutdown.target +{% endif %} +{% if variant in ["ubuntu", "unknown", "debian"] %} +Before=sysinit.target +Conflicts=shutdown.target +{% endif %} + After=systemd-remount-fs.service Before=sysinit.target Before=cloud-init-local.service @@ -24,6 +39,11 @@ ExecStart=/usr/bin/cloud-init --single-process KillMode=process TasksMax=infinity TimeoutStartSec=infinity +{% if variant in ["almalinux", "cloudlinux", "rhel"] %} +ExecStartPre=/bin/mkdir -p /run/cloud-init +ExecStartPre=/sbin/restorecon /run/cloud-init +ExecStartPre=/usr/bin/touch /run/cloud-init/enabled +{% endif %} # Output needs to appear in instance console output StandardOutput=journal+console From 18aa5b3cd6849f9f974497738d6bfa7db9b59bf1 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Mon, 22 Jul 2024 11:43:40 -0600 Subject: [PATCH 07/22] Improve intra-stage error handling - make it such that if one stage fails, the next stage isn't blocked indefinitely - notify the init system of per-stage exit codes and failure messages - make parent process (cloud-init.service) exit with representative exit code --- cloudinit/cmd/main.py | 75 +++++++++++++------------- cloudinit/socket.py | 28 +++++++++- systemd/cloud-config.service.tmpl | 2 +- systemd/cloud-final.service.tmpl | 3 +- systemd/cloud-init-local.service.tmpl | 3 +- systemd/cloud-network.service.tmpl | 3 +- tests/unittests/test_single_process.py | 27 ++++++++-- 7 files changed, 94 insertions(+), 47 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index d01e118db18..7018e7ec112 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -1167,42 +1167,45 @@ def main(sysv_args=None): return sub_main(args) LOG.info("Running cloud-init in single process mode.") - try: - # this _must_ be called before sd_notify is called otherwise netcat may - # attempt to send "start" before a socket exists - sync = socket.SocketSync("local", "network", "config", "final") - - # notify systemd that this stage has completed - socket.sd_notify("READY=1") - - # wait for cloud-init-local.service to start - with sync("local"): - # local stage - sub_main(parser.parse_args(args=["init", "--local"])) - - # wait for cloud-init.service to start - with sync("network"): - # init stage - sub_main(parser.parse_args(args=["init"])) - - # wait for cloud-config.service to start - with sync("config"): - # config stage - sub_main(parser.parse_args(args=["modules", "--mode=config"])) - - with sync("final"): - # final stage - return_code = sub_main( - parser.parse_args(args=["modules", "--mode=final"]) - ) - except Exception as e: - LOG.fatal("Fatal exception: %s", e, exc_info=True) - status = traceback.format_exc().replace("\n", " ") - socket.sd_notify(f"STATUS={status}") - return_code = 1 - socket.sd_notify("STATUS=Completed") - socket.sd_notify("STOPPING=1") - return return_code + # this _must_ be called before sd_notify is called otherwise netcat may + # attempt to send "start" before a socket exists + sync = socket.SocketSync("local", "network", "config", "final") + + # notify systemd that this stage has completed + socket.sd_notify("READY=1") + # wait for cloud-init-local.service to start + with sync("local"): + # run local stage + sub_main(parser.parse_args(args=["init", "--local"])) + + # wait for cloud-init.service to start + with sync("network"): + # run init stage + sub_main(parser.parse_args(args=["init"])) + + # wait for cloud-config.service to start + with sync("config"): + # run config stage + sub_main(parser.parse_args(args=["modules", "--mode=config"])) + + # wait for cloud-final.service to start + with sync("final"): + # run final stage + sub_main(parser.parse_args(args=["modules", "--mode=final"])) + + # signal completion to cloud-init.service + if sync.first_exception: + socket.sd_notify( + "STATUS=Completed with failure, first exception received: " + f"{sync.first_exception}. Run 'cloud-init status --long' for more " + "details." + ) + socket.sd_notify("STOPPING=1") + # exit 1 for a fatal failure in any stage + return 1 + else: + socket.sd_notify("STATUS=Completed") + socket.sd_notify("STOPPING=1") def sub_main(args): diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 0111eaef191..a4b8a3a3800 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -51,6 +51,7 @@ def __init__(self, *names: str): """ self.stage = "" self.remote = "" + self.first_exception = "" self.sockets = { name: socket.socket( socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC @@ -124,7 +125,32 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Notify the socket that this stage is complete.""" + message = "done" + systemd_exit_code = "0" + if exc_type: + # handle exception thrown in context + systemd_exit_code = "1" + status = f"{repr(exc_val)} in {exc_tb.tb_frame}" + message = ( + 'fatal error, run "systemctl cloud-init.service" for more ' + "details" + ) + if not self.first_exception: + self.first_exception = message + LOG.fatal(status) + sd_notify(f"STATUS={status}") + sock = self.sockets[self.stage] sock.connect(self.remote) - sock.sendall(b"done") + + # the returned message will be executed in a subshell + # hardcode this message rather than sending a more informative message + # to avoid having to sanitize inputs (to prevent escaping the shell) + sock.sendall(f"echo '{message}'; exit {systemd_exit_code};".encode()) sock.close() + + # suppress exception - the exception was logged and the init system + # notified of stage completion (and the exception received as a status + # message). Raising an exception would block the rest of boot, so carry + # on in a degraded state. + return True diff --git a/systemd/cloud-config.service.tmpl b/systemd/cloud-config.service.tmpl index cefe2c0613c..20c7d375714 100644 --- a/systemd/cloud-config.service.tmpl +++ b/systemd/cloud-config.service.tmpl @@ -10,7 +10,7 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/config.sock -s /run/cloud-init/share/config-return.sock +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/config.sock -s /run/cloud-init/share/config-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-final.service.tmpl b/systemd/cloud-final.service.tmpl index 26d33192e12..98b51bf01c2 100644 --- a/systemd/cloud-final.service.tmpl +++ b/systemd/cloud-final.service.tmpl @@ -15,7 +15,7 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/final.sock -s /run/cloud-init/share/final-return.sock +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/final.sock -s /run/cloud-init/share/final-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 {% if variant in ["almalinux", "cloudlinux", "rhel"] %} @@ -31,7 +31,6 @@ TasksMax=infinity # Output needs to appear in instance console output StandardOutput=journal+console -StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/systemd/cloud-init-local.service.tmpl b/systemd/cloud-init-local.service.tmpl index 418a753e93f..c8c19b76e50 100644 --- a/systemd/cloud-init-local.service.tmpl +++ b/systemd/cloud-init-local.service.tmpl @@ -37,13 +37,12 @@ ExecStartPre=/bin/mkdir -p /run/cloud-init ExecStartPre=/sbin/restorecon /run/cloud-init ExecStartPre=/usr/bin/touch /run/cloud-init/enabled {% endif %} -ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/local.sock -s /run/cloud-init/share/local-return.sock +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/local.sock -s /run/cloud-init/share/local-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console -StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/systemd/cloud-network.service.tmpl b/systemd/cloud-network.service.tmpl index 32469fe07b3..e6092341862 100644 --- a/systemd/cloud-network.service.tmpl +++ b/systemd/cloud-network.service.tmpl @@ -46,13 +46,12 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=nc.openbsd -Uu -W1 /run/cloud-init/share/network.sock -s /run/cloud-init/share/network-return.sock +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/network.sock -s /run/cloud-init/share/network-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console -StandardInputText=start [Install] WantedBy=cloud-init.target diff --git a/tests/unittests/test_single_process.py b/tests/unittests/test_single_process.py index ef3d13294cd..083361f11e6 100644 --- a/tests/unittests/test_single_process.py +++ b/tests/unittests/test_single_process.py @@ -22,7 +22,7 @@ def __init__(self, name: str, path: str): def receive(self): """receive 5 bytes from the socket""" - received = self.sock.recv(5) + received = self.sock.recv(4096) self.sock.close() return received @@ -72,7 +72,7 @@ def test_single_process(tmp_path): After a socket has been been bound but before it has started listening """ - expected = b"done" + expected = b"echo 'done'; exit 0;" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path ), mock.patch.object(ci_socket, "sd_notify"): @@ -108,7 +108,7 @@ def test_single_process_threaded(tmp_path): max_sleep = 100 # initialize random number generator random.seed(time.time()) - expected = b"done" + expected = b"echo 'done'; exit 0;" sync_storage = {} def syncer(index: int, name: str): @@ -168,3 +168,24 @@ def syncer(index: int, name: str): # check that the fifth sync returned assert expected == sync_storage[5].receive() + + +def test_single_process_exception(tmp_path): + """Verify that exceptions log messages produce a valid warning message""" + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"): + sync = ci_socket.SocketSync("first", "second", "third") + + # send all three syncs to the sockets + first = Sync("first", tmp_path) + + # "wait" on the first sync event + with sync("first"): + # verify that an exception in context doesn't raise + 1 / 0 # type: ignore + + assert ( + b'echo \'fatal error, run "systemctl cloud-init.service" for ' + b"more details'; exit 1;" == first.receive() + ) From 5c056907f4b2b317926f13430b88b910b43a8ccc Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 16 Jul 2024 10:20:58 -0600 Subject: [PATCH 08/22] Do not set up logger multiple times Add a new attribute flag to the argparser Namespace attribute which is used to disable logging. This isn't elegant, but fixing logging is going to be a large refactor so this gets logging "working" for now while minimizing number of LOC changed --- cloudinit/cmd/main.py | 43 +++++++++++++++++++++++--------- tests/unittests/cmd/test_main.py | 6 ++++- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 7018e7ec112..90179214d9b 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -368,8 +368,11 @@ def main_init(name, args): outfmt = None errfmt = None try: - close_stdin(lambda msg: early_logs.append((logging.DEBUG, msg))) - outfmt, errfmt = util.fixup_output(init.cfg, name) + if not args.skip_log_setup: + close_stdin(lambda msg: early_logs.append((logging.DEBUG, msg))) + outfmt, errfmt = util.fixup_output(init.cfg, name) + else: + outfmt, errfmt = util.get_output_cfg(init.cfg, name) except Exception: msg = "Failed to setup output redirection!" util.logexc(LOG, msg) @@ -381,8 +384,9 @@ def main_init(name, args): "Logging being reset, this logger may no longer be active shortly" ) log.reset_logging() - log.setup_logging(init.cfg) - apply_reporting_cfg(init.cfg) + if not args.skip_log_setup: + log.setup_logging(init.cfg) + apply_reporting_cfg(init.cfg) # Any log usage prior to setup_logging above did not have local user log # config applied. We send the welcome message now, as stderr/out have @@ -639,8 +643,9 @@ def main_modules(action_name, args): mods = Modules(init, extract_fns(args), reporter=args.reporter) # Stage 4 try: - close_stdin() - util.fixup_output(mods.cfg, name) + if not args.skip_log_setup: + close_stdin() + util.fixup_output(mods.cfg, name) except Exception: util.logexc(LOG, "Failed to setup output redirection!") if args.debug: @@ -649,8 +654,9 @@ def main_modules(action_name, args): "Logging being reset, this logger may no longer be active shortly" ) log.reset_logging() - log.setup_logging(mods.cfg) - apply_reporting_cfg(init.cfg) + if not args.skip_log_setup: + log.setup_logging(mods.cfg) + apply_reporting_cfg(init.cfg) # now that logging is setup and stdout redirected, send welcome welcome(name, msg=w_msg) @@ -1163,6 +1169,7 @@ def main(sysv_args=None): parser.error("a subcommand is required") args = parser.parse_args(args=sysv_args) + setattr(args, "skip_log_setup", False) if not args.single_process: return sub_main(args) LOG.info("Running cloud-init in single process mode.") @@ -1175,23 +1182,35 @@ def main(sysv_args=None): socket.sd_notify("READY=1") # wait for cloud-init-local.service to start with sync("local"): + # set up logger + args = parser.parse_args(args=["init", "--local"]) + args.skip_log_setup = False # run local stage - sub_main(parser.parse_args(args=["init", "--local"])) + sub_main(args) # wait for cloud-init.service to start with sync("network"): + # skip re-setting up logger + args = parser.parse_args(args=["init"]) + args.skip_log_setup = True # run init stage - sub_main(parser.parse_args(args=["init"])) + sub_main(args) # wait for cloud-config.service to start with sync("config"): + # skip re-setting up logger + args = parser.parse_args(args=["modules", "--mode=config"]) + args.skip_log_setup = True # run config stage - sub_main(parser.parse_args(args=["modules", "--mode=config"])) + sub_main(args) # wait for cloud-final.service to start with sync("final"): + # skip re-setting up logger + args = parser.parse_args(args=["modules", "--mode=final"]) + args.skip_log_setup = True # run final stage - sub_main(parser.parse_args(args=["modules", "--mode=final"])) + sub_main(args) # signal completion to cloud-init.service if sync.first_exception: diff --git a/tests/unittests/cmd/test_main.py b/tests/unittests/cmd/test_main.py index f9b3faab130..bad728f2a72 100644 --- a/tests/unittests/cmd/test_main.py +++ b/tests/unittests/cmd/test_main.py @@ -13,7 +13,9 @@ from cloudinit.util import ensure_dir, load_text_file, write_file from tests.unittests.helpers import FilesystemMockingTestCase, wrap_and_call -MyArgs = namedtuple("MyArgs", "debug files force local reporter subcommand") +MyArgs = namedtuple( + "MyArgs", "debug files force local reporter subcommand skip_log_setup" +) class TestMain(FilesystemMockingTestCase): @@ -76,6 +78,7 @@ def test_main_init_run_net_runs_modules(self): local=False, reporter=None, subcommand="init", + skip_log_setup=False, ) (_item1, item2) = wrap_and_call( "cloudinit.cmd.main", @@ -122,6 +125,7 @@ def test_main_init_run_net_calls_set_hostname_when_metadata_present(self): local=False, reporter=None, subcommand="init", + skip_log_setup=False, ) def set_hostname(name, cfg, cloud, args): From 14ca37f807fc5369e274e4472ff9e2feb5fc7f8e Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 23 Jul 2024 13:08:52 -0600 Subject: [PATCH 09/22] fix commandline (for debugger use) skips sync protocol when stdin is a tty --- cloudinit/socket.py | 14 ++++++++++--- tests/unittests/test_single_process.py | 27 +++++++++++++++++++++----- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cloudinit/socket.py b/cloudinit/socket.py index a4b8a3a3800..26e2734fd8b 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -3,6 +3,7 @@ import logging import os import socket +import sys import time from contextlib import suppress @@ -16,11 +17,14 @@ def sd_notify(message: str): :param message: sd-notify message (must be valid ascii) """ - LOG.info("Sending sd_notify(%s)", str(message)) socket_path = os.environ.get("NOTIFY_SOCKET", "") - # abstract - if socket_path[0] == "@": + if not socket_path: + # not running under systemd, no-op + return + + elif socket_path[0] == "@": + # abstract socket_path.replace("@", "\0", 1) # unix domain @@ -30,6 +34,7 @@ def sd_notify(message: str): with socket.socket( socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC ) as sock: + LOG.info("Sending sd_notify(%s)", str(message)) sock.connect(socket_path) sock.sendall(message.encode("ascii")) @@ -92,6 +97,9 @@ def __enter__(self): Once the message has been received, enter the context. """ + if os.isatty(sys.stdin.fileno()): + LOG.info("Not syncing, stdin is a tty.") + return sd_notify( "STATUS=Waiting on external services to " f"complete ({self.stage} stage)" diff --git a/tests/unittests/test_single_process.py b/tests/unittests/test_single_process.py index 083361f11e6..361e3372bc9 100644 --- a/tests/unittests/test_single_process.py +++ b/tests/unittests/test_single_process.py @@ -5,6 +5,7 @@ from threading import Thread from unittest import mock + from cloudinit import socket as ci_socket @@ -52,7 +53,11 @@ def test_single_process_times_out(tmp_path): """Verify that no "start" makes the protocol block""" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path - ), mock.patch.object(ci_socket, "sd_notify"): + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): sync = ci_socket.SocketSync("first") try: @@ -75,7 +80,11 @@ def test_single_process(tmp_path): expected = b"echo 'done'; exit 0;" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path - ), mock.patch.object(ci_socket, "sd_notify"): + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): sync = ci_socket.SocketSync("first", "second", "third") # send all three syncs to the sockets @@ -121,7 +130,11 @@ def syncer(index: int, name: str): with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path - ), mock.patch.object(ci_socket, "sd_notify"): + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): sync = ci_socket.SocketSync( "first", "second", "third", "fourth", "fifth" @@ -174,7 +187,11 @@ def test_single_process_exception(tmp_path): """Verify that exceptions log messages produce a valid warning message""" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path - ), mock.patch.object(ci_socket, "sd_notify"): + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): sync = ci_socket.SocketSync("first", "second", "third") # send all three syncs to the sockets @@ -183,7 +200,7 @@ def test_single_process_exception(tmp_path): # "wait" on the first sync event with sync("first"): # verify that an exception in context doesn't raise - 1 / 0 # type: ignore + 1 / 0 # pylint: disable=W0104 assert ( b'echo \'fatal error, run "systemctl cloud-init.service" for ' From c2079ea358e4fbb58e6acfc9a0eb71626eeea141 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 23 Jul 2024 13:57:11 -0600 Subject: [PATCH 10/22] rename command to --all-stages --- cloudinit/cmd/main.py | 10 +++++----- systemd/cloud-init.service.tmpl | 2 +- .../{test_single_process.py => test_all_stages.py} | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) rename tests/unittests/{test_single_process.py => test_all_stages.py} (97%) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 90179214d9b..ebe42e44f66 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -966,12 +966,12 @@ def main(sysv_args=None): ) parser.add_argument( - "--single-process", - dest="single_process", + "--all-stages", + dest="all_stages", action="store_true", help=( - "Run cloud-init's stages as a single process as an optimization." - "Requires init system integration." + "Run cloud-init's stages under a single process using a " + "syncronization protocol. This is not intended for CLI usage." ), default=False, ) @@ -1170,7 +1170,7 @@ def main(sysv_args=None): args = parser.parse_args(args=sysv_args) setattr(args, "skip_log_setup", False) - if not args.single_process: + if not args.all_stages: return sub_main(args) LOG.info("Running cloud-init in single process mode.") diff --git a/systemd/cloud-init.service.tmpl b/systemd/cloud-init.service.tmpl index 46be3d59e73..1ddfd62073e 100644 --- a/systemd/cloud-init.service.tmpl +++ b/systemd/cloud-init.service.tmpl @@ -35,7 +35,7 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=notify -ExecStart=/usr/bin/cloud-init --single-process +ExecStart=/usr/bin/cloud-init --all-stages KillMode=process TasksMax=infinity TimeoutStartSec=infinity diff --git a/tests/unittests/test_single_process.py b/tests/unittests/test_all_stages.py similarity index 97% rename from tests/unittests/test_single_process.py rename to tests/unittests/test_all_stages.py index 361e3372bc9..4dacca0b684 100644 --- a/tests/unittests/test_single_process.py +++ b/tests/unittests/test_all_stages.py @@ -5,7 +5,6 @@ from threading import Thread from unittest import mock - from cloudinit import socket as ci_socket @@ -49,7 +48,7 @@ def __exit__(self, *_): signal.alarm(0) -def test_single_process_times_out(tmp_path): +def test_all_stages_times_out(tmp_path): """Verify that no "start" makes the protocol block""" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path @@ -72,7 +71,7 @@ def test_single_process_times_out(tmp_path): raise AssertionError("Expected the thing to timeout!") -def test_single_process(tmp_path): +def test_all_stages(tmp_path): """Verify that a socket can store "start" messages After a socket has been been bound but before it has started listening @@ -110,7 +109,7 @@ def test_single_process(tmp_path): assert expected == third.receive() -def test_single_process_threaded(tmp_path): +def test_all_stages_threaded(tmp_path): """Verify that arbitrary "start" order works""" # in milliseconds @@ -183,7 +182,7 @@ def syncer(index: int, name: str): assert expected == sync_storage[5].receive() -def test_single_process_exception(tmp_path): +def test_all_stages_exception(tmp_path): """Verify that exceptions log messages produce a valid warning message""" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path From f0944d0f83289fc86ce94e1c5ab50a046dd133c0 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 23 Jul 2024 15:20:08 -0600 Subject: [PATCH 11/22] drop unused stdin --- systemd/cloud-config.service.tmpl | 1 - 1 file changed, 1 deletion(-) diff --git a/systemd/cloud-config.service.tmpl b/systemd/cloud-config.service.tmpl index 20c7d375714..9d23ee7cfba 100644 --- a/systemd/cloud-config.service.tmpl +++ b/systemd/cloud-config.service.tmpl @@ -16,7 +16,6 @@ TimeoutSec=0 # Output needs to appear in instance console output StandardOutput=journal+console -StandardInputText=start [Install] WantedBy=cloud-init.target From 417d5500225b37244e6af59a178d931834b3fea8 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Wed, 24 Jul 2024 16:02:06 -0600 Subject: [PATCH 12/22] comments --- cloudinit/cmd/main.py | 9 +++++++-- cloudinit/cmd/status.py | 3 ++- cloudinit/config/schemas/schema-cloud-config-v1.json | 4 ++-- cloudinit/socket.py | 12 +++++++----- systemd/cloud-config.service.tmpl | 7 +++++++ systemd/cloud-final.service.tmpl | 7 +++++++ systemd/cloud-init-local.service.tmpl | 7 +++++++ ...nit.service.tmpl => cloud-init-main.service.tmpl} | 0 ....service.tmpl => cloud-init-network.service.tmpl} | 7 +++++++ tests/integration_tests/assets/enable_coverage.py | 2 +- tests/integration_tests/assets/enable_profile.py | 2 +- tests/integration_tests/conftest.py | 2 +- tests/integration_tests/datasources/test_nocloud.py | 4 ++-- tests/unittests/test_all_stages.py | 3 ++- 14 files changed, 53 insertions(+), 16 deletions(-) rename systemd/{cloud-init.service.tmpl => cloud-init-main.service.tmpl} (100%) rename systemd/{cloud-network.service.tmpl => cloud-init-network.service.tmpl} (77%) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index ebe42e44f66..9a7b8847f66 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -1172,6 +1172,11 @@ def main(sysv_args=None): setattr(args, "skip_log_setup", False) if not args.all_stages: return sub_main(args) + return all_stages(parser) + + +def all_stages(parser): + """Run all stages in a single process using an ordering protocol.""" LOG.info("Running cloud-init in single process mode.") # this _must_ be called before sd_notify is called otherwise netcat may @@ -1188,7 +1193,7 @@ def main(sysv_args=None): # run local stage sub_main(args) - # wait for cloud-init.service to start + # wait for cloud-init-network.service to start with sync("network"): # skip re-setting up logger args = parser.parse_args(args=["init"]) @@ -1212,7 +1217,7 @@ def main(sysv_args=None): # run final stage sub_main(args) - # signal completion to cloud-init.service + # signal completion to cloud-init-main.service if sync.first_exception: socket.sd_notify( "STATUS=Completed with failure, first exception received: " diff --git a/cloudinit/cmd/status.py b/cloudinit/cmd/status.py index 39089802984..f027321ce22 100644 --- a/cloudinit/cmd/status.py +++ b/cloudinit/cmd/status.py @@ -318,8 +318,9 @@ def systemd_failed(wait: bool) -> bool: for service in [ "cloud-final.service", "cloud-config.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-init-local.service", + "cloud-init-main.service", ]: try: stdout = query_systemctl( diff --git a/cloudinit/config/schemas/schema-cloud-config-v1.json b/cloudinit/config/schemas/schema-cloud-config-v1.json index c05bd994212..4ae8b4a8f70 100644 --- a/cloudinit/config/schemas/schema-cloud-config-v1.json +++ b/cloudinit/config/schemas/schema-cloud-config-v1.json @@ -2022,12 +2022,12 @@ }, "mount_default_fields": { "type": "array", - "description": "Default mount configuration for any mount entry with less than 6 options provided. When specified, 6 items are required and represent ``/etc/fstab`` entries. Default: ``defaults,nofail,x-systemd.after=cloud-init.service,_netdev``", + "description": "Default mount configuration for any mount entry with less than 6 options provided. When specified, 6 items are required and represent ``/etc/fstab`` entries. Default: ``defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev``", "default": [ null, null, "auto", - "defaults,nofail,x-systemd.after=cloud-init.service", + "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2" ], diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 26e2734fd8b..38dd5723484 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -98,11 +98,13 @@ def __enter__(self): Once the message has been received, enter the context. """ if os.isatty(sys.stdin.fileno()): - LOG.info("Not syncing, stdin is a tty.") + LOG.info( + "Stdin is a tty, so skipping stage synchronization protocol" + ) return sd_notify( "STATUS=Waiting on external services to " - f"complete ({self.stage} stage)" + f"complete before starting the {self.stage} stage." ) start_time = time.monotonic() # block until init system sends us data @@ -126,7 +128,7 @@ def __enter__(self): raise ValueError(f"Unexpected path to unix socket: {self.remote}") total = time.monotonic() - start_time - time_msg = f"took {total: .3f}s " if total > 0.1 else "" + time_msg = f"took {total: .3f}s to " if total > 0.01 else "" sd_notify(f"STATUS=Running ({self.stage} stage)") LOG.debug("sync(%s): synchronization %scomplete", self.stage, time_msg) return self @@ -140,8 +142,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): systemd_exit_code = "1" status = f"{repr(exc_val)} in {exc_tb.tb_frame}" message = ( - 'fatal error, run "systemctl cloud-init.service" for more ' - "details" + 'fatal error, run "systemctl status cloud-init-main.service" ' + 'and "cloud-init status --long" for more details' ) if not self.first_exception: self.first_exception = message diff --git a/systemd/cloud-config.service.tmpl b/systemd/cloud-config.service.tmpl index 9d23ee7cfba..9067d6e4bc0 100644 --- a/systemd/cloud-config.service.tmpl +++ b/systemd/cloud-config.service.tmpl @@ -10,6 +10,13 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/config.sock -s /run/cloud-init/share/config-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-final.service.tmpl b/systemd/cloud-final.service.tmpl index 98b51bf01c2..9fb2f681f73 100644 --- a/systemd/cloud-final.service.tmpl +++ b/systemd/cloud-final.service.tmpl @@ -15,6 +15,13 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/final.sock -s /run/cloud-init/share/final-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-init-local.service.tmpl b/systemd/cloud-init-local.service.tmpl index c8c19b76e50..b0a534b8f9a 100644 --- a/systemd/cloud-init-local.service.tmpl +++ b/systemd/cloud-init-local.service.tmpl @@ -37,6 +37,13 @@ ExecStartPre=/bin/mkdir -p /run/cloud-init ExecStartPre=/sbin/restorecon /run/cloud-init ExecStartPre=/usr/bin/touch /run/cloud-init/enabled {% endif %} +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/local.sock -s /run/cloud-init/share/local-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-init.service.tmpl b/systemd/cloud-init-main.service.tmpl similarity index 100% rename from systemd/cloud-init.service.tmpl rename to systemd/cloud-init-main.service.tmpl diff --git a/systemd/cloud-network.service.tmpl b/systemd/cloud-init-network.service.tmpl similarity index 77% rename from systemd/cloud-network.service.tmpl rename to systemd/cloud-init-network.service.tmpl index e6092341862..6957b39f1ee 100644 --- a/systemd/cloud-network.service.tmpl +++ b/systemd/cloud-init-network.service.tmpl @@ -46,6 +46,13 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/network.sock -s /run/cloud-init/share/network-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/tests/integration_tests/assets/enable_coverage.py b/tests/integration_tests/assets/enable_coverage.py index ed71ceef8f5..1d18fcbef04 100644 --- a/tests/integration_tests/assets/enable_coverage.py +++ b/tests/integration_tests/assets/enable_coverage.py @@ -2,7 +2,7 @@ services = [ "cloud-init-local.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-config.service", "cloud-final.service", ] diff --git a/tests/integration_tests/assets/enable_profile.py b/tests/integration_tests/assets/enable_profile.py index a6a0070c3c5..9b68e42ce05 100644 --- a/tests/integration_tests/assets/enable_profile.py +++ b/tests/integration_tests/assets/enable_profile.py @@ -2,7 +2,7 @@ services = [ "cloud-init-local.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-config.service", "cloud-final.service", ] diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index c3b8531ae92..8ba5a81b2b5 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -191,7 +191,7 @@ def _collect_profile(instance: IntegrationInstance, log_dir: Path): log_dir / "profile" / "local.stats", ) instance.pull_file( - "/var/log/cloud-init.service.stats", + "/var/log/cloud-init-network.service.stats", log_dir / "profile" / "network.stats", ) instance.pull_file( diff --git a/tests/integration_tests/datasources/test_nocloud.py b/tests/integration_tests/datasources/test_nocloud.py index 24aecc0bd8d..c3462c433a3 100644 --- a/tests/integration_tests/datasources/test_nocloud.py +++ b/tests/integration_tests/datasources/test_nocloud.py @@ -162,7 +162,7 @@ def test_smbios_seed_network(self, client: IntegrationInstance): """\ [Unit] Description=Serve a local webserver - Before=cloud-init.service + Before=cloud-init-network.service Wants=cloud-init-local.service DefaultDependencies=no After=systemd-networkd-wait-online.service @@ -354,7 +354,7 @@ def _boot_with_cmdline( # and NoCloud operates in network timeframe After=systemd-networkd-wait-online.service After=networking.service - Before=cloud-init.service + Before=cloud-init-network.service [Service] Type=exec diff --git a/tests/unittests/test_all_stages.py b/tests/unittests/test_all_stages.py index 4dacca0b684..1ea47200295 100644 --- a/tests/unittests/test_all_stages.py +++ b/tests/unittests/test_all_stages.py @@ -202,6 +202,7 @@ def test_all_stages_exception(tmp_path): 1 / 0 # pylint: disable=W0104 assert ( - b'echo \'fatal error, run "systemctl cloud-init.service" for ' + b"echo 'fatal error, run \"systemctl status cloud-init-main." + b'service" and "cloud-init status --long" for ' b"more details'; exit 1;" == first.receive() ) From cef0f5eedd991917071951ec181338626ed353b0 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Wed, 24 Jul 2024 17:12:52 -0600 Subject: [PATCH 13/22] fix new tests --- tests/unittests/config/test_cc_mounts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unittests/config/test_cc_mounts.py b/tests/unittests/config/test_cc_mounts.py index 9982b6741c6..f3cd0472970 100644 --- a/tests/unittests/config/test_cc_mounts.py +++ b/tests/unittests/config/test_cc_mounts.py @@ -565,9 +565,9 @@ def test_fstab_mounts_combinations(self): LABEL=keepme none ext4 defaults 0 0 LABEL=UEFI /dev/sda4 /mnt2 auto nofail,comment=cloudconfig 1 2 - /dev/sda5 /mnt3 auto defaults,nofail,x-systemd.after=cloud-init.service,_netdev,comment=cloudconfig 0 2 - /dev/sda1 /mnt xfs auto,comment=cloudconfig 0 2 - /dev/sda3 /mnt4 btrfs defaults,nofail,x-systemd.after=cloud-init.service,_netdev,comment=cloudconfig 0 2 + /dev/sda5 /mnt3 auto defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 + /dev/sda1 /mnt xfs defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 + /dev/sda3 /mnt4 btrfs defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 /dev/sdb1 none swap sw,comment=cloudconfig 0 0 """ # noqa: E501 ).strip() From 7d13021926c9e8e0307f0240f82553fb59df2d8e Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Mon, 29 Jul 2024 15:39:43 -0600 Subject: [PATCH 14/22] drop reference to cloud-init.service --- config/cloud.cfg.tmpl | 2 +- doc/module-docs/cc_mounts/data.yaml | 4 ++-- doc/rtd/explanation/boot.rst | 2 +- doc/rtd/howto/debugging.rst | 2 +- packages/redhat/cloud-init.spec.in | 4 ++-- systemd/cloud-config.target | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/cloud.cfg.tmpl b/config/cloud.cfg.tmpl index 4b1efdbcbf1..bc3e6067ec4 100644 --- a/config/cloud.cfg.tmpl +++ b/config/cloud.cfg.tmpl @@ -62,7 +62,7 @@ disable_root: true "openmandriva", "photon", "TencentOS"] or is_rhel %} {% if is_rhel %} -mount_default_fields: [~, ~, 'auto', 'defaults,nofail,x-systemd.after=cloud-init.service,_netdev', '0', '2'] +mount_default_fields: [~, ~, 'auto', 'defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev', '0', '2'] {% else %} mount_default_fields: [~, ~, 'auto', 'defaults,nofail', '0', '2'] {% endif %} diff --git a/doc/module-docs/cc_mounts/data.yaml b/doc/module-docs/cc_mounts/data.yaml index 751b301d501..18193f062d3 100644 --- a/doc/module-docs/cc_mounts/data.yaml +++ b/doc/module-docs/cc_mounts/data.yaml @@ -18,7 +18,7 @@ cc_mounts: .. code-block:: yaml mounts: - - ["ephemeral0", "/mnt", "auto", "defaults,nofail,x-systemd.after=cloud-init.service", "0", "2"] + - ["ephemeral0", "/mnt", "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] - ["swap", "none", "swap", "sw", "0", "0"] In order to remove a previously-listed mount, an entry can be added to the @@ -32,7 +32,7 @@ cc_mounts: .. code-block:: yaml - mount_default_fields: [none, none, "auto", "defaults,nofail,x-systemd.after=cloud-init.service", "0", "2"] + mount_default_fields: [none, none, "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] Non-systemd init systems will vary in ``mount_default_fields``. diff --git a/doc/rtd/explanation/boot.rst b/doc/rtd/explanation/boot.rst index a975ca7a093..ff3b65ebd28 100644 --- a/doc/rtd/explanation/boot.rst +++ b/doc/rtd/explanation/boot.rst @@ -108,7 +108,7 @@ Network ======= +------------------+----------------------------------------------------------+ -| systemd service | ``cloud-init.service`` | +| systemd service | ``cloud-init-network.service`` | +---------+--------+----------------------------------------------------------+ | runs | after local stage and configured networking is up | +---------+--------+----------------------------------------------------------+ diff --git a/doc/rtd/howto/debugging.rst b/doc/rtd/howto/debugging.rst index c8b2a2634bc..546e8dd9f45 100644 --- a/doc/rtd/howto/debugging.rst +++ b/doc/rtd/howto/debugging.rst @@ -55,7 +55,7 @@ Cloud-init did not run .. code-block:: - systemctl status cloud-init-local.service cloud-init.service\ + systemctl status cloud-init-local.service cloud-init-network.service\ cloud-config.service cloud-final.service Cloud-init may have started to run, but not completed. This shows how many, diff --git a/packages/redhat/cloud-init.spec.in b/packages/redhat/cloud-init.spec.in index bc57fe9aac9..672cd426673 100644 --- a/packages/redhat/cloud-init.spec.in +++ b/packages/redhat/cloud-init.spec.in @@ -124,7 +124,7 @@ if [ $1 -eq 1 ] then /bin/systemctl enable cloud-config.service >/dev/null 2>&1 || : /bin/systemctl enable cloud-final.service >/dev/null 2>&1 || : - /bin/systemctl enable cloud-init.service >/dev/null 2>&1 || : + /bin/systemctl enable cloud-init-network.service >/dev/null 2>&1 || : /bin/systemctl enable cloud-init-local.service >/dev/null 2>&1 || : fi %else @@ -141,7 +141,7 @@ if [ $1 -eq 0 ] then /bin/systemctl --no-reload disable cloud-config.service >/dev/null 2>&1 || : /bin/systemctl --no-reload disable cloud-final.service >/dev/null 2>&1 || : - /bin/systemctl --no-reload disable cloud-init.service >/dev/null 2>&1 || : + /bin/systemctl --no-reload disable cloud-init-network.service >/dev/null 2>&1 || : /bin/systemctl --no-reload disable cloud-init-local.service >/dev/null 2>&1 || : fi %else diff --git a/systemd/cloud-config.target b/systemd/cloud-config.target index 2d65e3433ce..be754bbd19d 100644 --- a/systemd/cloud-config.target +++ b/systemd/cloud-config.target @@ -14,5 +14,5 @@ [Unit] Description=Cloud-config availability -Wants=cloud-init-local.service cloud-init.service -After=cloud-init-local.service cloud-init.service +Wants=cloud-init-local.service cloud-init-network.service +After=cloud-init-local.service cloud-init-network.service From 08487344a7124a6712a6e234ddddd058affb64fd Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Wed, 31 Jul 2024 11:41:09 -0600 Subject: [PATCH 15/22] plumb stage exit codes through to systemd --- cloudinit/cmd/main.py | 8 ++++---- cloudinit/socket.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 9a7b8847f66..332d2b60f15 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -1191,7 +1191,7 @@ def all_stages(parser): args = parser.parse_args(args=["init", "--local"]) args.skip_log_setup = False # run local stage - sub_main(args) + sync.systemd_exit_code = sub_main(args) # wait for cloud-init-network.service to start with sync("network"): @@ -1199,7 +1199,7 @@ def all_stages(parser): args = parser.parse_args(args=["init"]) args.skip_log_setup = True # run init stage - sub_main(args) + sync.systemd_exit_code = sub_main(args) # wait for cloud-config.service to start with sync("config"): @@ -1207,7 +1207,7 @@ def all_stages(parser): args = parser.parse_args(args=["modules", "--mode=config"]) args.skip_log_setup = True # run config stage - sub_main(args) + sync.systemd_exit_code = sub_main(args) # wait for cloud-final.service to start with sync("final"): @@ -1215,7 +1215,7 @@ def all_stages(parser): args = parser.parse_args(args=["modules", "--mode=final"]) args.skip_log_setup = True # run final stage - sub_main(args) + sync.systemd_exit_code = sub_main(args) # signal completion to cloud-init-main.service if sync.first_exception: diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 38dd5723484..08db4995e59 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -57,6 +57,7 @@ def __init__(self, *names: str): self.stage = "" self.remote = "" self.first_exception = "" + self.systemd_exit_code = 0 self.sockets = { name: socket.socket( socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC @@ -102,6 +103,7 @@ def __enter__(self): "Stdin is a tty, so skipping stage synchronization protocol" ) return + self.systemd_exit_code = 0 sd_notify( "STATUS=Waiting on external services to " f"complete before starting the {self.stage} stage." @@ -136,10 +138,9 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Notify the socket that this stage is complete.""" message = "done" - systemd_exit_code = "0" if exc_type: # handle exception thrown in context - systemd_exit_code = "1" + self.systemd_exit_code = 1 status = f"{repr(exc_val)} in {exc_tb.tb_frame}" message = ( 'fatal error, run "systemctl status cloud-init-main.service" ' @@ -156,7 +157,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): # the returned message will be executed in a subshell # hardcode this message rather than sending a more informative message # to avoid having to sanitize inputs (to prevent escaping the shell) - sock.sendall(f"echo '{message}'; exit {systemd_exit_code};".encode()) + sock.sendall(f"echo '{message}'; exit {self.systemd_exit_code};".encode()) sock.close() # suppress exception - the exception was logged and the init system From 212f8414739274b3bc7c9e06cc34f1cf3543ae33 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Wed, 31 Jul 2024 11:44:33 -0600 Subject: [PATCH 16/22] format --- cloudinit/socket.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 08db4995e59..3e5e92f5bb9 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -157,7 +157,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): # the returned message will be executed in a subshell # hardcode this message rather than sending a more informative message # to avoid having to sanitize inputs (to prevent escaping the shell) - sock.sendall(f"echo '{message}'; exit {self.systemd_exit_code};".encode()) + sock.sendall( + f"echo '{message}'; exit {self.systemd_exit_code};".encode() + ) sock.close() # suppress exception - the exception was logged and the init system From 5089e59d74040bf05662dd80507ed63251b1325b Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Wed, 31 Jul 2024 12:01:22 -0600 Subject: [PATCH 17/22] plumb substage failures to the exit code of cloud-init-main.service --- cloudinit/cmd/main.py | 10 ++++++---- cloudinit/socket.py | 5 +++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 332d2b60f15..59dc48ed21f 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -1218,11 +1218,13 @@ def all_stages(parser): sync.systemd_exit_code = sub_main(args) # signal completion to cloud-init-main.service - if sync.first_exception: + if sync.experienced_any_error: + message = "a stage of cloud-init exited non-zero" + if sync.first_exception: + message = "first exception received: {sync.first_exception}" socket.sd_notify( - "STATUS=Completed with failure, first exception received: " - f"{sync.first_exception}. Run 'cloud-init status --long' for more " - "details." + f"STATUS=Completed with failure, {message}. Run 'cloud-init status" + "--long' for more details." ) socket.sd_notify("STOPPING=1") # exit 1 for a fatal failure in any stage diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 3e5e92f5bb9..182844bf786 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -58,6 +58,7 @@ def __init__(self, *names: str): self.remote = "" self.first_exception = "" self.systemd_exit_code = 0 + self.experienced_any_error = False self.sockets = { name: socket.socket( socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC @@ -141,6 +142,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): if exc_type: # handle exception thrown in context self.systemd_exit_code = 1 + self.experienced_any_error = True status = f"{repr(exc_val)} in {exc_tb.tb_frame}" message = ( 'fatal error, run "systemctl status cloud-init-main.service" ' @@ -151,6 +153,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): LOG.fatal(status) sd_notify(f"STATUS={status}") + self.experienced_any_error = self.experienced_any_error or bool( + self.systemd_exit_code + ) sock = self.sockets[self.stage] sock.connect(self.remote) From a053c193064fae04bf6118bb3ba63dd5471c3c86 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Thu, 1 Aug 2024 17:18:07 -0600 Subject: [PATCH 18/22] Clean up UI - remove logs duplicated across stages - send the single line traceback to systemd - fix a minor string format in user output --- cloudinit/cmd/main.py | 15 +++++++++------ cloudinit/log.py | 3 +++ cloudinit/socket.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 59dc48ed21f..1fe49a1c7e7 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -816,9 +816,10 @@ def status_wrapper(name, args): ) v1[mode]["start"] = float(util.uptime()) - preexisting_recoverable_errors = next( + handler = next( filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) - ).export_logs() + ) + preexisting_recoverable_errors = handler.export_logs() # Write status.json prior to running init / module code atomic_helper.write_json(status_path, status) @@ -859,11 +860,13 @@ def status_wrapper(name, args): v1["stage"] = None # merge new recoverable errors into existing recoverable error list - new_recoverable_errors = next( + handler = next( filter( lambda h: isinstance(h, log.LogExporter), root_logger.handlers ) - ).export_logs() + ) + new_recoverable_errors = handler.export_logs() + handler.clean_logs() for key in new_recoverable_errors.keys(): if key in preexisting_recoverable_errors: v1[mode]["recoverable_errors"][key] = list( @@ -1221,10 +1224,10 @@ def all_stages(parser): if sync.experienced_any_error: message = "a stage of cloud-init exited non-zero" if sync.first_exception: - message = "first exception received: {sync.first_exception}" + message = f"first exception received: {sync.first_exception}" socket.sd_notify( f"STATUS=Completed with failure, {message}. Run 'cloud-init status" - "--long' for more details." + " --long' for more details." ) socket.sd_notify("STOPPING=1") # exit 1 for a fatal failure in any stage diff --git a/cloudinit/log.py b/cloudinit/log.py index 61b96262aa1..983b426b7ce 100644 --- a/cloudinit/log.py +++ b/cloudinit/log.py @@ -152,6 +152,9 @@ def emit(self, record: logging.LogRecord): def export_logs(self): return copy.deepcopy(self.holder) + def clean_logs(self): + self.holder = defaultdict(list) + def flush(self): pass diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 182844bf786..10e4ae2b6c2 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -149,7 +149,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): 'and "cloud-init status --long" for more details' ) if not self.first_exception: - self.first_exception = message + self.first_exception = status LOG.fatal(status) sd_notify(f"STATUS={status}") From 9ca7a97280820a050d3c5f7755374c1b9261d46d Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Thu, 1 Aug 2024 17:33:29 -0600 Subject: [PATCH 19/22] format --- cloudinit/cmd/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 1fe49a1c7e7..60f94bb8a0f 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -865,7 +865,7 @@ def status_wrapper(name, args): lambda h: isinstance(h, log.LogExporter), root_logger.handlers ) ) - new_recoverable_errors = handler.export_logs() + new_recoverable_errors = handler.export_logs() handler.clean_logs() for key in new_recoverable_errors.keys(): if key in preexisting_recoverable_errors: From 7df7d8378a169ac7d69f9417b29e66a0ee4c1dfa Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 2 Aug 2024 14:06:18 -0600 Subject: [PATCH 20/22] comments --- cloudinit/cmd/main.py | 5 ----- cloudinit/socket.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 60f94bb8a0f..2de9826bb83 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -860,11 +860,6 @@ def status_wrapper(name, args): v1["stage"] = None # merge new recoverable errors into existing recoverable error list - handler = next( - filter( - lambda h: isinstance(h, log.LogExporter), root_logger.handlers - ) - ) new_recoverable_errors = handler.export_logs() handler.clean_logs() for key in new_recoverable_errors.keys(): diff --git a/cloudinit/socket.py b/cloudinit/socket.py index 10e4ae2b6c2..7ef19f43798 100644 --- a/cloudinit/socket.py +++ b/cloudinit/socket.py @@ -138,7 +138,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Notify the socket that this stage is complete.""" - message = "done" + message = f"Completed socket interaction for boot stage {self.stage}" if exc_type: # handle exception thrown in context self.systemd_exit_code = 1 From cb7bb255c2231159a30f48522d397995244a296d Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 2 Aug 2024 14:21:05 -0600 Subject: [PATCH 21/22] fix merge conflicts --- cloudinit/config/cc_mounts.py | 2 +- tests/unittests/config/test_cc_mounts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudinit/config/cc_mounts.py b/cloudinit/config/cc_mounts.py index 5efcec946d8..1d9f821bbd0 100644 --- a/cloudinit/config/cc_mounts.py +++ b/cloudinit/config/cc_mounts.py @@ -524,7 +524,7 @@ def handle(name: str, cfg: Config, cloud: Cloud, args: list) -> None: # fs_spec, fs_file, fs_vfstype, fs_mntops, fs-freq, fs_passno uses_systemd = cloud.distro.uses_systemd() default_mount_options = ( - "defaults,nofail,x-systemd.after=cloud-init.service,_netdev" + "defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev" if uses_systemd else "defaults,nobootwait" ) diff --git a/tests/unittests/config/test_cc_mounts.py b/tests/unittests/config/test_cc_mounts.py index f3cd0472970..7e85987b744 100644 --- a/tests/unittests/config/test_cc_mounts.py +++ b/tests/unittests/config/test_cc_mounts.py @@ -566,7 +566,7 @@ def test_fstab_mounts_combinations(self): LABEL=UEFI /dev/sda4 /mnt2 auto nofail,comment=cloudconfig 1 2 /dev/sda5 /mnt3 auto defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 - /dev/sda1 /mnt xfs defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 + /dev/sda1 /mnt xfs auto,comment=cloudconfig 0 2 /dev/sda3 /mnt4 btrfs defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 /dev/sdb1 none swap sw,comment=cloudconfig 0 0 """ # noqa: E501 From 4b7dbbb09b3b9df805d742dcfca02c4ff72d8afa Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 2 Aug 2024 14:37:52 -0600 Subject: [PATCH 22/22] update unit tests --- tests/unittests/test_all_stages.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/unittests/test_all_stages.py b/tests/unittests/test_all_stages.py index 1ea47200295..90bde5e1add 100644 --- a/tests/unittests/test_all_stages.py +++ b/tests/unittests/test_all_stages.py @@ -76,7 +76,7 @@ def test_all_stages(tmp_path): After a socket has been been bound but before it has started listening """ - expected = b"echo 'done'; exit 0;" + expected = "echo 'Completed socket interaction for boot stage {}'; exit 0;" with mock.patch.object( ci_socket, "DEFAULT_RUN_DIR", tmp_path ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( @@ -96,17 +96,17 @@ def test_all_stages(tmp_path): pass # check that the first sync returned - assert expected == first.receive() + assert expected.format("first").encode() == first.receive() # "wait" on the second sync event with sync("second"): pass # check that the second sync returned - assert expected == second.receive() + assert expected.format("second").encode() == second.receive() # "wait" on the third sync event with sync("third"): pass # check that the third sync returned - assert expected == third.receive() + assert expected.format("third").encode() == third.receive() def test_all_stages_threaded(tmp_path): @@ -116,7 +116,7 @@ def test_all_stages_threaded(tmp_path): max_sleep = 100 # initialize random number generator random.seed(time.time()) - expected = b"echo 'done'; exit 0;" + expected = "echo 'Completed socket interaction for boot stage {}'; exit 0;" sync_storage = {} def syncer(index: int, name: str): @@ -154,32 +154,32 @@ def syncer(index: int, name: str): pass # check that the first sync returned - assert expected == sync_storage[1].receive() + assert expected.format("first").encode() == sync_storage[1].receive() # wait on the second sync event with sync("second"): pass # check that the second sync returned - assert expected == sync_storage[2].receive() + assert expected.format("second").encode() == sync_storage[2].receive() # wait on the third sync event with sync("third"): pass # check that the third sync returned - assert expected == sync_storage[3].receive() + assert expected.format("third").encode() == sync_storage[3].receive() with sync("fourth"): pass # check that the fourth sync returned - assert expected == sync_storage[4].receive() + assert expected.format("fourth").encode() == sync_storage[4].receive() with sync("fifth"): pass # check that the fifth sync returned - assert expected == sync_storage[5].receive() + assert expected.format("fifth").encode() == sync_storage[5].receive() def test_all_stages_exception(tmp_path):