diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 54ba79e1bbc..2de9826bb83 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -24,6 +24,7 @@ from cloudinit import netinfo from cloudinit import signal_handler from cloudinit import sources +from cloudinit import socket from cloudinit import stages from cloudinit import url_helper from cloudinit import util @@ -38,7 +39,12 @@ from cloudinit.config.schema import validate_cloudconfig_schema from cloudinit import log from cloudinit.reporting import events -from cloudinit.settings import PER_INSTANCE, PER_ALWAYS, PER_ONCE, CLOUD_CONFIG +from cloudinit.settings import ( + PER_INSTANCE, + PER_ALWAYS, + PER_ONCE, + CLOUD_CONFIG, +) # Welcome message template WELCOME_MSG_TPL = ( @@ -362,8 +368,11 @@ def main_init(name, args): outfmt = None errfmt = None try: - close_stdin(lambda msg: early_logs.append((logging.DEBUG, msg))) - outfmt, errfmt = util.fixup_output(init.cfg, name) + if not args.skip_log_setup: + close_stdin(lambda msg: early_logs.append((logging.DEBUG, msg))) + outfmt, errfmt = util.fixup_output(init.cfg, name) + else: + outfmt, errfmt = util.get_output_cfg(init.cfg, name) except Exception: msg = "Failed to setup output redirection!" util.logexc(LOG, msg) @@ -375,8 +384,9 @@ def main_init(name, args): "Logging being reset, this logger may no longer be active shortly" ) log.reset_logging() - log.setup_logging(init.cfg) - apply_reporting_cfg(init.cfg) + if not args.skip_log_setup: + log.setup_logging(init.cfg) + apply_reporting_cfg(init.cfg) # Any log usage prior to setup_logging above did not have local user log # config applied. We send the welcome message now, as stderr/out have @@ -633,8 +643,9 @@ def main_modules(action_name, args): mods = Modules(init, extract_fns(args), reporter=args.reporter) # Stage 4 try: - close_stdin() - util.fixup_output(mods.cfg, name) + if not args.skip_log_setup: + close_stdin() + util.fixup_output(mods.cfg, name) except Exception: util.logexc(LOG, "Failed to setup output redirection!") if args.debug: @@ -643,8 +654,9 @@ def main_modules(action_name, args): "Logging being reset, this logger may no longer be active shortly" ) log.reset_logging() - log.setup_logging(mods.cfg) - apply_reporting_cfg(init.cfg) + if not args.skip_log_setup: + log.setup_logging(mods.cfg) + apply_reporting_cfg(init.cfg) # now that logging is setup and stdout redirected, send welcome welcome(name, msg=w_msg) @@ -804,9 +816,10 @@ def status_wrapper(name, args): ) v1[mode]["start"] = float(util.uptime()) - preexisting_recoverable_errors = next( + handler = next( filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers) - ).export_logs() + ) + preexisting_recoverable_errors = handler.export_logs() # Write status.json prior to running init / module code atomic_helper.write_json(status_path, status) @@ -847,11 +860,8 @@ def status_wrapper(name, args): v1["stage"] = None # merge new recoverable errors into existing recoverable error list - new_recoverable_errors = next( - filter( - lambda h: isinstance(h, log.LogExporter), root_logger.handlers - ) - ).export_logs() + new_recoverable_errors = handler.export_logs() + handler.clean_logs() for key in new_recoverable_errors.keys(): if key in preexisting_recoverable_errors: v1[mode]["recoverable_errors"][key] = list( @@ -953,9 +963,19 @@ def main(sysv_args=None): default=False, ) + parser.add_argument( + "--all-stages", + dest="all_stages", + action="store_true", + help=( + "Run cloud-init's stages under a single process using a " + "syncronization protocol. This is not intended for CLI usage." + ), + default=False, + ) + parser.set_defaults(reporter=None) subparsers = parser.add_subparsers(title="Subcommands", dest="subcommand") - subparsers.required = True # Each action and its sub-options (if any) parser_init = subparsers.add_parser( @@ -1143,8 +1163,76 @@ def main(sysv_args=None): status_parser(parser_status) parser_status.set_defaults(action=("status", handle_status_args)) + else: + parser.error("a subcommand is required") args = parser.parse_args(args=sysv_args) + setattr(args, "skip_log_setup", False) + if not args.all_stages: + return sub_main(args) + return all_stages(parser) + + +def all_stages(parser): + """Run all stages in a single process using an ordering protocol.""" + LOG.info("Running cloud-init in single process mode.") + + # this _must_ be called before sd_notify is called otherwise netcat may + # attempt to send "start" before a socket exists + sync = socket.SocketSync("local", "network", "config", "final") + + # notify systemd that this stage has completed + socket.sd_notify("READY=1") + # wait for cloud-init-local.service to start + with sync("local"): + # set up logger + args = parser.parse_args(args=["init", "--local"]) + args.skip_log_setup = False + # run local stage + sync.systemd_exit_code = sub_main(args) + + # wait for cloud-init-network.service to start + with sync("network"): + # skip re-setting up logger + args = parser.parse_args(args=["init"]) + args.skip_log_setup = True + # run init stage + sync.systemd_exit_code = sub_main(args) + + # wait for cloud-config.service to start + with sync("config"): + # skip re-setting up logger + args = parser.parse_args(args=["modules", "--mode=config"]) + args.skip_log_setup = True + # run config stage + sync.systemd_exit_code = sub_main(args) + + # wait for cloud-final.service to start + with sync("final"): + # skip re-setting up logger + args = parser.parse_args(args=["modules", "--mode=final"]) + args.skip_log_setup = True + # run final stage + sync.systemd_exit_code = sub_main(args) + + # signal completion to cloud-init-main.service + if sync.experienced_any_error: + message = "a stage of cloud-init exited non-zero" + if sync.first_exception: + message = f"first exception received: {sync.first_exception}" + socket.sd_notify( + f"STATUS=Completed with failure, {message}. Run 'cloud-init status" + " --long' for more details." + ) + socket.sd_notify("STOPPING=1") + # exit 1 for a fatal failure in any stage + return 1 + else: + socket.sd_notify("STATUS=Completed") + socket.sd_notify("STOPPING=1") + + +def sub_main(args): # Subparsers.required = True and each subparser sets action=(name, functor) (name, functor) = args.action diff --git a/cloudinit/cmd/status.py b/cloudinit/cmd/status.py index 39089802984..f027321ce22 100644 --- a/cloudinit/cmd/status.py +++ b/cloudinit/cmd/status.py @@ -318,8 +318,9 @@ def systemd_failed(wait: bool) -> bool: for service in [ "cloud-final.service", "cloud-config.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-init-local.service", + "cloud-init-main.service", ]: try: stdout = query_systemctl( diff --git a/cloudinit/config/cc_mounts.py b/cloudinit/config/cc_mounts.py index 5efcec946d8..1d9f821bbd0 100644 --- a/cloudinit/config/cc_mounts.py +++ b/cloudinit/config/cc_mounts.py @@ -524,7 +524,7 @@ def handle(name: str, cfg: Config, cloud: Cloud, args: list) -> None: # fs_spec, fs_file, fs_vfstype, fs_mntops, fs-freq, fs_passno uses_systemd = cloud.distro.uses_systemd() default_mount_options = ( - "defaults,nofail,x-systemd.after=cloud-init.service,_netdev" + "defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev" if uses_systemd else "defaults,nobootwait" ) diff --git a/cloudinit/config/schemas/schema-cloud-config-v1.json b/cloudinit/config/schemas/schema-cloud-config-v1.json index c05bd994212..4ae8b4a8f70 100644 --- a/cloudinit/config/schemas/schema-cloud-config-v1.json +++ b/cloudinit/config/schemas/schema-cloud-config-v1.json @@ -2022,12 +2022,12 @@ }, "mount_default_fields": { "type": "array", - "description": "Default mount configuration for any mount entry with less than 6 options provided. When specified, 6 items are required and represent ``/etc/fstab`` entries. Default: ``defaults,nofail,x-systemd.after=cloud-init.service,_netdev``", + "description": "Default mount configuration for any mount entry with less than 6 options provided. When specified, 6 items are required and represent ``/etc/fstab`` entries. Default: ``defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev``", "default": [ null, null, "auto", - "defaults,nofail,x-systemd.after=cloud-init.service", + "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2" ], diff --git a/cloudinit/log.py b/cloudinit/log.py index 61b96262aa1..983b426b7ce 100644 --- a/cloudinit/log.py +++ b/cloudinit/log.py @@ -152,6 +152,9 @@ def emit(self, record: logging.LogRecord): def export_logs(self): return copy.deepcopy(self.holder) + def clean_logs(self): + self.holder = defaultdict(list) + def flush(self): pass diff --git a/cloudinit/socket.py b/cloudinit/socket.py new file mode 100644 index 00000000000..7ef19f43798 --- /dev/null +++ b/cloudinit/socket.py @@ -0,0 +1,174 @@ +# This file is part of cloud-init. See LICENSE file for license information. +"""A module for common socket helpers.""" +import logging +import os +import socket +import sys +import time +from contextlib import suppress + +from cloudinit.settings import DEFAULT_RUN_DIR + +LOG = logging.getLogger(__name__) + + +def sd_notify(message: str): + """Send a sd_notify message. + + :param message: sd-notify message (must be valid ascii) + """ + socket_path = os.environ.get("NOTIFY_SOCKET", "") + + if not socket_path: + # not running under systemd, no-op + return + + elif socket_path[0] == "@": + # abstract + socket_path.replace("@", "\0", 1) + + # unix domain + elif socket_path[0] != "/": + raise OSError("Unsupported socket type") + + with socket.socket( + socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC + ) as sock: + LOG.info("Sending sd_notify(%s)", str(message)) + sock.connect(socket_path) + sock.sendall(message.encode("ascii")) + + +class SocketSync: + """A two way synchronization protocol over Unix domain sockets.""" + + def __init__(self, *names: str): + """Initialize a synchronization context. + + 1) Ensure that the socket directory exists. + 2) Bind a socket for each stage. + + Binding the sockets on initialization allows receipt of stage + "start" notifications prior to the cloud-init stage being ready to + start. + + :param names: stage names, used as a unique identifiers + """ + self.stage = "" + self.remote = "" + self.first_exception = "" + self.systemd_exit_code = 0 + self.experienced_any_error = False + self.sockets = { + name: socket.socket( + socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC + ) + for name in names + } + # ensure the directory exists + os.makedirs(f"{DEFAULT_RUN_DIR}/share", mode=0o700, exist_ok=True) + # removing stale sockets and bind + for name, sock in self.sockets.items(): + socket_path = f"{DEFAULT_RUN_DIR}/share/{name}.sock" + with suppress(FileNotFoundError): + os.remove(socket_path) + sock.bind(socket_path) + + def __call__(self, stage: str): + """Set the stage before entering context. + + This enables the context manager to be initialized separately from + each stage synchronization. + + :param stage: the name of a stage to synchronize + + Example: + sync = SocketSync("stage 1", "stage 2"): + with sync("stage 1"): + pass + with sync("stage 2"): + pass + """ + if stage not in self.sockets: + raise ValueError(f"Invalid stage name: {stage}") + self.stage = stage + return self + + def __enter__(self): + """Wait until a message has been received on this stage's socket. + + Once the message has been received, enter the context. + """ + if os.isatty(sys.stdin.fileno()): + LOG.info( + "Stdin is a tty, so skipping stage synchronization protocol" + ) + return + self.systemd_exit_code = 0 + sd_notify( + "STATUS=Waiting on external services to " + f"complete before starting the {self.stage} stage." + ) + start_time = time.monotonic() + # block until init system sends us data + # the first value returned contains a message from the init system + # (should be "start") + # the second value contains the path to a unix socket on which to + # reply, which is expected to be /path/to/{self.stage}-return.sock + sock = self.sockets[self.stage] + chunk, self.remote = sock.recvfrom(5) + + if b"start" != chunk: + # The protocol expects to receive a command "start" + self.__exit__(None, None, None) + raise ValueError(f"Received invalid message: [{str(chunk)}]") + elif f"{DEFAULT_RUN_DIR}/share/{self.stage}-return.sock" != str( + self.remote + ): + # assert that the return path is in a directory with appropriate + # permissions + self.__exit__(None, None, None) + raise ValueError(f"Unexpected path to unix socket: {self.remote}") + + total = time.monotonic() - start_time + time_msg = f"took {total: .3f}s to " if total > 0.01 else "" + sd_notify(f"STATUS=Running ({self.stage} stage)") + LOG.debug("sync(%s): synchronization %scomplete", self.stage, time_msg) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Notify the socket that this stage is complete.""" + message = f"Completed socket interaction for boot stage {self.stage}" + if exc_type: + # handle exception thrown in context + self.systemd_exit_code = 1 + self.experienced_any_error = True + status = f"{repr(exc_val)} in {exc_tb.tb_frame}" + message = ( + 'fatal error, run "systemctl status cloud-init-main.service" ' + 'and "cloud-init status --long" for more details' + ) + if not self.first_exception: + self.first_exception = status + LOG.fatal(status) + sd_notify(f"STATUS={status}") + + self.experienced_any_error = self.experienced_any_error or bool( + self.systemd_exit_code + ) + sock = self.sockets[self.stage] + sock.connect(self.remote) + + # the returned message will be executed in a subshell + # hardcode this message rather than sending a more informative message + # to avoid having to sanitize inputs (to prevent escaping the shell) + sock.sendall( + f"echo '{message}'; exit {self.systemd_exit_code};".encode() + ) + sock.close() + + # suppress exception - the exception was logged and the init system + # notified of stage completion (and the exception received as a status + # message). Raising an exception would block the rest of boot, so carry + # on in a degraded state. + return True diff --git a/config/cloud.cfg.tmpl b/config/cloud.cfg.tmpl index 4b1efdbcbf1..bc3e6067ec4 100644 --- a/config/cloud.cfg.tmpl +++ b/config/cloud.cfg.tmpl @@ -62,7 +62,7 @@ disable_root: true "openmandriva", "photon", "TencentOS"] or is_rhel %} {% if is_rhel %} -mount_default_fields: [~, ~, 'auto', 'defaults,nofail,x-systemd.after=cloud-init.service,_netdev', '0', '2'] +mount_default_fields: [~, ~, 'auto', 'defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev', '0', '2'] {% else %} mount_default_fields: [~, ~, 'auto', 'defaults,nofail', '0', '2'] {% endif %} diff --git a/doc/module-docs/cc_mounts/data.yaml b/doc/module-docs/cc_mounts/data.yaml index 751b301d501..18193f062d3 100644 --- a/doc/module-docs/cc_mounts/data.yaml +++ b/doc/module-docs/cc_mounts/data.yaml @@ -18,7 +18,7 @@ cc_mounts: .. code-block:: yaml mounts: - - ["ephemeral0", "/mnt", "auto", "defaults,nofail,x-systemd.after=cloud-init.service", "0", "2"] + - ["ephemeral0", "/mnt", "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] - ["swap", "none", "swap", "sw", "0", "0"] In order to remove a previously-listed mount, an entry can be added to the @@ -32,7 +32,7 @@ cc_mounts: .. code-block:: yaml - mount_default_fields: [none, none, "auto", "defaults,nofail,x-systemd.after=cloud-init.service", "0", "2"] + mount_default_fields: [none, none, "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] Non-systemd init systems will vary in ``mount_default_fields``. diff --git a/doc/rtd/explanation/boot.rst b/doc/rtd/explanation/boot.rst index a975ca7a093..ff3b65ebd28 100644 --- a/doc/rtd/explanation/boot.rst +++ b/doc/rtd/explanation/boot.rst @@ -108,7 +108,7 @@ Network ======= +------------------+----------------------------------------------------------+ -| systemd service | ``cloud-init.service`` | +| systemd service | ``cloud-init-network.service`` | +---------+--------+----------------------------------------------------------+ | runs | after local stage and configured networking is up | +---------+--------+----------------------------------------------------------+ diff --git a/doc/rtd/howto/debugging.rst b/doc/rtd/howto/debugging.rst index c8b2a2634bc..546e8dd9f45 100644 --- a/doc/rtd/howto/debugging.rst +++ b/doc/rtd/howto/debugging.rst @@ -55,7 +55,7 @@ Cloud-init did not run .. code-block:: - systemctl status cloud-init-local.service cloud-init.service\ + systemctl status cloud-init-local.service cloud-init-network.service\ cloud-config.service cloud-final.service Cloud-init may have started to run, but not completed. This shows how many, diff --git a/packages/redhat/cloud-init.spec.in b/packages/redhat/cloud-init.spec.in index bc57fe9aac9..672cd426673 100644 --- a/packages/redhat/cloud-init.spec.in +++ b/packages/redhat/cloud-init.spec.in @@ -124,7 +124,7 @@ if [ $1 -eq 1 ] then /bin/systemctl enable cloud-config.service >/dev/null 2>&1 || : /bin/systemctl enable cloud-final.service >/dev/null 2>&1 || : - /bin/systemctl enable cloud-init.service >/dev/null 2>&1 || : + /bin/systemctl enable cloud-init-network.service >/dev/null 2>&1 || : /bin/systemctl enable cloud-init-local.service >/dev/null 2>&1 || : fi %else @@ -141,7 +141,7 @@ if [ $1 -eq 0 ] then /bin/systemctl --no-reload disable cloud-config.service >/dev/null 2>&1 || : /bin/systemctl --no-reload disable cloud-final.service >/dev/null 2>&1 || : - /bin/systemctl --no-reload disable cloud-init.service >/dev/null 2>&1 || : + /bin/systemctl --no-reload disable cloud-init-network.service >/dev/null 2>&1 || : /bin/systemctl --no-reload disable cloud-init-local.service >/dev/null 2>&1 || : fi %else diff --git a/systemd/cloud-config.service.tmpl b/systemd/cloud-config.service.tmpl index 79c75c71ae6..9067d6e4bc0 100644 --- a/systemd/cloud-config.service.tmpl +++ b/systemd/cloud-config.service.tmpl @@ -10,7 +10,14 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init modules --mode=config +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/config.sock -s /run/cloud-init/share/config-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-config.target b/systemd/cloud-config.target index 2d65e3433ce..be754bbd19d 100644 --- a/systemd/cloud-config.target +++ b/systemd/cloud-config.target @@ -14,5 +14,5 @@ [Unit] Description=Cloud-config availability -Wants=cloud-init-local.service cloud-init.service -After=cloud-init-local.service cloud-init.service +Wants=cloud-init-local.service cloud-init-network.service +After=cloud-init-local.service cloud-init-network.service diff --git a/systemd/cloud-final.service.tmpl b/systemd/cloud-final.service.tmpl index b66533643d3..9fb2f681f73 100644 --- a/systemd/cloud-final.service.tmpl +++ b/systemd/cloud-final.service.tmpl @@ -15,10 +15,16 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init modules --mode=final +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/final.sock -s /run/cloud-init/share/final-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 -KillMode=process {% if variant in ["almalinux", "cloudlinux", "rhel"] %} # Restart NetworkManager if it is present and running. ExecStartPost=/bin/sh -c 'u=NetworkManager.service; \ diff --git a/systemd/cloud-init-local.service.tmpl b/systemd/cloud-init-local.service.tmpl index 0da2d8337e9..b0a534b8f9a 100644 --- a/systemd/cloud-init-local.service.tmpl +++ b/systemd/cloud-init-local.service.tmpl @@ -7,7 +7,6 @@ DefaultDependencies=no {% endif %} Wants=network-pre.target After=hv_kvp_daemon.service -After=systemd-remount-fs.service {% if variant in ["almalinux", "cloudlinux", "rhel"] %} Requires=dbus.socket After=dbus.socket @@ -38,7 +37,14 @@ ExecStartPre=/bin/mkdir -p /run/cloud-init ExecStartPre=/sbin/restorecon /run/cloud-init ExecStartPre=/usr/bin/touch /run/cloud-init/enabled {% endif %} -ExecStart=/usr/bin/cloud-init init --local +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/local.sock -s /run/cloud-init/share/local-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/systemd/cloud-init-main.service.tmpl b/systemd/cloud-init-main.service.tmpl new file mode 100644 index 00000000000..1ddfd62073e --- /dev/null +++ b/systemd/cloud-init-main.service.tmpl @@ -0,0 +1,52 @@ +## template:jinja +# systemd ordering resources +# ========================== +# https://systemd.io/NETWORK_ONLINE/ +# https://docs.cloud-init.io/en/latest/explanation/boot.html +# https://www.freedesktop.org/wiki/Software/systemd/NetworkTarget/ +# https://www.freedesktop.org/software/systemd/man/latest/systemd.special.html +# https://www.freedesktop.org/software/systemd/man/latest/systemd-remount-fs.service.html +[Unit] +Description=Cloud-init: Single Process +Wants=network-pre.target +{% if variant in ["almalinux", "cloudlinux", "ubuntu", "unknown", "debian", "rhel"] %} +DefaultDependencies=no +{% endif %} +{% if variant in ["almalinux", "cloudlinux", "rhel"] %} +Requires=dbus.socket +After=dbus.socket +Before=network.service +Before=firewalld.target +Conflicts=shutdown.target +{% endif %} +{% if variant in ["ubuntu", "unknown", "debian"] %} +Before=sysinit.target +Conflicts=shutdown.target +{% endif %} + +After=systemd-remount-fs.service +Before=sysinit.target +Before=cloud-init-local.service +Conflicts=shutdown.target +RequiresMountsFor=/var/lib/cloud +ConditionPathExists=!/etc/cloud/cloud-init.disabled +ConditionKernelCommandLine=!cloud-init=disabled +ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled + +[Service] +Type=notify +ExecStart=/usr/bin/cloud-init --all-stages +KillMode=process +TasksMax=infinity +TimeoutStartSec=infinity +{% if variant in ["almalinux", "cloudlinux", "rhel"] %} +ExecStartPre=/bin/mkdir -p /run/cloud-init +ExecStartPre=/sbin/restorecon /run/cloud-init +ExecStartPre=/usr/bin/touch /run/cloud-init/enabled +{% endif %} + +# Output needs to appear in instance console output +StandardOutput=journal+console + +[Install] +WantedBy=cloud-init.target diff --git a/systemd/cloud-init.service.tmpl b/systemd/cloud-init-network.service.tmpl similarity index 71% rename from systemd/cloud-init.service.tmpl rename to systemd/cloud-init-network.service.tmpl index 58031cc4331..6957b39f1ee 100644 --- a/systemd/cloud-init.service.tmpl +++ b/systemd/cloud-init-network.service.tmpl @@ -46,7 +46,14 @@ ConditionEnvironment=!KERNEL_CMDLINE=cloud-init=disabled [Service] Type=oneshot -ExecStart=/usr/bin/cloud-init init +# This service is a shim which preserves systemd ordering while allowing a +# single Python process to run cloud-init's logic. This works by communicating +# with the cloud-init process over a unix socket to tell the process that this +# stage can start, and then wait on a return socket until the cloud-init +# process has completed this stage. The output from the return socket is piped +# into a shell so that the process can send a completion message (defaults to +# "done", otherwise includes an error message) and an exit code to systemd. +ExecStart=sh -c 'echo "start" | nc.openbsd -Uu -W1 /run/cloud-init/share/network.sock -s /run/cloud-init/share/network-return.sock | sh' RemainAfterExit=yes TimeoutSec=0 diff --git a/tests/integration_tests/assets/enable_coverage.py b/tests/integration_tests/assets/enable_coverage.py index ed71ceef8f5..1d18fcbef04 100644 --- a/tests/integration_tests/assets/enable_coverage.py +++ b/tests/integration_tests/assets/enable_coverage.py @@ -2,7 +2,7 @@ services = [ "cloud-init-local.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-config.service", "cloud-final.service", ] diff --git a/tests/integration_tests/assets/enable_profile.py b/tests/integration_tests/assets/enable_profile.py index a6a0070c3c5..9b68e42ce05 100644 --- a/tests/integration_tests/assets/enable_profile.py +++ b/tests/integration_tests/assets/enable_profile.py @@ -2,7 +2,7 @@ services = [ "cloud-init-local.service", - "cloud-init.service", + "cloud-init-network.service", "cloud-config.service", "cloud-final.service", ] diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index c3b8531ae92..8ba5a81b2b5 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -191,7 +191,7 @@ def _collect_profile(instance: IntegrationInstance, log_dir: Path): log_dir / "profile" / "local.stats", ) instance.pull_file( - "/var/log/cloud-init.service.stats", + "/var/log/cloud-init-network.service.stats", log_dir / "profile" / "network.stats", ) instance.pull_file( diff --git a/tests/integration_tests/datasources/test_nocloud.py b/tests/integration_tests/datasources/test_nocloud.py index 24aecc0bd8d..c3462c433a3 100644 --- a/tests/integration_tests/datasources/test_nocloud.py +++ b/tests/integration_tests/datasources/test_nocloud.py @@ -162,7 +162,7 @@ def test_smbios_seed_network(self, client: IntegrationInstance): """\ [Unit] Description=Serve a local webserver - Before=cloud-init.service + Before=cloud-init-network.service Wants=cloud-init-local.service DefaultDependencies=no After=systemd-networkd-wait-online.service @@ -354,7 +354,7 @@ def _boot_with_cmdline( # and NoCloud operates in network timeframe After=systemd-networkd-wait-online.service After=networking.service - Before=cloud-init.service + Before=cloud-init-network.service [Service] Type=exec diff --git a/tests/unittests/cmd/test_main.py b/tests/unittests/cmd/test_main.py index f9b3faab130..bad728f2a72 100644 --- a/tests/unittests/cmd/test_main.py +++ b/tests/unittests/cmd/test_main.py @@ -13,7 +13,9 @@ from cloudinit.util import ensure_dir, load_text_file, write_file from tests.unittests.helpers import FilesystemMockingTestCase, wrap_and_call -MyArgs = namedtuple("MyArgs", "debug files force local reporter subcommand") +MyArgs = namedtuple( + "MyArgs", "debug files force local reporter subcommand skip_log_setup" +) class TestMain(FilesystemMockingTestCase): @@ -76,6 +78,7 @@ def test_main_init_run_net_runs_modules(self): local=False, reporter=None, subcommand="init", + skip_log_setup=False, ) (_item1, item2) = wrap_and_call( "cloudinit.cmd.main", @@ -122,6 +125,7 @@ def test_main_init_run_net_calls_set_hostname_when_metadata_present(self): local=False, reporter=None, subcommand="init", + skip_log_setup=False, ) def set_hostname(name, cfg, cloud, args): diff --git a/tests/unittests/config/test_cc_mounts.py b/tests/unittests/config/test_cc_mounts.py index 9982b6741c6..7e85987b744 100644 --- a/tests/unittests/config/test_cc_mounts.py +++ b/tests/unittests/config/test_cc_mounts.py @@ -565,9 +565,9 @@ def test_fstab_mounts_combinations(self): LABEL=keepme none ext4 defaults 0 0 LABEL=UEFI /dev/sda4 /mnt2 auto nofail,comment=cloudconfig 1 2 - /dev/sda5 /mnt3 auto defaults,nofail,x-systemd.after=cloud-init.service,_netdev,comment=cloudconfig 0 2 + /dev/sda5 /mnt3 auto defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 /dev/sda1 /mnt xfs auto,comment=cloudconfig 0 2 - /dev/sda3 /mnt4 btrfs defaults,nofail,x-systemd.after=cloud-init.service,_netdev,comment=cloudconfig 0 2 + /dev/sda3 /mnt4 btrfs defaults,nofail,x-systemd.after=cloud-init-network.service,_netdev,comment=cloudconfig 0 2 /dev/sdb1 none swap sw,comment=cloudconfig 0 0 """ # noqa: E501 ).strip() diff --git a/tests/unittests/test_all_stages.py b/tests/unittests/test_all_stages.py new file mode 100644 index 00000000000..90bde5e1add --- /dev/null +++ b/tests/unittests/test_all_stages.py @@ -0,0 +1,208 @@ +import random +import signal +import socket +import time +from threading import Thread +from unittest import mock + +from cloudinit import socket as ci_socket + + +class Sync: + """A device to send and receive synchronization messages + + Creating an instance of the device sends a b"start" + """ + + def __init__(self, name: str, path: str): + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + self.sock.connect(f"{path}/share/{name}.sock") + self.sock.bind(f"{path}/share/{name}-return.sock") + self.sock.sendall(b"start") + + def receive(self): + """receive 5 bytes from the socket""" + received = self.sock.recv(4096) + self.sock.close() + return received + + +class Timeout: + """A utility which may be used to verify that a timeout occurs + + TimeoutError is raised on successful timeout. + + Create a signal handler and use signal.alarm to verify that the + timeout occured. + """ + + def handle_timeout(self, *_): + raise TimeoutError() + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + # 1 second is, unfortunately, the minimum + signal.alarm(1) + + def __exit__(self, *_): + signal.alarm(0) + + +def test_all_stages_times_out(tmp_path): + """Verify that no "start" makes the protocol block""" + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): + sync = ci_socket.SocketSync("first") + + try: + with Timeout(): + # this should block for 1 second + with sync("first"): + pass + except TimeoutError: + # success is a timeout + pass + else: + raise AssertionError("Expected the thing to timeout!") + + +def test_all_stages(tmp_path): + """Verify that a socket can store "start" messages + + After a socket has been been bound but before it has started listening + """ + expected = "echo 'Completed socket interaction for boot stage {}'; exit 0;" + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): + sync = ci_socket.SocketSync("first", "second", "third") + + # send all three syncs to the sockets + first = Sync("first", tmp_path) + second = Sync("second", tmp_path) + third = Sync("third", tmp_path) + + # "wait" on the first sync event + with sync("first"): + pass + + # check that the first sync returned + assert expected.format("first").encode() == first.receive() + # "wait" on the second sync event + with sync("second"): + pass + # check that the second sync returned + assert expected.format("second").encode() == second.receive() + # "wait" on the third sync event + with sync("third"): + pass + # check that the third sync returned + assert expected.format("third").encode() == third.receive() + + +def test_all_stages_threaded(tmp_path): + """Verify that arbitrary "start" order works""" + + # in milliseconds + max_sleep = 100 + # initialize random number generator + random.seed(time.time()) + expected = "echo 'Completed socket interaction for boot stage {}'; exit 0;" + sync_storage = {} + + def syncer(index: int, name: str): + """sleep for 0-100ms then send a sync notification + + this allows sync order to be arbitrary + """ + time.sleep(0.001 * random.randint(0, max_sleep)) + sync_storage[index] = Sync(name, tmp_path) + + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): + + sync = ci_socket.SocketSync( + "first", "second", "third", "fourth", "fifth" + ) + + for i, name in { + 1: "first", + 2: "second", + 3: "third", + 4: "fourth", + 5: "fifth", + }.items(): + t = Thread(target=syncer, args=(i, name)) + t.run() + + # wait on the first sync event + with sync("first"): + pass + + # check that the first sync returned + assert expected.format("first").encode() == sync_storage[1].receive() + + # wait on the second sync event + with sync("second"): + pass + + # check that the second sync returned + assert expected.format("second").encode() == sync_storage[2].receive() + + # wait on the third sync event + with sync("third"): + pass + + # check that the third sync returned + assert expected.format("third").encode() == sync_storage[3].receive() + with sync("fourth"): + pass + + # check that the fourth sync returned + assert expected.format("fourth").encode() == sync_storage[4].receive() + + with sync("fifth"): + pass + + # check that the fifth sync returned + assert expected.format("fifth").encode() == sync_storage[5].receive() + + +def test_all_stages_exception(tmp_path): + """Verify that exceptions log messages produce a valid warning message""" + with mock.patch.object( + ci_socket, "DEFAULT_RUN_DIR", tmp_path + ), mock.patch.object(ci_socket, "sd_notify"), mock.patch.object( + ci_socket.os, "isatty", return_value=False + ), mock.patch.object( + ci_socket.sys.stdin, "fileno" + ): + sync = ci_socket.SocketSync("first", "second", "third") + + # send all three syncs to the sockets + first = Sync("first", tmp_path) + + # "wait" on the first sync event + with sync("first"): + # verify that an exception in context doesn't raise + 1 / 0 # pylint: disable=W0104 + + assert ( + b"echo 'fatal error, run \"systemctl status cloud-init-main." + b'service" and "cloud-init status --long" for ' + b"more details'; exit 1;" == first.receive() + ) diff --git a/tests/unittests/test_cli.py b/tests/unittests/test_cli.py index 6ab6d496b16..a7c3b1ba38b 100644 --- a/tests/unittests/test_cli.py +++ b/tests/unittests/test_cli.py @@ -160,9 +160,7 @@ def test_no_arguments_shows_usage(self, capsys): def test_no_arguments_shows_error_message(self, capsys): exit_code = self._call_main() - missing_subcommand_message = ( - "the following arguments are required: subcommand" - ) + missing_subcommand_message = "a subcommand is required" _out, err = capsys.readouterr() assert ( missing_subcommand_message in err