From fae8941196354274b91081d1237bd293a5aaa8f3 Mon Sep 17 00:00:00 2001 From: Dan Nicholson Date: Fri, 30 Aug 2024 05:53:10 -0600 Subject: [PATCH 1/2] tests: Add mount propagation test This tests the current behavior of making /sysroot a private mount so that submounts on /var do not propagate back to /sysroot. It also shows how submounts of /sysroot do not propagate into separate mount namespaces for the same reason. --- .../kolainst/destructive/mount-propagation.sh | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100755 tests/kolainst/destructive/mount-propagation.sh diff --git a/tests/kolainst/destructive/mount-propagation.sh b/tests/kolainst/destructive/mount-propagation.sh new file mode 100755 index 0000000000..b7fc87e696 --- /dev/null +++ b/tests/kolainst/destructive/mount-propagation.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# https://bugzilla.redhat.com/show_bug.cgi?id=1498281 +set -xeuo pipefail + +. ${KOLA_EXT_DATA}/libinsttest.sh + +require_writable_sysroot + +get_mount() { + local target=${1:?No target specified} + local pid=${2:?No PID specified} + + # findmnt always looks at /proc/self/mountinfo, so we have to first enter the + # mount namespace of the desired process. + nsenter --target "${pid}" --mount -- \ + findmnt --json --list --output +PROPAGATION,OPT-FIELDS \ + | jq ".filesystems[] | select(.target == \"${target}\")" +} + +assert_has_mount() { + local target=${1:?No target specified} + local pid=${2:-$$} + local mount + + mount=$(get_mount "${target}" "${pid}") + if [ -n "${mount}" ]; then + echo -e "Process ${pid} has mount '${target}':\n${mount}" + else + cat "/proc/${pid}/mountinfo" >&2 + fatal "Mount '${target}' not found in process ${pid}" + fi +} + +assert_not_has_mount() { + local target=${1:?No target specified} + local pid=${2:-$$} + local mount + + mount=$(get_mount "${target}" "${pid}") + if [ -n "${mount}" ]; then + cat "/proc/${pid}/mountinfo" >&2 + fatal "Mount '${target}' found in process ${pid}" + else + echo "Process ${pid} does not have mount '${target}'" + fi +} + +test_mounts() { + local stateroot + + echo "Root namespace mountinfo:" + cat "/proc/$$/mountinfo" + + echo "Sub namespace mountinfo:" + cat "/proc/${ns_pid}/mountinfo" + + # Make sure the 2 PIDs are really in different mount namespaces. + root_ns=$(readlink "/proc/$$/ns/mnt") + sub_ns=$(readlink "/proc/${ns_pid}/ns/mnt") + assert_not_streq "${root_ns}" "${sub_ns}" + + stateroot=$(rpmostree_query_json '.deployments[0].osname') + + # Check the mounts exist in the root namespace and the /var/foo mount has not + # propagated back to /sysroot. + assert_has_mount /var/foo + assert_has_mount /sysroot/bar + assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo" + + # Repeat with the sub mount namespace. Since /sysroot is marked private, + # /sysroot/bar will not be propagated into it. + assert_has_mount /var/foo "${ns_pid}" + assert_not_has_mount /sysroot/bar "${ns_pid}" + assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo" "${ns_pid}" +} + +case "${AUTOPKGTEST_REBOOT_MARK:-}" in + "") + mkdir -p /var/foo /sysroot/bar + + # Create a process in a separate mount namespace to see if the mounts + # propagate into it correctly. + unshare -m --propagation unchanged -- sleep infinity & + ns_pid=$! + + mount -t tmpfs foo /var/foo + mount -t tmpfs bar /sysroot/bar + + test_mounts + + # Now setup for the same test but with the mounts made early via fstab. + cat >> /etc/fstab <<"EOF" +foo /var/foo tmpfs defaults 0 0 +bar /sysroot/bar tmpfs defaults 0 0 +EOF + + # We want to start a process in a separate namespace after ostree-remount + # has completed but before systemd starts the fstab generated mount units. + cat > /etc/systemd/system/test-mounts.service <<"EOF" +[Unit] +DefaultDependencies=no +After=ostree-remount.service +Before=var-foo.mount sysroot-bar.mount +RequiresMountsFor=/var /sysroot +Conflicts=shutdown.target +Before=shutdown.target + +[Service] +Type=exec +ExecStart=/usr/bin/sleep infinity +ProtectSystem=strict + +[Install] +WantedBy=local-fs.target +EOF + systemctl enable test-mounts.service + + /tmp/autopkgtest-reboot 2 + ;; + 2) + # Check that the test service is running and get its PID. + ns_state=$(systemctl show -P ActiveState test-mounts.service) + assert_streq "${ns_state}" active + ns_pid=$(systemctl show -P MainPID test-mounts.service) + + # Make sure that test-mounts.service started after ostree-remount.service + # but before /var/foo and /sysroot/bar were mounted so that we can see if + # the mounts were propagated into its mount namespace. + remount_finished=$(journalctl -o json -g Finished -u ostree-remount.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP) + test_starting=$(journalctl -o json -g Starting -u test-mounts.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP) + test_started=$(journalctl -o json -g Started -u test-mounts.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP) + foo_mounting=$(journalctl -o json -g Mounting -u var-foo.mount | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP) + bar_mounting=$(journalctl -o json -g Mounting -u sysroot-bar.mount | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP) + test "${remount_finished}" -lt "${test_starting}" + test "${test_started}" -lt "${foo_mounting}" + test "${test_started}" -lt "${bar_mounting}" + + test_mounts + ;; + *) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;; +esac From 2973ec591008be94d74d08807079d242b914dcd2 Mon Sep 17 00:00:00 2001 From: Dan Nicholson Date: Thu, 29 Aug 2024 18:19:30 -0600 Subject: [PATCH 2/2] switchroot: Stop making /sysroot mount private Back in 2b8d586c5, /sysroot was changed to be a private mount so that submounts of /var do not propagate back to the stateroot /var. That's laudible, but it makes /sysroot different than every other shared mount in the root namespace. In particular, it means that submounts of /sysroot do not propagate into separate mount namespaces. Rather than make /sysroot private, make /var a slave+shared mount so that it receives mount events from /sysroot but not vice versa. That achieves the same effect of preventing /var submount events from propagating back to /sysroot while allowing /sysroot mount events to propagate forward like every other system mount. See mount_namespaces(7)[1] and the linux shared subtrees[2] documentation for details on slave+shared mount propagation. When /var is mounted in the initramfs, this is accomplished with mount(2) syscalls. When /var is mounted after switching to the real root, the mount propagation flags are applied as options in the generated var.mount unit. This depends on a mount(8) feature that has been present since util-linux 2.23. That's available in RHEL 7 and every non-EOL Debian and Ubuntu release. Applying the propagation from var.mount fixes a small race, too. Previously, if a /var submount was added before /sysroot was made private, it would have propagated back into /sysroot. That was possible since ostree-remount.service orders itself after var.mount but not before any /var submounts. 1. https://man7.org/linux/man-pages/man7/mount_namespaces.7.html 2. https://docs.kernel.org/filesystems/sharedsubtree.html Fixes: #2086 --- src/libostree/ostree-impl-system-generator.c | 11 +++++++++- src/switchroot/ostree-prepare-root.c | 21 +++++++++---------- src/switchroot/ostree-remount.c | 9 -------- .../kolainst/destructive/mount-propagation.sh | 5 ++--- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/libostree/ostree-impl-system-generator.c b/src/libostree/ostree-impl-system-generator.c index c5fe503287..6968c73884 100644 --- a/src/libostree/ostree-impl-system-generator.c +++ b/src/libostree/ostree-impl-system-generator.c @@ -232,6 +232,15 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor * * Note that our unit doesn't run if systemd.volatile is enabled; * see https://github.com/ostreedev/ostree/pull/856 + * + * To avoid having submounts of /var propagate into $stateroot/var, the mount + * is made with slave+shared propagation. This means that /var will receive + * mount events from the parent /sysroot mount, but not vice versa. Adding a + * shared peer group below the slave group means that submounts of /var will + * inherit normal shared propagation. See mount_namespaces(7), Linux + * Documentation/filesystems/sharedsubtree.txt and + * https://github.com/ostreedev/ostree/issues/2086. This also happens in + * ostree-prepare-root.c for the INITRAMFS_MOUNT_VAR case. */ if (!g_output_stream_printf (outstream, &bytes_written, cancellable, error, "##\n# Automatically generated by ostree-system-generator\n##\n\n" @@ -243,7 +252,7 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor "[Mount]\n" "Where=%s\n" "What=%s\n" - "Options=bind\n", + "Options=bind,slave,shared\n", var_path, stateroot_var_path)) return FALSE; if (!g_output_stream_flush (outstream, cancellable, error)) diff --git a/src/switchroot/ostree-prepare-root.c b/src/switchroot/ostree-prepare-root.c index 70bf78f630..7754673eeb 100644 --- a/src/switchroot/ostree-prepare-root.c +++ b/src/switchroot/ostree-prepare-root.c @@ -644,6 +644,16 @@ main (int argc, char *argv[]) { if (mount ("../../var", TMP_SYSROOT "/var", NULL, MS_BIND | MS_SILENT, NULL) < 0) err (EXIT_FAILURE, "failed to bind mount ../../var to var"); + + /* To avoid having submounts of /var propagate into $stateroot/var, the + * mount is made with slave+shared propagation. See the comment in + * ostree-impl-system-generator.c when /var isn't mounted in the + * initramfs for further explanation. + */ + if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SLAVE | MS_SILENT, NULL) < 0) + err (EXIT_FAILURE, "failed to change /var to slave mount"); + if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SHARED | MS_SILENT, NULL) < 0) + err (EXIT_FAILURE, "failed to change /var to slave+shared mount"); } /* This can be used by other things to signal ostree is in use */ @@ -687,16 +697,5 @@ main (int argc, char *argv[]) err (EXIT_FAILURE, "failed to make /sysroot read-only"); } - /* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache - * also propagate to /sysroot/ostree/deploy/$stateroot/var/cache - * - * Now in reality, today this is overridden by systemd: the *actual* way we fix this up - * is in ostree-remount.c. But let's do it here to express the semantics we want - * at the very start (perhaps down the line systemd will have compile/runtime option - * to say that the initramfs environment did everything right from the start). - */ - if (mount ("none", "sysroot", NULL, MS_PRIVATE | MS_SILENT, NULL) < 0) - err (EXIT_FAILURE, "remounting 'sysroot' private"); - exit (EXIT_SUCCESS); } diff --git a/src/switchroot/ostree-remount.c b/src/switchroot/ostree-remount.c index 3babb75141..f0a4b3d925 100644 --- a/src/switchroot/ostree-remount.c +++ b/src/switchroot/ostree-remount.c @@ -167,15 +167,6 @@ main (int argc, char *argv[]) } g_autoptr (GVariantDict) ostree_run_metadata = g_variant_dict_new (ostree_run_metadata_v); - /* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache - * also propagate to /sysroot/ostree/deploy/$stateroot/var/cache - * - * Today systemd remounts / (recursively) as shared, so we're undoing that as early - * as possible. See also a copy of this in ostree-prepare-root.c. - */ - if (mount ("none", "/sysroot", NULL, MS_REC | MS_PRIVATE, NULL) < 0) - perror ("warning: While remounting /sysroot MS_PRIVATE"); - const char *transient_etc = NULL; g_variant_dict_lookup (ostree_run_metadata, OTCORE_RUN_BOOTED_KEY_TRANSIENT_ETC, "&s", &transient_etc); diff --git a/tests/kolainst/destructive/mount-propagation.sh b/tests/kolainst/destructive/mount-propagation.sh index b7fc87e696..981a04b768 100755 --- a/tests/kolainst/destructive/mount-propagation.sh +++ b/tests/kolainst/destructive/mount-propagation.sh @@ -67,10 +67,9 @@ test_mounts() { assert_has_mount /sysroot/bar assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo" - # Repeat with the sub mount namespace. Since /sysroot is marked private, - # /sysroot/bar will not be propagated into it. + # Repeat with the sub mount namespace. assert_has_mount /var/foo "${ns_pid}" - assert_not_has_mount /sysroot/bar "${ns_pid}" + assert_has_mount /sysroot/bar "${ns_pid}" assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo" "${ns_pid}" }