Skip to content

Commit

Permalink
Merge pull request #3292 from dbnicholson/var-slave-shared
Browse files Browse the repository at this point in the history
switchroot: Stop making /sysroot mount private
  • Loading branch information
cgwalters authored Sep 6, 2024
2 parents fbb1cc7 + 2973ec5 commit 413b0ad
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 21 deletions.
11 changes: 10 additions & 1 deletion src/libostree/ostree-impl-system-generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,15 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor
*
* Note that our unit doesn't run if systemd.volatile is enabled;
* see https://github.com/ostreedev/ostree/pull/856
*
* To avoid having submounts of /var propagate into $stateroot/var, the mount
* is made with slave+shared propagation. This means that /var will receive
* mount events from the parent /sysroot mount, but not vice versa. Adding a
* shared peer group below the slave group means that submounts of /var will
* inherit normal shared propagation. See mount_namespaces(7), Linux
* Documentation/filesystems/sharedsubtree.txt and
* https://github.com/ostreedev/ostree/issues/2086. This also happens in
* ostree-prepare-root.c for the INITRAMFS_MOUNT_VAR case.
*/
if (!g_output_stream_printf (outstream, &bytes_written, cancellable, error,
"##\n# Automatically generated by ostree-system-generator\n##\n\n"
Expand All @@ -243,7 +252,7 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor
"[Mount]\n"
"Where=%s\n"
"What=%s\n"
"Options=bind\n",
"Options=bind,slave,shared\n",
var_path, stateroot_var_path))
return FALSE;
if (!g_output_stream_flush (outstream, cancellable, error))
Expand Down
21 changes: 10 additions & 11 deletions src/switchroot/ostree-prepare-root.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,16 @@ main (int argc, char *argv[])
{
if (mount ("../../var", TMP_SYSROOT "/var", NULL, MS_BIND | MS_SILENT, NULL) < 0)
err (EXIT_FAILURE, "failed to bind mount ../../var to var");

/* To avoid having submounts of /var propagate into $stateroot/var, the
* mount is made with slave+shared propagation. See the comment in
* ostree-impl-system-generator.c when /var isn't mounted in the
* initramfs for further explanation.
*/
if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SLAVE | MS_SILENT, NULL) < 0)
err (EXIT_FAILURE, "failed to change /var to slave mount");
if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SHARED | MS_SILENT, NULL) < 0)
err (EXIT_FAILURE, "failed to change /var to slave+shared mount");
}

/* This can be used by other things to signal ostree is in use */
Expand Down Expand Up @@ -687,16 +697,5 @@ main (int argc, char *argv[])
err (EXIT_FAILURE, "failed to make /sysroot read-only");
}

/* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache
* also propagate to /sysroot/ostree/deploy/$stateroot/var/cache
*
* Now in reality, today this is overridden by systemd: the *actual* way we fix this up
* is in ostree-remount.c. But let's do it here to express the semantics we want
* at the very start (perhaps down the line systemd will have compile/runtime option
* to say that the initramfs environment did everything right from the start).
*/
if (mount ("none", "sysroot", NULL, MS_PRIVATE | MS_SILENT, NULL) < 0)
err (EXIT_FAILURE, "remounting 'sysroot' private");

exit (EXIT_SUCCESS);
}
9 changes: 0 additions & 9 deletions src/switchroot/ostree-remount.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,6 @@ main (int argc, char *argv[])
}
g_autoptr (GVariantDict) ostree_run_metadata = g_variant_dict_new (ostree_run_metadata_v);

/* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache
* also propagate to /sysroot/ostree/deploy/$stateroot/var/cache
*
* Today systemd remounts / (recursively) as shared, so we're undoing that as early
* as possible. See also a copy of this in ostree-prepare-root.c.
*/
if (mount ("none", "/sysroot", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
perror ("warning: While remounting /sysroot MS_PRIVATE");

const char *transient_etc = NULL;
g_variant_dict_lookup (ostree_run_metadata, OTCORE_RUN_BOOTED_KEY_TRANSIENT_ETC, "&s",
&transient_etc);
Expand Down
140 changes: 140 additions & 0 deletions tests/kolainst/destructive/mount-propagation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/bin/bash
# https://bugzilla.redhat.com/show_bug.cgi?id=1498281
set -xeuo pipefail

. ${KOLA_EXT_DATA}/libinsttest.sh

require_writable_sysroot

get_mount() {
local target=${1:?No target specified}
local pid=${2:?No PID specified}

# findmnt always looks at /proc/self/mountinfo, so we have to first enter the
# mount namespace of the desired process.
nsenter --target "${pid}" --mount -- \
findmnt --json --list --output +PROPAGATION,OPT-FIELDS \
| jq ".filesystems[] | select(.target == \"${target}\")"
}

assert_has_mount() {
local target=${1:?No target specified}
local pid=${2:-$$}
local mount

mount=$(get_mount "${target}" "${pid}")
if [ -n "${mount}" ]; then
echo -e "Process ${pid} has mount '${target}':\n${mount}"
else
cat "/proc/${pid}/mountinfo" >&2
fatal "Mount '${target}' not found in process ${pid}"
fi
}

assert_not_has_mount() {
local target=${1:?No target specified}
local pid=${2:-$$}
local mount

mount=$(get_mount "${target}" "${pid}")
if [ -n "${mount}" ]; then
cat "/proc/${pid}/mountinfo" >&2
fatal "Mount '${target}' found in process ${pid}"
else
echo "Process ${pid} does not have mount '${target}'"
fi
}

test_mounts() {
local stateroot

echo "Root namespace mountinfo:"
cat "/proc/$$/mountinfo"

echo "Sub namespace mountinfo:"
cat "/proc/${ns_pid}/mountinfo"

# Make sure the 2 PIDs are really in different mount namespaces.
root_ns=$(readlink "/proc/$$/ns/mnt")
sub_ns=$(readlink "/proc/${ns_pid}/ns/mnt")
assert_not_streq "${root_ns}" "${sub_ns}"

stateroot=$(rpmostree_query_json '.deployments[0].osname')

# Check the mounts exist in the root namespace and the /var/foo mount has not
# propagated back to /sysroot.
assert_has_mount /var/foo
assert_has_mount /sysroot/bar
assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo"

# Repeat with the sub mount namespace.
assert_has_mount /var/foo "${ns_pid}"
assert_has_mount /sysroot/bar "${ns_pid}"
assert_not_has_mount "/sysroot/ostree/deploy/${stateroot}/var/foo" "${ns_pid}"
}

case "${AUTOPKGTEST_REBOOT_MARK:-}" in
"")
mkdir -p /var/foo /sysroot/bar

# Create a process in a separate mount namespace to see if the mounts
# propagate into it correctly.
unshare -m --propagation unchanged -- sleep infinity &
ns_pid=$!

mount -t tmpfs foo /var/foo
mount -t tmpfs bar /sysroot/bar

test_mounts

# Now setup for the same test but with the mounts made early via fstab.
cat >> /etc/fstab <<"EOF"
foo /var/foo tmpfs defaults 0 0
bar /sysroot/bar tmpfs defaults 0 0
EOF

# We want to start a process in a separate namespace after ostree-remount
# has completed but before systemd starts the fstab generated mount units.
cat > /etc/systemd/system/test-mounts.service <<"EOF"
[Unit]
DefaultDependencies=no
After=ostree-remount.service
Before=var-foo.mount sysroot-bar.mount
RequiresMountsFor=/var /sysroot
Conflicts=shutdown.target
Before=shutdown.target
[Service]
Type=exec
ExecStart=/usr/bin/sleep infinity
ProtectSystem=strict
[Install]
WantedBy=local-fs.target
EOF
systemctl enable test-mounts.service

/tmp/autopkgtest-reboot 2
;;
2)
# Check that the test service is running and get its PID.
ns_state=$(systemctl show -P ActiveState test-mounts.service)
assert_streq "${ns_state}" active
ns_pid=$(systemctl show -P MainPID test-mounts.service)

# Make sure that test-mounts.service started after ostree-remount.service
# but before /var/foo and /sysroot/bar were mounted so that we can see if
# the mounts were propagated into its mount namespace.
remount_finished=$(journalctl -o json -g Finished -u ostree-remount.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP)
test_starting=$(journalctl -o json -g Starting -u test-mounts.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP)
test_started=$(journalctl -o json -g Started -u test-mounts.service | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP)
foo_mounting=$(journalctl -o json -g Mounting -u var-foo.mount | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP)
bar_mounting=$(journalctl -o json -g Mounting -u sysroot-bar.mount | tail -n1 | jq -r .__MONOTONIC_TIMESTAMP)
test "${remount_finished}" -lt "${test_starting}"
test "${test_started}" -lt "${foo_mounting}"
test "${test_started}" -lt "${bar_mounting}"

test_mounts
;;
*) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;;
esac

0 comments on commit 413b0ad

Please sign in to comment.