Skip to content

Commit b187cfc

Browse files
committed
Use a minimal initrd to switch to the full initrd stored in /usr
The growth of binaries over time and the inclusion of new features filled the available boot partition space, so that the kernel+initrd almost couldn't fit twice anymore as required for updates. We employed workarounds such as wrapper scripts for ignition, afterburn and other binaries so that they are loaded from /usr. However, this was still not enough and we would have to do the same for (network) kernel modules and firmware. To avoid making this ever more complex we can use a dedicated initrd focused on loading the full initrd from /usr and then this full initrd can use dracut as before and even drop all the workarounds we accumulated. Introduce a busybox init script that prepares a minimal environment, has debug toggles and an emergency shell, and only loads the real initrd from /usr to switch over to it. Because mdev is not a proper udev replacement, some additional scripting is needed. Busybox's modprobe can't work with dependencies well and we need the real kmod for that (which is also good to guarantee that we have the same modprobe options set). Also, some other busybox commands are often lacking things such as loading a kernel module automatically and this has to be done explicitly. We still set up dm-verity for /usr so that we have the same security properties (the code comes from the bootengine systemd generators we have and also covers the PXE boot with a squashfs /usr passed from an additional cpio). The real initrd then reuses the mount point for /usr, and loads any kernel modules and firmware that weren't loaded already. We also have to make the dependencies for parse-ip-for-networkd.service a bit more explicit because the removal of the /sysusr mount in the full initrd exposed a race condition. Signed-off-by: Kai Lueke <[email protected]>
1 parent 31ba296 commit b187cfc

File tree

5 files changed

+202
-1
lines changed

5 files changed

+202
-1
lines changed

dracut/03flatcar-network/parse-ip-for-networkd.service

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Description=Write systemd-networkd units from cmdline
33
DefaultDependencies=false
44

5-
After=afterburn-network-kargs.service
5+
After=afterburn-network-kargs.service dracut-cmdline.service
66
PartOf=systemd-networkd.service
77
Before=systemd-networkd.service initrd-switch-root.target
88
# Switching the root filesystem terminates all running services with binaries from the initramfs, we need to finish before that happens

dracut/10diskless-generator/diskless-generator

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
33
# ex: ts=8 sw=4 sts=4 et filetype=sh
44

5+
# NOTE: The /usr.squashfs mounting for /sysusr is done in /minimal-init
6+
# (making the mount unit here a no-op) but the /sysroot mounting is
7+
# and must still be done here, same for the rootfs RAM setup
8+
59
set -e
610

711
UNIT_DIR="${1:-/tmp}"

dracut/10usr-generator/usr-generator

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
# by systemd-fstab-generator. This module is only needed for old
1111
# bootloaders that pass usr=.
1212

13+
# NOTE: Now done in /minimal-init but since the "mount.usr" generator also runs,
14+
# it seems ok to also keep the "usr" generator even though the mount units are
15+
# a no-op
1316
set -e
1417

1518
UNIT_DIR="${1:-/tmp}"

dracut/10verity-generator/verity-generator

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
# This script generates a service that manages a dm-verity device for the chosen USR partition
66

7+
# NOTE: The verity setup is now done in /minimal-init and this logic is not used:
8+
exit 0
9+
# (We could also delete this file but once most users have a large /boot partition we can also
10+
# come back to a state where we have one initrd)
11+
712
set -e
813

914
UNIT_DIR="${1:-/tmp}"

minimal-init

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#!/bin/sh
# Early init of the minimal initrd: mounts the basic pseudo filesystems and
# makes the busybox applets callable by name for the rest of the script.
set -u
busybox mount -n -t proc proc /proc
busybox mount -n -t devtmpfs devtmpfs /dev
busybox mount -n -t sysfs sysfs /sys
# Install links for the busybox applets (mkdir, grep, ...) used below
busybox --install -s
# "mkdir -p" succeeds whether or not the directories already exist; the
# previous "[ ! -x dir ]" test checked searchability instead of existence.
mkdir -p /dev/pts /dev/shm
busybox mount -n -t devpts devpts /dev/pts -o gid=5,mode=620,ptmxmode=000
10+
11+
# Look up a kernel command line argument in /proc/cmdline.
# Arguments: $1 - argument name (the part before the first "=")
#            $2 - optional default printed when the argument is absent
# Outputs:   the text after the first "=" of the last matching argument;
#            for a bare flag without "=" the flag name itself; otherwise
#            the default (empty if none was given)
cmdline_arg() {
    local name="$1"
    local value="${2-}"
    local arg
    local restore_glob=
    # Word splitting of the cmdline is wanted, pathname expansion is not
    # (a token like "foo=*" must stay literal), so disable globbing while
    # iterating and restore the caller's setting afterwards.
    case "$-" in
        *f*) ;;
        *)
            restore_glob=1
            set -f
            ;;
    esac
    for arg in $(cat /proc/cmdline); do
        # Plain "[ = ]" instead of the bash-only "[[ == ]]" under #!/bin/sh
        if [ "${arg%%=*}" = "${name}" ]; then
            value="${arg#*=}"
        fi
    done
    if [ "${restore_glob}" != "" ]; then
        set +f
    fi
    # printf prints values such as "-n" verbatim, unlike echo
    printf '%s\n' "${value}"
}
21+
22+
# Error handler, installed as the ERR trap below.
# Prints a hint about the debug toggles, then waits up to 60 seconds for
# Enter. If Enter is pressed, an interactive busybox shell is started and
# /init is re-executed once that shell exits; otherwise the machine is
# rebooted. Either way this function does not return to the caller.
emergency() {
    echo "ERROR: The early initrd has failed. To activate debug shell breakpoints, boot with rd.earlyshell in the kernel cmdline, and to activate tracing, boot with rd.earlytrace" >&2
    # -s: don't echo the input, -t 60: give up (non-zero status) after 60 seconds
    if read -r -s -p "Press Enter for emergency shell or wait 60 seconds for reboot." -t 60; then
        echo >&2
        echo "Entering emergency mode. Exit the shell to retry /init (you might need to clean up mounts first) or reboot with 'reboot -f'." >&2
        # The exit status of the interactive shell is irrelevant
        busybox sh || true
        exec /init
    else
        echo >&2
        echo "INFO: Rebooting" >&2
        exec reboot -f
    fi
}
trap 'emergency' ERR
34+
35+
# Debug breakpoint: when rd.earlyshell is present on the kernel command
# line, announce the breakpoint (all arguments form its label) on stderr and
# run an interactive busybox shell; booting continues once that shell exits.
# Without rd.earlyshell this is a no-op.
debug_sh() {
    [ -n "$(cmdline_arg rd.earlyshell)" ] || return 0
    echo "INFO: Entering debug shell breakpoint ($*), exit to continue booting (reboot with 'reboot -f')">&2
    # A failing debug shell must not abort the boot
    busybox sh || true
}
42+
debug_sh 1/4: before mdev
# Trace all following commands when rd.earlytrace is on the kernel cmdline
if [ -n "$(cmdline_arg rd.earlytrace)" ]; then
    set -x
fi

# Start mdev in daemon mode, then let it scan for the existing devices
mdev -d
mdev -s
# Coldplug every modalias found in /sys through /sbin/modprobe (kmod) rather
# than busybox's modprobe, because busybox doesn't properly support the
# globs in modules.alias. Failures are ignored (best effort).
find /sys/ -name modalias -print0 \
    | xargs -0 sort -u \
    | tr '\n' '\0' \
    | xargs -0 /sbin/modprobe -abq \
    || true
# Needed for disk access but not loaded automatically:
modprobe sd_mod

debug_sh 2/4: before verity
56+
57+
# Resolve a device specification to a device path, waiting for the device
# to show up.
# Arguments: $1 - LABEL=..., UUID=..., PARTUUID=... or PARTLABEL=...;
#                 any other value (a plain path or an empty string) is
#                 printed back unchanged without waiting
# Outputs:   /dev/<name> of the first matching block device on stdout,
#            progress and error messages on stderr
# Returns:   1 if no matching device appeared within 5 minutes
find_drive() {
    local search="$1"
    local ueventline=
    local blkidmatch=
    local drive=
    local waitingmsg=
    local starttime=
    local now=
    case "${search}" in
        LABEL=*)
            # blkid prints the value enclosed in double quotes
            blkidmatch="LABEL=\"${search#LABEL=}\""
            ;;
        UUID=*)
            # Quoted as above; the search value is lowercased first
            blkidmatch="UUID=\"$(echo "${search#UUID=}" | tr "[:upper:]" "[:lower:]")\""
            ;;
        PARTUUID=*)
            ueventline="PARTUUID=$(echo "${search#PARTUUID=}" | tr "[:upper:]" "[:lower:]")"
            ;;
        PARTLABEL=*)
            # The sysfs uevent file calls the partition label PARTNAME
            ueventline="PARTNAME=${search#PARTLABEL=}"
            ;;
        *)
            echo "${search}"
            return
            ;;
    esac
    starttime=$(date +%s)
    while [ "${drive}" = "" ]; do
        now=$(date +%s)
        # Timeout of 5 minutes for finding the device
        # NOTE: Only mdev -d runs at this point and the kernel also can spawn modprobe to load modules.
        # First make sure that all required modules and their deps are actually in the initrd,
        # (but if that's not enough we might even have to trigger the find /sys ... xargs coldplugging
        # here again every now and then?)
        if [ $((now - starttime)) -gt 300 ]; then
            echo "ERROR: Timeout waiting for drive: ${ueventline}${blkidmatch}" >&2
            return 1 # Throw error
        fi
        # No "sleep 0.1", so this is rather busy polling
        if [ "${ueventline}" != "" ]; then
            # -x -F: the whole uevent line must equal KEY=value, so that
            # "PARTNAME=USR-A" neither matches "PARTNAME=USR-AB" nor treats
            # the value as a regex (assumes one KEY=value entry per line).
            # head keeps a single device when more than one file matches.
            drive="$({ grep -s -l -x -F -m 1 -r "${ueventline}" /sys/class/block/*/uevent || true; } | cut -d / -f 5 | head -n 1)"
        else
            # The leading space anchors the key, otherwise LABEL="x" would
            # also match PARTLABEL="x" and UUID="x" would match PARTUUID="x"
            drive="$(blkid | { grep -F -m 1 " ${blkidmatch}" || true ; } | cut -d : -f 1 | cut -d / -f 3-)"
        fi
        if [ "${drive}" = "" ] && [ "${waitingmsg}" = "" ]; then
            # Print the notice only once
            echo "Waiting for drive..." >&2
            waitingmsg=1
        fi
    done
    drive="/dev/${drive}"
    echo "${drive}"
}
114+
115+
# dm-verity setup for /usr, ported from the generators
verityusr=$(cmdline_arg verity.usr)
usrhash=$(cmdline_arg verity.usrhash)

verityusr=$(find_drive "${verityusr}")

# Verity is only opened when the source resolved to a path and a root hash
# was given on the kernel command line.
case "${verityusr}" in
    /*)
        if [ -n "${usrhash}" ]; then
            # Hardcoded expected value from the image GPT layout
            veritysetup --panic-on-corruption --hash-offset=1065345024 open "${verityusr}" usr "${verityusr}" "${usrhash}"
            # veritysetup reports a hash mismatch during table
            # initialization on stderr but still exits 0, so check the
            # target status manually and fail if it is invalid.
            status=$(dmsetup status usr | cut -d " " -f 4)
            if [ "${status}" != V ]; then
                echo "Verity setup failed" >&2
                false # Throw error
            fi
        fi
        ;;
esac
134+
135+
# Work out the /usr mount parameters. The "mount.usr*" arguments take
# precedence; the plain "usr*" arguments (or the built-in defaults "auto"
# and "ro") are passed as the fallback value.
# The inner substitutions are quoted so that a value is always handed over
# as exactly one argument, without word splitting or pathname expansion.
usr=$(cmdline_arg mount.usr "$(cmdline_arg usr)")
usrfstype=$(cmdline_arg mount.usrfstype "$(cmdline_arg usrfstype auto)")
usrflags=$(cmdline_arg mount.usrflags "$(cmdline_arg usrflags ro)")

usr=$(find_drive "${usr}")

if [ "${usr}" = "" ] && [ -f /usr.squashfs ]; then
    # No /usr device given but a squashfs image is present in the initrd
    usr=/usr.squashfs
    usrfstype=squashfs
elif [ "${usrfstype}" = btrfs ] || [ "${usrfstype}" = auto ]; then
    # Add "rescue=nologreplay" only when "ro" is among the mount flags;
    # a read-write mount keeps its flags untouched.
    case ",${usrflags}," in
        *,ro,*) usrflags="${usrflags},rescue=nologreplay" ;;
    esac
fi
# Only proceed if the source is a path.
case "${usr}" in
    /*) ;;
    *)
        echo "No mountable /usr partition given (usr='${usr}')" >&2
        false # Throw error
        ;;
esac
156+
157+
debug_sh 3/4: before /sysusr mount

echo "Mounting /usr from ${usr}" >&2
# mount -t auto only works if btrfs is already loaded
modprobe btrfs
mount -t "${usrfstype}" -o "${usrflags}" "${usr}" /sysusr/usr

# Busybox doesn't load this for us
modprobe loop
# Attach the full initrd image from /usr read-only to a free loop device
LOOP=$(losetup -f)
losetup -r "${LOOP}" /sysusr/usr/lib/flatcar/bootengine.img
# "mkdir -p" so that a retry of /init from the emergency shell doesn't fail
# just because the directories were already created by the earlier attempt
mkdir -p /underlay /work
mount -t tmpfs tmpfs /work
mkdir -p /work/realinit /work/work
mount -t squashfs "${LOOP}" /underlay
# Writable root for the real initrd: the read-only squashfs as lower layer,
# the tmpfs as upper layer
mount -t overlay -o rw,lowerdir=/underlay,upperdir=/work/realinit,workdir=/work/work overlay /realinit
mkdir -p /realinit/sysusr/usr
# Hand the already mounted /usr over to the real initrd
mount -o move /sysusr/usr /realinit/sysusr/usr
if [ "${usr}" = /usr.squashfs ]; then
    # /usr came from the squashfs in the initrd: make /oem and the image
    # itself visible in the real initrd as well
    mkdir -p /oem
    mkdir -p /realinit/oem
    mount -o bind /oem /realinit/oem
    # A bind mount of a file needs an existing file as its target
    touch /realinit/usr.squashfs
    mount -o bind /usr.squashfs /realinit/usr.squashfs
fi
182+
debug_sh 4/4: before switch_root to /realinit
# Stop mdev; a failure of killall is ignored
killall mdev || true
# Tear down the early pseudo filesystems again
for mountpoint in /proc /sys /dev/pts; do
    umount "${mountpoint}"
done
# /dev/console is held by the current process, hence the lazy unmount
umount -l /dev
# Replace this script with /init of the full initrd
exec switch_root /realinit /init

0 commit comments

Comments
 (0)