From c844f7a78eac41165abe723d26ab9b77d0a6e0ad Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Fri, 3 Mar 2023 11:49:06 -0500 Subject: [PATCH] daemon: Make switchKernel less stateful This is prep for fixing RHEL9 upgrades while maintaining `kernel-rt`. Previously the `switchKernel` logic tried to carefully handle all 4 cases (default -> default, default -> rt, rt -> default, rt -> rt). But, the last one (rt -> rt) was not quite right because the previous `rpm-ostree rebase` command already preserved the previous kernel. In fact it was pretty expensive to do things this way because we'd e.g. regenerate the initramfs *twice*. To say this another way: when doing a RHEL9 update, it's actually the first `rpm-ostree rebase` command which fails before we even get to `switchKernel`. And the reason is due to the introduction of a new `-core` subpackage; xref https://issues.redhat.com/browse/OCPBUGS-8113 So here's the new logic to handle this: - Before we do the `rebase` operation to the new OS, we detect any previous overrides of any packages starting with `kernel-rt` and we remove them. Notably this avoids hardcoding any specific kernel subpackages; we just remove *everything* starting with `kernel-rt` which should be more robust to subpackage changes in the future. - Consequently the `rebase` operation will hence start out by deploying the stock image i.e. with throughput kernel (though note we *are* carefully preserving other local overrides) - The `switchKernel` function now longer needs to take the *previous* machineconfig state into account (except for logging). Instead, we just detect if the target is RT, and if so we then we apply the latest packages. This significantly simplifies the logic in `switchKernel`, and will help fix RHEL9 upgrades. (cherry picked from commit 8ac5beeddc331dd0186844d4714500d17454621d) --- pkg/daemon/update.go | 94 ++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index f41efe4a0c..3a6c585186 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -1161,8 +1161,9 @@ func (dn *CoreOSDaemon) switchKernel(oldConfig, newConfig *mcfgv1.MachineConfig) oldKtype := canonicalizeKernelType(oldConfig.Spec.KernelType) newKtype := canonicalizeKernelType(newConfig.Spec.KernelType) - // Do nothing if both old and new KernelType are of type default - if oldKtype == ctrlcommon.KernelTypeDefault && newKtype == ctrlcommon.KernelTypeDefault { + // In the OS update path, we removed overrides for kernel-rt. So if the target (new) config + // is also default (i.e. throughput) then we have nothing to do. + if newKtype == ctrlcommon.KernelTypeDefault { return nil } @@ -1173,21 +1174,13 @@ func (dn *CoreOSDaemon) switchKernel(oldConfig, newConfig *mcfgv1.MachineConfig) // kernel-rt also has a split off kernel-rt-kvm subpackage because it's in a separate subscription in RHEL. realtimeKernel := []string{"kernel-rt-core", "kernel-rt-modules", "kernel-rt-modules-extra", "kernel-rt-kvm"} - dn.logSystem("Initiating switch from kernel %s to %s", oldKtype, newKtype) - - switchingToThroughput := oldKtype == ctrlcommon.KernelTypeRealtime && newKtype == ctrlcommon.KernelTypeDefault - if switchingToThroughput { - args := []string{"override", "reset"} - args = append(args, defaultKernel...) - for _, pkg := range realtimeKernel { - args = append(args, "--uninstall", pkg) - } - dn.logSystem("Switching to kernelType=%s, invoking rpm-ostree %+q", newConfig.Spec.KernelType, args) - return runRpmOstree(args...) + if oldKtype != newKtype { + dn.logSystem("Initiating switch to kernel %s", newKtype) + } else { + dn.logSystem("Re-applying kernel type %s", newKtype) } - switchingToRealtime := oldKtype == ctrlcommon.KernelTypeDefault && newKtype == ctrlcommon.KernelTypeRealtime - if switchingToRealtime { + if newKtype == ctrlcommon.KernelTypeRealtime { // Switch to RT kernel args := []string{"override", "remove"} args = append(args, defaultKernel...) @@ -1195,19 +1188,9 @@ func (dn *CoreOSDaemon) switchKernel(oldConfig, newConfig *mcfgv1.MachineConfig) args = append(args, "--install", pkg) } - dn.logSystem("Switching to kernelType=%s, invoking rpm-ostree %+q", newConfig.Spec.KernelType, args) return runRpmOstree(args...) } - - if oldKtype == ctrlcommon.KernelTypeRealtime && newKtype == ctrlcommon.KernelTypeRealtime { - if oldConfig.Spec.OSImageURL != newConfig.Spec.OSImageURL { - args := []string{"update"} - dn.logSystem("Updating rt-kernel packages on host: %+q", args) - return runRpmOstree(args...) - } - } - - return nil + return fmt.Errorf("Unhandled kernel type %s", newKtype) } // updateFiles writes files specified by the nodeconfig to disk. it also writes @@ -1772,6 +1755,56 @@ func (dn *Daemon) InplaceUpdateViaNewContainer(target string) error { return nil } +// queueRevertRTKernel undoes the layering of the RT kernel +func (dn *Daemon) queueRevertRTKernel() error { + booted, _, err := dn.NodeUpdaterClient.GetBootedAndStagedDeployment() + if err != nil { + return err + } + + // Before we attempt to do an OS update, we must remove the kernel-rt switch + // because in the case of updating from RHEL8 to RHEL9, the kernel packages are + // OS version dependent. See also https://github.com/coreos/rpm-ostree/issues/2542 + // (Now really what we want to do here is something more like rpm-ostree override reset --kernel + // i.e. the inverse of https://github.com/coreos/rpm-ostree/pull/4322 so that + // we're again not hardcoding even the prefix of kernel packages) + kernelOverrides := []string{} + kernelRtLayers := []string{} + for _, removal := range booted.RequestedBaseRemovals { + if removal == "kernel" || strings.HasPrefix(removal, "kernel-") { + kernelOverrides = append(kernelOverrides, removal) + } + } + for _, pkg := range booted.RequestedPackages { + if strings.HasPrefix(pkg, "kernel-rt-") { + kernelRtLayers = append(kernelRtLayers, pkg) + } + } + // We *only* do this switch if the node has done a switch from kernel -> kernel-rt. + // We don't want to override any machine-local hotfixes for the kernel package. + // Implicitly in this we don't really support machine-local hotfixes for kernel-rt. + // The only sane way to handle that is declarative drop-ins, but really we want to + // just go to deploying pre-built images and not doing per-node mutation with rpm-ostree + // at all. + if len(kernelOverrides) > 0 && len(kernelRtLayers) > 0 { + args := []string{"override", "reset"} + args = append(args, kernelOverrides...) + for _, pkg := range kernelRtLayers { + args = append(args, "--uninstall", pkg) + } + err := runRpmOstree(args...) + if err != nil { + return err + } + } else if len(kernelOverrides) > 0 || len(kernelRtLayers) > 0 { + glog.Infof("notice: detected %d overrides and %d kernel-rt layers", len(kernelOverrides), len(kernelRtLayers)) + } else { + glog.Infof("No kernel overrides or replacement detected") + } + + return nil +} + // updateLayeredOS updates the system OS to the one specified in newConfig func (dn *Daemon) updateLayeredOS(config *mcfgv1.MachineConfig) error { newURL := config.Spec.OSImageURL @@ -2024,6 +2057,15 @@ func (dn *CoreOSDaemon) applyLayeredOSChanges(mcDiff machineConfigDiff, oldConfi defer os.Remove(extensionsRepo) } + // If we have an OS update *or* a kernel type change, then we must undo the RT kernel + // enablement. + if mcDiff.osUpdate || mcDiff.kernelType { + if err := dn.queueRevertRTKernel(); err != nil { + mcdPivotErr.Inc() + return err + } + } + // Update OS if mcDiff.osUpdate { if err := dn.updateLayeredOS(newConfig); err != nil {