From c52c3cc5941457dad559555de6e7d1b5882450e4 Mon Sep 17 00:00:00 2001 From: William Zhao Date: Tue, 15 Aug 2023 16:56:28 -0400 Subject: [PATCH] Error out when Kernel Parameters cannot be set We need to make sure that the kernel parameters are set, thus we should pass the errors up to the config daemon. Signed-off-by: William Zhao --- pkg/plugins/generic/generic_plugin.go | 47 +++++++++++++++++---------- pkg/utils/utils.go | 16 ++++----- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index f8f0e9fb0..b1ececc8f 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -71,7 +71,7 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt p.DesireState = new needDrain = needDrainNode(new.Spec.Interfaces, new.Status.Interfaces) - needReboot = p.needRebootNode(new) + needReboot, err = p.needRebootNode(new) if needReboot { needDrain = true @@ -202,30 +202,34 @@ func (p *GenericPlugin) AddToDesiredKernelParams(kparam string) { } } -// SetAllDesiredKernelParams Should be called to set all the kernel parameters. Returns true if reboot of the node is needed. -func (p *GenericPlugin) SetAllDesiredKernelParams() bool { - needReboot := false +// SetAllDesiredKernelParams Should be called to set all the kernel parameters. Updates needReboot if reboot is needed. +func (p *GenericPlugin) SetAllDesiredKernelParams(needReboot *bool) error { for kparam, attempts := range p.DesiredKernelParams { - if !utils.IsKernelCmdLineParamSet(kparam, false) { - if attempts > 0 && attempts <= 4 { - glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s after reboot with attempts %d", kparam, attempts) - } else if attempts > 4 { - // If we tried several times and was unsuccessful, we should give up. - continue + set, err := utils.IsKernelCmdLineParamSet(kparam, false) + if err != nil { + return err + } + if !set { + if attempts > 0 { + glog.Errorf("generic-plugin SetAllDesiredKernelParams(): failed to set kernel param %s with attempts %d", kparam, attempts) } + // There is a case when we try to set the kernel parameter here, the daemon could decide to not reboot because + // the daemon encountered a potentially one-time error. However we always want to make sure that the kernel + // parameter is set once the daemon goes through node state sync again. update, err := trySetKernelParam(kparam) if err != nil { - glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s: %v", kparam, err) + glog.Errorf("generic-plugin SetAllDesiredKernelParams(): fail to set kernel param %s: %v", kparam, err) + return err } if update { - glog.V(2).Infof("generic-plugin SetAllDesiredKernelParams(): Need reboot for setting kernel param %s", kparam) + glog.V(2).Infof("generic-plugin SetAllDesiredKernelParams(): need reboot for setting kernel param %s", kparam) } // Update the number of attempts we tried to set the kernel parameter. p.DesiredKernelParams[kparam]++ - needReboot = needReboot || update + *needReboot = *needReboot || update } } - return needReboot + return nil } func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.InterfaceExts) (needDrain bool) { @@ -257,8 +261,8 @@ func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.Int return } -func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool) { - needReboot = false +func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + needReboot := false if p.LoadVfioDriver != loaded { if needVfioDriver(state) { p.LoadVfioDriver = loading @@ -273,13 +277,20 @@ func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeSta } } + err := p.SetAllDesiredKernelParams(&needReboot) + if err != nil { + glog.Errorf("generic-plugin needRebootNode(): failed to set the desired kernel parameters") + return false, err + } + update, err := utils.WriteSwitchdevConfFile(state) if err != nil { glog.Errorf("generic-plugin needRebootNode(): fail to write switchdev device config file") + return false, err } if update { glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for updating switchdev device configuration") } - needReboot = needReboot || update || p.SetAllDesiredKernelParams() - return + needReboot = needReboot || update + return needReboot, nil } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 45e6381b3..d0c7b4bc3 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -32,7 +32,7 @@ const ( sysBusPciDrivers = "/sys/bus/pci/drivers" sysBusPciDriversProbe = "/sys/bus/pci/drivers_probe" sysClassNet = "/sys/class/net" - sysKernelCmdLine = "/proc/cmdline" + procKernelCmdLine = "/proc/cmdline" netClass = 0x02 numVfsFile = "sriov_numvfs" @@ -57,24 +57,20 @@ func init() { } // IsKernelCmdLineParamSet This checks if the kernel cmd line is set properly. -func IsKernelCmdLineParamSet(param string, chroot bool) bool { - path := sysKernelCmdLine +func IsKernelCmdLineParamSet(param string, chroot bool) (bool, error) { + path := procKernelCmdLine if !chroot { path = "/host" + path } cmdLine, err := os.ReadFile(path) if err != nil { - glog.Warningf("IsKernelCmdLineParamSet(): Error reading %s, Please check the Kernel Params manually!", sysKernelCmdLine) - // Although intuitively we should report false here. However if we cannot reliably read the Kernel's cmd line params - // then it is unknown whether it was successfully set, and to avoid unnecessary reboots we should just post a warning - // and return true. - return true + return false, fmt.Errorf("IsKernelCmdLineParamSet(): Error reading %s: %v", procKernelCmdLine, err) } if strings.Contains(string(cmdLine), param) { - return true + return true, nil } - return false + return false, nil } func DiscoverSriovDevices(withUnsupported bool) ([]sriovnetworkv1.InterfaceExt, error) {