From 1e08d75643e33f6c2a37261a13aa993ffbd3589b Mon Sep 17 00:00:00 2001
From: Ivan Kolodiazhnyi
Date: Wed, 18 Oct 2023 17:54:55 +0300
Subject: [PATCH] Parallel NICs configuration

Add a --parallel-nic-config flag to the config daemon. The operator sets it
from the "parallelNicConfig" feature gate of SriovOperatorConfig. When it is
enabled the generic plugin configures every NIC that needs an update in its
own goroutine instead of one after another. The systemd service path keeps
the sequential mode for now.

Signed-off-by: Ivan Kolodiazhnyi
---
 bindata/manifests/daemon/daemonset.yaml       |  5 +-
 cmd/sriov-network-config-daemon/service.go    |  4 +-
 cmd/sriov-network-config-daemon/start.go      |  9 ++-
 controllers/sriovoperatorconfig_controller.go |  5 ++
 pkg/daemon/daemon.go                          |  6 +-
 pkg/daemon/daemon_test.go                     |  1 +
 pkg/daemon/plugin.go                          |  4 +-
 pkg/plugins/generic/generic_plugin.go         |  6 +-
 pkg/plugins/generic/generic_plugin_test.go    |  2 +-
 pkg/utils/utils.go                            | 88 ++++++++++++++++++-
 10 files changed, 116 insertions(+), 14 deletions(-)

diff --git a/bindata/manifests/daemon/daemonset.yaml b/bindata/manifests/daemon/daemonset.yaml
index 6e1b3ba8d..651f3344d 100644
--- a/bindata/manifests/daemon/daemonset.yaml
+++ b/bindata/manifests/daemon/daemonset.yaml
@@ -94,9 +94,12 @@ spec:
           privileged: true
         args:
           - "start"
-          {{- if .UsedSystemdMode}}
+          {{- if .UsedSystemdMode }}
           - --use-systemd-service
           {{- end }}
+          {{- if .ParallelNicConfig }}
+          - --parallel-nic-config
+          {{- end }}
         env:
           - name: NODE_NAME
             valueFrom:
diff --git a/cmd/sriov-network-config-daemon/service.go b/cmd/sriov-network-config-daemon/service.go
index fc9cfc6d9..0a79bcc76 100644
--- a/cmd/sriov-network-config-daemon/service.go
+++ b/cmd/sriov-network-config-daemon/service.go
@@ -120,9 +120,9 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
 			setupLog.Error(err, "failed to discover sriov devices on the host")
 			return fmt.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
 		}
-
+		// TODO(e0ne): read ParallelNicConfig from SriovOperatorConfig CR
 		// Create the generic plugin
-		configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager)
+		configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager, false)
 		if err != nil {
 			setupLog.Error(err, "failed to create generic plugin")
 			return fmt.Errorf("sriov-config-service failed to create generic plugin %v", err)
diff --git a/cmd/sriov-network-config-daemon/start.go b/cmd/sriov-network-config-daemon/start.go
index a7feffbde..db30500e1 100644
--- a/cmd/sriov-network-config-daemon/start.go
+++ b/cmd/sriov-network-config-daemon/start.go
@@ -52,9 +52,10 @@ var (
 	}
 
 	startOpts struct {
-		kubeconfig string
-		nodeName   string
-		systemd    bool
+		kubeconfig        string
+		nodeName          string
+		systemd           bool
+		parallelNicConfig bool
 	}
 )
 
@@ -63,6 +64,7 @@ func init() {
 	startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
 	startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
 	startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
+	startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "NICs configuration in a parallel on the same node")
 }
 
 func runStartCmd(cmd *cobra.Command, args []string) error {
@@ -216,6 +218,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
 		startOpts.systemd,
 		eventRecorder,
 		devMode,
+		startOpts.parallelNicConfig,
 	).Run(stopCh, exitCh)
 	if err != nil {
 		setupLog.Error(err, "failed to run daemon")
diff --git a/controllers/sriovoperatorconfig_controller.go b/controllers/sriovoperatorconfig_controller.go
index 7e9c548fb..9d6850a80 100644
--- a/controllers/sriovoperatorconfig_controller.go
+++ b/controllers/sriovoperatorconfig_controller.go
@@ -196,6 +196,11 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
 	} else {
 		data.Data["UsedSystemdMode"] = false
 	}
+	if parallelConfig, ok := dc.Spec.FeatureGates["parallelNicConfig"]; ok {
+		data.Data["ParallelNicConfig"] = parallelConfig
+	} else {
+		data.Data["ParallelNicConfig"] = false
+	}
 
 	envCniBinPath := os.Getenv("SRIOV_CNI_BIN_PATH")
 	if envCniBinPath == "" {
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 3ec571924..26843faf8 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -70,6 +70,8 @@ type Daemon struct {
 
 	useSystemdService bool
 
+	parallelNicConfig bool
+
 	devMode bool
 
 	client snclientset.Interface
@@ -154,11 +156,13 @@ func New(
 	useSystemdService bool,
 	er *EventRecorder,
 	devMode bool,
+	parallelNicConfig bool,
 ) *Daemon {
 	return &Daemon{
 		name:              nodeName,
 		platform:          platformType,
 		useSystemdService: useSystemdService,
+		parallelNicConfig: parallelNicConfig,
 		devMode:           devMode,
 		client:            client,
 		kubeClient:        kubeClient,
@@ -551,7 +555,7 @@ func (dn *Daemon) nodeStateSyncHandler() error {
 
 	// load plugins if it has not loaded
 	if len(dn.enabledPlugins) == 0 {
-		dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager)
+		dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager, dn.parallelNicConfig)
 		if err != nil {
 			log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
 			return err
diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go
index 5300a1a65..f12f83417 100644
--- a/pkg/daemon/daemon_test.go
+++ b/pkg/daemon/daemon_test.go
@@ -118,6 +118,7 @@ var _ = Describe("Config Daemon", func() {
 			false,
 			er,
 			false,
+			false,
 		)
 
 		sut.enabledPlugins = map[string]plugin.VendorPlugin{generic.PluginName: &fake.FakePlugin{}}
diff --git a/pkg/daemon/plugin.go b/pkg/daemon/plugin.go
index 09c69271c..5c439e217 100644
--- a/pkg/daemon/plugin.go
+++ b/pkg/daemon/plugin.go
@@ -29,7 +29,7 @@ var (
 	K8sPlugin     = k8splugin.NewK8sPlugin
 )
 
-func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (map[string]plugin.VendorPlugin, error) {
+func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (map[string]plugin.VendorPlugin, error) {
 	log.Log.Info("enableVendorPlugins(): enabling plugins")
 	enabledPlugins := map[string]plugin.VendorPlugin{}
 
@@ -55,7 +55,7 @@ func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *srio
 			}
 			enabledPlugins[k8sPlugin.Name()] = k8sPlugin
 		}
-		genericPlugin, err := GenericPlugin(false, hostManager, storeManager)
+		genericPlugin, err := GenericPlugin(false, hostManager, storeManager, parallelNicConfig)
 		if err != nil {
 			log.Log.Error(err, "enableVendorPlugins(): failed to load the generic plugin")
 			return nil, err
diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go
index 44069407e..80a623f30 100644
--- a/pkg/plugins/generic/generic_plugin.go
+++ b/pkg/plugins/generic/generic_plugin.go
@@ -55,6 +55,7 @@ type GenericPlugin struct {
 	DriverStateMap    DriverStateMapType
 	DesiredKernelArgs map[string]bool
 	RunningOnHost     bool
+	ParallelNicConfig bool
 	HostManager       host.HostManagerInterface
 	StoreManager      utils.StoreManagerInterface
 }
@@ -62,7 +63,7 @@ type GenericPlugin struct {
 const scriptsPath = "bindata/scripts/enable-kargs.sh"
 
 // Initialize our plugin and set up initial values
-func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (plugin.VendorPlugin, error) {
+func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (plugin.VendorPlugin, error) {
 	driverStateMap := make(map[uint]*DriverState)
 	driverStateMap[Vfio] = &DriverState{
 		DriverName:     vfioPciDriver,
@@ -91,6 +92,7 @@ func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface,
 		DriverStateMap:    driverStateMap,
 		DesiredKernelArgs: make(map[string]bool),
 		RunningOnHost:     runningOnHost,
+		ParallelNicConfig: parallelNicConfig,
 		HostManager:       hostManager,
 		StoreManager:      storeManager,
 	}, nil
@@ -173,7 +175,7 @@ func (p *GenericPlugin) Apply() error {
 		defer exit()
 	}
 
-	if err := utils.SyncNodeState(p.DesireState, pfsToSkip); err != nil {
+	if err := utils.SyncNodeState(p.DesireState, pfsToSkip, p.ParallelNicConfig); err != nil {
 		// Catch the "cannot allocate memory" error and try to use PCI realloc
 		if errors.Is(err, syscall.ENOMEM) {
 			p.addToDesiredKernelArgs(utils.KernelArgPciRealloc)
diff --git a/pkg/plugins/generic/generic_plugin_test.go b/pkg/plugins/generic/generic_plugin_test.go
index e1211b392..2e3df38df 100644
--- a/pkg/plugins/generic/generic_plugin_test.go
+++ b/pkg/plugins/generic/generic_plugin_test.go
@@ -33,7 +33,7 @@ var _ = Describe("Generic plugin", func() {
 		ctrl = gomock.NewController(t)
 		mockHost = mock_host.NewMockHostManagerInterface(ctrl)
 		mockStore = mock_utils.NewMockStoreManagerInterface(ctrl)
-		genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore)
+		genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore, false)
 		Expect(err).ToNot(HaveOccurred())
 	})
 
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
index 051a4aebb..6495b6918 100644
--- a/pkg/utils/utils.go
+++ b/pkg/utils/utils.go
@@ -13,6 +13,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"
 	"time"
 
@@ -200,8 +201,89 @@ func DiscoverSriovDevices(withUnsupported bool, storeManager StoreManagerInterfa
 }
 
 // SyncNodeState Attempt to update the node state to match the desired state
-func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool) error {
-	return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
+func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool, parallelNicConfig bool) error {
+	if !parallelNicConfig {
+		return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
+	}
+	return ConfigSriovInterfacesInParallel(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
+}
+
+// ConfigSriovInterfacesInParallel brings the interfaces in line with the spec,
+// running configSriovDevice for every interface that needs an update in its own
+// goroutine and waiting for all of them before returning. Interfaces that have
+// VFs but are absent from the spec are reset on the calling goroutine. Only the
+// most recently recorded error is returned.
+func ConfigSriovInterfacesInParallel(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
+	log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): start sriov configuration")
+	if IsKernelLockdownMode(true) && hasMellanoxInterfacesInSpec(ifaceStatuses, interfaces) {
+		log.Log.Error(nil, "cannot use mellanox devices when in kernel lockdown mode")
+		return fmt.Errorf("cannot use mellanox devices when in kernel lockdown mode")
+	}
+	// TODO(e0ne): store all errors in SriovNetworkNodeState
+	var (
+		wg     sync.WaitGroup
+		mu     sync.Mutex // guards result, which the worker goroutines write concurrently
+		result error
+	)
+	setResult := func(err error) {
+		mu.Lock()
+		defer mu.Unlock()
+		result = err
+	}
+	for _, ifaceStatus := range ifaceStatuses {
+		configured := false
+		for _, iface := range interfaces {
+			if iface.PciAddress == ifaceStatus.PciAddress {
+				configured = true
+
+				if skip := pfsToConfig[iface.PciAddress]; skip {
+					break
+				}
+
+				if !NeedUpdate(&iface, &ifaceStatus) {
+					log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): no need update interface", "address", iface.PciAddress)
+					break
+				}
+
+				// The worker receives copies by value: handing it the addresses of the
+				// range variables would let a later iteration overwrite what a running
+				// goroutine reads (the variables are shared before Go 1.22).
+				wg.Add(1)
+				go func(iface sriovnetworkv1.Interface, ifaceStatus sriovnetworkv1.InterfaceExt) {
+					defer wg.Done()
+					if err := configSriovDevice(&iface, &ifaceStatus); err != nil {
+						log.Log.Error(err, "ConfigSriovInterfacesInParallel(): fail to configure sriov interface. resetting interface.", "address", iface.PciAddress)
+						setResult(err)
+						// NOTE(review): ConfigSriovInterfaces checks iface.ExternallyManaged at
+						// this point; confirm whether the reset has to be skipped here as well.
+						if resetErr := resetSriovDevice(ifaceStatus); resetErr != nil {
+							log.Log.Error(resetErr, "ConfigSriovInterfacesInParallel(): fail to reset on error SR-IOV interface")
+							setResult(resetErr)
+						}
+					}
+				}(iface, ifaceStatus)
+
+				break
+			}
+		}
+		if !configured && ifaceStatus.NumVfs > 0 {
+			if skip := pfsToConfig[ifaceStatus.PciAddress]; skip {
+				continue
+			}
+
+			if err := resetSriovDevice(ifaceStatus); err != nil {
+				log.Log.Error(err, "ConfigSriovInterfacesInParallel(): reset failed", "address", ifaceStatus.PciAddress)
+				setResult(err)
+			}
+		}
+	}
+	wg.Wait()
+	if result != nil {
+		log.Log.Error(result, "ConfigSriovInterfacesInParallel(): fail to configure sriov interface")
+		return result
+	}
+	log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): sriov configuration finished")
+	return nil
 }
 
 func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
@@ -550,6 +632,7 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
 			return err
 		}
 	}
+	log.Log.V(2).Info("configSriovDevice(): config interface completed", "address", ifaceStatus.PciAddress)
 	return nil
 }
 
@@ -594,6 +677,7 @@ func setNetdevMTU(pciAddr string, mtu int) error {
 		log.Log.Error(err, "setNetdevMTU(): fail to write mtu file after retrying")
 		return err
 	}
+	log.Log.V(2).Info("setNetdevMTU(): set MTU for device completed", "address", pciAddr, "mtu", mtu)
 	return nil
 }
 