Skip to content

Commit

Permalink
Parallel NICs configuration
Browse files Browse the repository at this point in the history
Signed-off-by: Ivan Kolodiazhnyi <[email protected]>
  • Loading branch information
e0ne committed Nov 6, 2023
1 parent 2e90bef commit 1e08d75
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 14 deletions.
5 changes: 4 additions & 1 deletion bindata/manifests/daemon/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,12 @@ spec:
privileged: true
args:
- "start"
{{- if .UsedSystemdMode}}
{{- if .UsedSystemdMode }}
- --use-systemd-service
{{- end }}
{{- if .ParallelNicConfig }}
- --parallel-nic-config
{{- end }}
env:
- name: NODE_NAME
valueFrom:
Expand Down
4 changes: 2 additions & 2 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
setupLog.Error(err, "failed to discover sriov devices on the host")
return fmt.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
}

// TODO(e0ne): read ParallelNicConfig from SriovOperatorConfig CR
// Create the generic plugin
configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager)
configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager, false)
if err != nil {
setupLog.Error(err, "failed to create generic plugin")
return fmt.Errorf("sriov-config-service failed to create generic plugin %v", err)
Expand Down
9 changes: 6 additions & 3 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ var (
}

startOpts struct {
kubeconfig string
nodeName string
systemd bool
kubeconfig string
nodeName string
systemd bool
parallelNicConfig bool
}
)

Expand All @@ -63,6 +64,7 @@ func init() {
startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "NICs configuration in a parallel on the same node")
}

func runStartCmd(cmd *cobra.Command, args []string) error {
Expand Down Expand Up @@ -216,6 +218,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
startOpts.systemd,
eventRecorder,
devMode,
startOpts.parallelNicConfig,
).Run(stopCh, exitCh)
if err != nil {
setupLog.Error(err, "failed to run daemon")
Expand Down
5 changes: 5 additions & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
} else {
data.Data["UsedSystemdMode"] = false
}
if parallelConfig, ok := dc.Spec.FeatureGates["parallelNicConfig"]; ok {
data.Data["ParallelNicConfig"] = parallelConfig
} else {
data.Data["ParallelNicConfig"] = false
}

envCniBinPath := os.Getenv("SRIOV_CNI_BIN_PATH")
if envCniBinPath == "" {
Expand Down
6 changes: 5 additions & 1 deletion pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ type Daemon struct {

useSystemdService bool

parallelNicConfig bool

devMode bool

client snclientset.Interface
Expand Down Expand Up @@ -154,11 +156,13 @@ func New(
useSystemdService bool,
er *EventRecorder,
devMode bool,
parallelNicConfig bool,
) *Daemon {
return &Daemon{
name: nodeName,
platform: platformType,
useSystemdService: useSystemdService,
parallelNicConfig: parallelNicConfig,
devMode: devMode,
client: client,
kubeClient: kubeClient,
Expand Down Expand Up @@ -551,7 +555,7 @@ func (dn *Daemon) nodeStateSyncHandler() error {

// load plugins if it has not loaded
if len(dn.enabledPlugins) == 0 {
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager)
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager, dn.parallelNicConfig)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
Expand Down
1 change: 1 addition & 0 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ var _ = Describe("Config Daemon", func() {
false,
er,
false,
false,
)

sut.enabledPlugins = map[string]plugin.VendorPlugin{generic.PluginName: &fake.FakePlugin{}}
Expand Down
4 changes: 2 additions & 2 deletions pkg/daemon/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ var (
K8sPlugin = k8splugin.NewK8sPlugin
)

func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (map[string]plugin.VendorPlugin, error) {
func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (map[string]plugin.VendorPlugin, error) {
log.Log.Info("enableVendorPlugins(): enabling plugins")
enabledPlugins := map[string]plugin.VendorPlugin{}

Expand All @@ -55,7 +55,7 @@ func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *srio
}
enabledPlugins[k8sPlugin.Name()] = k8sPlugin
}
genericPlugin, err := GenericPlugin(false, hostManager, storeManager)
genericPlugin, err := GenericPlugin(false, hostManager, storeManager, parallelNicConfig)
if err != nil {
log.Log.Error(err, "enableVendorPlugins(): failed to load the generic plugin")
return nil, err
Expand Down
6 changes: 4 additions & 2 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,15 @@ type GenericPlugin struct {
DriverStateMap DriverStateMapType
DesiredKernelArgs map[string]bool
RunningOnHost bool
ParallelNicConfig bool
HostManager host.HostManagerInterface
StoreManager utils.StoreManagerInterface
}

const scriptsPath = "bindata/scripts/enable-kargs.sh"

// Initialize our plugin and set up initial values
func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (plugin.VendorPlugin, error) {
func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (plugin.VendorPlugin, error) {
driverStateMap := make(map[uint]*DriverState)
driverStateMap[Vfio] = &DriverState{
DriverName: vfioPciDriver,
Expand Down Expand Up @@ -91,6 +92,7 @@ func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface,
DriverStateMap: driverStateMap,
DesiredKernelArgs: make(map[string]bool),
RunningOnHost: runningOnHost,
ParallelNicConfig: parallelNicConfig,
HostManager: hostManager,
StoreManager: storeManager,
}, nil
Expand Down Expand Up @@ -173,7 +175,7 @@ func (p *GenericPlugin) Apply() error {
defer exit()
}

if err := utils.SyncNodeState(p.DesireState, pfsToSkip); err != nil {
if err := utils.SyncNodeState(p.DesireState, pfsToSkip, p.ParallelNicConfig); err != nil {
// Catch the "cannot allocate memory" error and try to use PCI realloc
if errors.Is(err, syscall.ENOMEM) {
p.addToDesiredKernelArgs(utils.KernelArgPciRealloc)
Expand Down
2 changes: 1 addition & 1 deletion pkg/plugins/generic/generic_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ var _ = Describe("Generic plugin", func() {
ctrl = gomock.NewController(t)
mockHost = mock_host.NewMockHostManagerInterface(ctrl)
mockStore = mock_utils.NewMockStoreManagerInterface(ctrl)
genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore)
genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore, false)
Expect(err).ToNot(HaveOccurred())
})

Expand Down
71 changes: 69 additions & 2 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"time"

Expand Down Expand Up @@ -200,8 +201,71 @@ func DiscoverSriovDevices(withUnsupported bool, storeManager StoreManagerInterfa
}

// SyncNodeState attempts to update the node state to match the desired state.
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool) error {
return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool, parallelNicConfig bool) error {
	// Dispatch on the requested strategy: the parallel path configures PFs
	// concurrently, the default path goes through ConfigSriovInterfaces.
	if parallelNicConfig {
		return ConfigSriovInterfacesInParallel(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
	}
	return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
}

// ConfigSriovInterfacesInParallel applies the desired SR-IOV configuration to
// the host, configuring every PF that needs an update in its own goroutine.
//
// For each entry of ifaceStatuses the matching spec entry in interfaces is
// looked up by PCI address:
//   - if a spec exists, the PF is not marked to be skipped in pfsToConfig and
//     NeedUpdate reports a difference, configSriovDevice is run concurrently;
//     on failure the device is reset with resetSriovDevice;
//   - if no spec exists and the PF currently has VFs, the device is reset
//     synchronously (unless it is marked to be skipped).
//
// An entry of pfsToConfig whose value is true means "skip this PCI address".
//
// The function waits for all workers to finish and returns the most recently
// recorded error (nil when everything succeeded). It fails immediately when
// the kernel is in lockdown mode and Mellanox devices are part of the spec.
func ConfigSriovInterfacesInParallel(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
	log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): start sriov configuration")
	if IsKernelLockdownMode(true) && hasMellanoxInterfacesInSpec(ifaceStatuses, interfaces) {
		log.Log.Error(nil, "cannot use mellanox devices when in kernel lockdown mode")
		return fmt.Errorf("cannot use mellanox devices when in kernel lockdown mode")
	}
	// TODO(e0ne): store all errors in SriovNetworkNodeState
	var (
		mu     sync.Mutex // guards result, which is written by the workers and by this goroutine
		result error
		wg     sync.WaitGroup
	)
	recordErr := func(err error) {
		mu.Lock()
		defer mu.Unlock()
		result = err
	}

	for _, ifaceStatus := range ifaceStatuses {
		// Per-iteration copy: workers keep a pointer to it after the loop
		// advances, and before Go 1.22 the range variable is reused.
		ifaceStatus := ifaceStatus
		configured := false
		for _, iface := range interfaces {
			if iface.PciAddress != ifaceStatus.PciAddress {
				continue
			}
			// Same reason as above: give the worker its own copy.
			iface := iface
			configured = true

			if skip := pfsToConfig[iface.PciAddress]; skip {
				break
			}

			if !NeedUpdate(&iface, &ifaceStatus) {
				log.Log.V(2).Info("syncNodeState(): no need update interface", "address", iface.PciAddress)
				break
			}

			wg.Add(1)
			go func(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetworkv1.InterfaceExt) {
				// Deferred so wg.Wait() cannot hang if the worker panics.
				defer wg.Done()
				if err := configSriovDevice(iface, ifaceStatus); err != nil {
					log.Log.Error(err, "ConfigSriovInterfacesInParallel(): fail to configure sriov interface. resetting interface.", "address", iface.PciAddress)
					recordErr(err)
					if resetErr := resetSriovDevice(*ifaceStatus); resetErr != nil {
						log.Log.Error(resetErr, "SyncNodeState(): fail to reset on error SR-IOV interface")
						recordErr(resetErr)
					}
				}
			}(&iface, &ifaceStatus)

			break
		}
		if !configured && ifaceStatus.NumVfs > 0 {
			if skip := pfsToConfig[ifaceStatus.PciAddress]; skip {
				continue
			}

			// This reset runs on the launching goroutine while earlier
			// workers may still be active, so it records through the mutex too.
			if err := resetSriovDevice(ifaceStatus); err != nil {
				log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): reset failed", "address", ifaceStatus.PciAddress)
				recordErr(err)
			}
		}
	}
	wg.Wait()
	// All workers are done; wg.Wait() orders their writes before this read.
	if result != nil {
		log.Log.Error(result, "ConfigSriovInterfacesInParallel(): fail to configure sriov interface")
		return result
	}
	log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): sriov configuration finished")
	return nil
}

func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
Expand Down Expand Up @@ -238,6 +302,7 @@ func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses

break
}

if err = configSriovDevice(&iface, &ifaceStatus); err != nil {
log.Log.Error(err, "SyncNodeState(): fail to configure sriov interface. resetting interface.", "address", iface.PciAddress)
if iface.ExternallyManaged {
Expand Down Expand Up @@ -550,6 +615,7 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
return err
}
}
log.Log.V(2).Info("configSriovDevice(): config interface completed", "address", ifaceStatus.PciAddress)
return nil
}

Expand Down Expand Up @@ -594,6 +660,7 @@ func setNetdevMTU(pciAddr string, mtu int) error {
log.Log.Error(err, "setNetdevMTU(): fail to write mtu file after retrying")
return err
}
log.Log.V(2).Info("setNetdevMTU(): set MTU for device completed", "address", pciAddr, "mtu", mtu)
return nil
}

Expand Down

0 comments on commit 1e08d75

Please sign in to comment.