Skip to content

Commit

Permalink
Parallel NICs configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
e0ne committed Sep 20, 2023
1 parent 73e7564 commit 37a73d8
Show file tree
Hide file tree
Showing 11 changed files with 112 additions and 25 deletions.
2 changes: 2 additions & 0 deletions api/v1/sriovoperatorconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ type SriovOperatorConfigSpec struct {
// Default mode: daemon
// +kubebuilder:validation:Enum=daemon;systemd
ConfigurationMode ConfigurationModeType `json:"configurationMode,omitempty"`
// Flag to enable NICs configuration in a parallel on the same node
ParallelNicConfig bool `json:"parallelNicConfig,omitempty"`
}

// SriovOperatorConfigStatus defines the observed state of SriovOperatorConfig
Expand Down
4 changes: 2 additions & 2 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
glog.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
return fmt.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
}

// TODO(e0ne): read ParallelNicConfig from SriovOperatorConfig CR
// Create the generic plugin
configPlugin, err = generic.NewGenericPlugin(true)
configPlugin, err = generic.NewGenericPlugin(true, false)
if err != nil {
glog.Errorf("sriov-config-service: failed to create generic plugin %v", err)
return fmt.Errorf("sriov-config-service failed to create generic plugin %v", err)
Expand Down
9 changes: 6 additions & 3 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ var (
}

startOpts struct {
kubeconfig string
nodeName string
systemd bool
kubeconfig string
nodeName string
systemd bool
parallelNicConfig bool
}
)

Expand All @@ -63,6 +64,7 @@ func init() {
startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "NICs configuration in a parallel on the same node")
}

func runStartCmd(cmd *cobra.Command, args []string) {
Expand Down Expand Up @@ -208,6 +210,7 @@ func runStartCmd(cmd *cobra.Command, args []string) {
platformType,
startOpts.systemd,
devMode,
startOpts.parallelNicConfig,
).Run(stopCh, exitCh)
if err != nil {
glog.Errorf("failed to run daemon: %v", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ spec:
maximum: 2
minimum: 0
type: integer
parallelNicConfig:
description: Flag to enable NICs configuration in a parallel on the
same node
type: boolean
type: object
status:
description: SriovOperatorConfigStatus defines the observed state of SriovOperatorConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ spec:
maximum: 2
minimum: 0
type: integer
parallelNicConfig:
description: Flag to enable NICs configuration in a parallel on the
same node
type: boolean
type: object
status:
description: SriovOperatorConfigStatus defines the observed state of SriovOperatorConfig
Expand Down
6 changes: 5 additions & 1 deletion pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type Daemon struct {

useSystemdService bool

parallelNicConfig bool

devMode bool

client snclientset.Interface
Expand Down Expand Up @@ -149,11 +151,13 @@ func New(
platformType utils.PlatformType,
useSystemdService bool,
devMode bool,
parallelNicConfig bool,
) *Daemon {
return &Daemon{
name: nodeName,
platform: platformType,
useSystemdService: useSystemdService,
parallelNicConfig: parallelNicConfig,
devMode: devMode,
client: client,
kubeClient: kubeClient,
Expand Down Expand Up @@ -517,7 +521,7 @@ func (dn *Daemon) nodeStateSyncHandler() error {

// load plugins if it has not loaded
if len(dn.enabledPlugins) == 0 {
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState)
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, dn.parallelNicConfig, latestState)
if err != nil {
glog.Errorf("nodeStateSyncHandler(): failed to enable vendor plugins error: %v", err)
return err
Expand Down
1 change: 1 addition & 0 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ var _ = Describe("Config Daemon", func() {
utils.Baremetal,
false,
false,
false,
)

sut.enabledPlugins = map[string]plugin.VendorPlugin{generic.PluginName: &fake.FakePlugin{}}
Expand Down
4 changes: 2 additions & 2 deletions pkg/daemon/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ var (
K8sPlugin = k8splugin.NewK8sPlugin
)

func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState) (map[string]plugin.VendorPlugin, error) {
func enablePlugins(platform utils.PlatformType, useSystemdService, parallelNicConfig bool, ns *sriovnetworkv1.SriovNetworkNodeState) (map[string]plugin.VendorPlugin, error) {
glog.Infof("enableVendorPlugins(): enabling plugins")
enabledPlugins := map[string]plugin.VendorPlugin{}

Expand All @@ -54,7 +54,7 @@ func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *srio
}
enabledPlugins[k8sPlugin.Name()] = k8sPlugin
}
genericPlugin, err := GenericPlugin(false)
genericPlugin, err := GenericPlugin(false, parallelNicConfig)
if err != nil {
glog.Errorf("enableVendorPlugins(): failed to load the generic plugin error: %v", err)
return nil, err
Expand Down
30 changes: 16 additions & 14 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,20 @@ type DriverState struct {
type DriverStateMapType map[uint]*DriverState

type GenericPlugin struct {
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
RunningOnHost bool
HostManager host.HostManagerInterface
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
RunningOnHost bool
ParallelNicConfig bool
HostManager host.HostManagerInterface
}

const scriptsPath = "bindata/scripts/enable-kargs.sh"

// Initialize our plugin and set up initial values
func NewGenericPlugin(runningOnHost bool) (plugin.VendorPlugin, error) {
func NewGenericPlugin(runningOnHost, parallelNicConfig bool) (plugin.VendorPlugin, error) {
driverStateMap := make(map[uint]*DriverState)
driverStateMap[Vfio] = &DriverState{
DriverName: vfioPciDriver,
Expand All @@ -84,11 +85,12 @@ func NewGenericPlugin(runningOnHost bool) (plugin.VendorPlugin, error) {
}

return &GenericPlugin{
PluginName: PluginName,
SpecVersion: "1.0",
DriverStateMap: driverStateMap,
RunningOnHost: runningOnHost,
HostManager: host.NewHostManager(runningOnHost),
PluginName: PluginName,
SpecVersion: "1.0",
DriverStateMap: driverStateMap,
RunningOnHost: runningOnHost,
ParallelNicConfig: parallelNicConfig,
HostManager: host.NewHostManager(runningOnHost),
}, nil
}

Expand Down Expand Up @@ -166,7 +168,7 @@ func (p *GenericPlugin) Apply() error {
defer exit()
}

if err := utils.SyncNodeState(p.DesireState, pfsToSkip); err != nil {
if err := utils.SyncNodeState(p.DesireState, pfsToSkip, p.ParallelNicConfig); err != nil {
return err
}
p.LastState = &sriovnetworkv1.SriovNetworkNodeState{}
Expand Down
2 changes: 1 addition & 1 deletion pkg/plugins/generic/generic_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ var _ = Describe("Generic plugin", func() {
var genericPlugin plugin.VendorPlugin
var err error
BeforeEach(func() {
genericPlugin, err = NewGenericPlugin(false)
genericPlugin, err = NewGenericPlugin(false, false)
Expect(err).ToNot(HaveOccurred())
})

Expand Down
71 changes: 69 additions & 2 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"time"

Expand Down Expand Up @@ -147,8 +148,71 @@ func DiscoverSriovDevices(withUnsupported bool) ([]sriovnetworkv1.InterfaceExt,
}

// SyncNodeState Attempt to update the node state to match the desired state
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool) error {
return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool, parallelNicConfig bool) error {
	// Both configuration paths take the same inputs: the desired interfaces
	// from the spec, the observed interfaces from the status, and the PF map.
	desired := newState.Spec.Interfaces
	observed := newState.Status.Interfaces

	// parallelNicConfig selects the concurrent implementation; otherwise the
	// sequential one is used.
	if parallelNicConfig {
		return ConfigSriovInterfacesInParallel(desired, observed, pfsToConfig)
	}
	return ConfigSriovInterfaces(desired, observed, pfsToConfig)
}

// ConfigSriovInterfacesInParallel applies the desired SR-IOV configuration,
// starting one goroutine per interface that needs to be configured and
// waiting for all of them to finish before returning.
//
// For every entry of ifaceStatuses:
//   - when an entry of interfaces has the same PCI address, the device is
//     configured with configSriovDevice, unless its address maps to true in
//     pfsToConfig or NeedUpdate reports that nothing has to change. A failed
//     configuration is followed by a resetSriovDevice attempt;
//   - when no entry of interfaces matches and the status reports NumVfs > 0,
//     the device is reset synchronously, unless its address maps to true in
//     pfsToConfig.
//
// An error is returned immediately, before any device is touched, when
// IsKernelLockdownMode(true) and hasMellanoxInterfacesInSpec both report true.
// Otherwise the function returns one of the errors recorded while configuring
// or resetting devices (the most recently recorded one), or nil when every
// operation succeeded.
func ConfigSriovInterfacesInParallel(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
	glog.V(2).Infof("ConfigSriovInterfacesInParallel(): start sriov configuration")
	if IsKernelLockdownMode(true) && hasMellanoxInterfacesInSpec(ifaceStatuses, interfaces) {
		glog.Warningf("cannot use mellanox devices when in kernel lockdown mode")
		return fmt.Errorf("cannot use mellanox devices when in kernel lockdown mode")
	}

	// TODO(e0ne): store all errors in SriovNetworkNodeState
	var (
		result error
		mu     sync.Mutex // guards result, which is written by the workers and by this loop
		wg     sync.WaitGroup
	)
	recordErr := func(err error) {
		mu.Lock()
		defer mu.Unlock()
		result = err
	}

	for _, ifaceStatus := range ifaceStatuses {
		configured := false
		for _, iface := range interfaces {
			if iface.PciAddress != ifaceStatus.PciAddress {
				continue
			}
			configured = true

			if skip := pfsToConfig[iface.PciAddress]; skip {
				break
			}

			if !NeedUpdate(&iface, &ifaceStatus) {
				glog.V(2).Infof("syncNodeState(): no need update interface %s", iface.PciAddress)
				break
			}

			wg.Add(1)
			// Pass the interface and its status by value so that every worker
			// owns private copies. Handing out pointers to the range variables
			// would let later iterations overwrite the data a running worker
			// is still using.
			go func(iface sriovnetworkv1.Interface, ifaceStatus sriovnetworkv1.InterfaceExt) {
				// Deferred so the WaitGroup is released on every exit path.
				defer wg.Done()
				if err := configSriovDevice(&iface, &ifaceStatus); err != nil {
					glog.Errorf("ConfigSriovInterfacesInParallel(): fail to configure sriov interface %s: %v. resetting interface.", iface.PciAddress, err)
					recordErr(err)
					if resetErr := resetSriovDevice(ifaceStatus); resetErr != nil {
						glog.Errorf("SyncNodeState(): fail to reset on error SR-IOV interface: %s", resetErr)
						recordErr(resetErr)
					}
				}
			}(iface, ifaceStatus)

			break
		}

		if !configured && ifaceStatus.NumVfs > 0 {
			if skip := pfsToConfig[ifaceStatus.PciAddress]; skip {
				continue
			}

			// This reset runs on the calling goroutine while workers started
			// in earlier iterations may still be running, so the error must be
			// recorded under the same lock.
			if err := resetSriovDevice(ifaceStatus); err != nil {
				glog.V(2).Infof("ConfigSriovInterfacesInParallel(): reset failed %v", ifaceStatus.PciAddress)
				recordErr(err)
			}
		}
	}

	// After Wait returns no worker is running, so result can be read without
	// holding the lock.
	wg.Wait()
	if result != nil {
		glog.Errorf("ConfigSriovInterfacesInParallel(): fail to configure sriov interface: %v", result)
		return result
	}
	glog.V(2).Infof("ConfigSriovInterfacesInParallel(): sriov configuration finished")
	return nil
}

func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
Expand All @@ -171,6 +235,7 @@ func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses
glog.V(2).Infof("syncNodeState(): no need update interface %s", iface.PciAddress)
break
}

if err = configSriovDevice(&iface, &ifaceStatus); err != nil {
glog.Errorf("SyncNodeState(): fail to configure sriov interface %s: %v. resetting interface.", iface.PciAddress, err)
if resetErr := resetSriovDevice(ifaceStatus); resetErr != nil {
Expand Down Expand Up @@ -415,6 +480,7 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
return err
}
}
glog.V(2).Infof("configSriovDevice(): config interface %s completed", ifaceStatus.PciAddress)
return nil
}

Expand Down Expand Up @@ -459,6 +525,7 @@ func setNetdevMTU(pciAddr string, mtu int) error {
glog.Warningf("setNetdevMTU(): fail to write mtu file after retrying: %v", err)
return err
}
glog.V(2).Infof("setNetdevMTU(): set MTU for device %s to %d completed", pciAddr, mtu)
return nil
}

Expand Down

0 comments on commit 37a73d8

Please sign in to comment.