Skip to content

Commit

Permalink
Parallel NICs configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
e0ne committed Feb 9, 2024
1 parent d162021 commit aafe487
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 59 deletions.
7 changes: 7 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions bindata/manifests/daemon/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ spec:
{{- with index . "DisablePlugins" }}
- --disable-plugins={{.}}
{{- end }}
{{- if .ParallelNicConfig }}
- --parallel-nic-config
{{- end }}
env:
- name: NODE_NAME
valueFrom:
Expand Down
12 changes: 8 additions & 4 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,11 @@ var (
}

startOpts struct {
kubeconfig string
nodeName string
systemd bool
disabledPlugins stringList
kubeconfig string
nodeName string
systemd bool
disabledPlugins stringList
parallelNicConfig bool
}
)

Expand All @@ -91,6 +92,7 @@ func init() {
startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
startCmd.PersistentFlags().VarP(&startOpts.disabledPlugins, "disable-plugins", "", "comma-separated list of plugins to disable")
startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "perform NIC configuration in parallel")
}

func runStartCmd(cmd *cobra.Command, args []string) error {
Expand All @@ -104,6 +106,8 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
vars.UsingSystemdMode = true
}

vars.ParallelNicConfig = startOpts.parallelNicConfig

if startOpts.nodeName == "" {
name, ok := os.LookupEnv("NODE_NAME")
if !ok || name == "" {
Expand Down
4 changes: 4 additions & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
} else {
data.Data["UsedSystemdMode"] = false
}
data.Data["ParallelNicConfig"] = false
if parallelConfig, ok := dc.Spec.FeatureGates[consts.ParallelNicConfigFeatureGate]; ok {
data.Data["ParallelNicConfig"] = parallelConfig
}

envCniBinPath := os.Getenv("SRIOV_CNI_BIN_PATH")
if envCniBinPath == "" {
Expand Down
2 changes: 2 additions & 0 deletions pkg/consts/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ const (
KernelArgPciRealloc = "pci=realloc"
KernelArgIntelIommu = "intel_iommu=on"
KernelArgIommuPt = "iommu=pt"

ParallelNicConfigFeatureGate = "parallelNicConfig"
)

const (
Expand Down
255 changes: 200 additions & 55 deletions pkg/host/internal/sriov/sriov.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ import (
mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox"
)

// interfaceToConfigure pairs the desired configuration of an interface with
// the status entry it was matched against, so both can be handed to the
// configuration routines together.
type interfaceToConfigure struct {
// iface is the interface configuration to apply.
iface sriovnetworkv1.Interface
// ifaceStatus is the status entry that corresponds to iface.
ifaceStatus sriovnetworkv1.InterfaceExt
}

type sriov struct {
utilsHelper utils.CmdInterface
kernelHelper types.KernelInterface
Expand Down Expand Up @@ -450,13 +455,43 @@ func (s *sriov) ConfigSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *
return nil
}

func (s *sriov) ConfigSriovInterfaces(storeManager store.ManagerInterface,
interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
// ConfigSriovInterfaces configures the interfaces that need an update and
// then resets the interfaces that are no longer requested.
//
// The work is split in two phases: getConfigureAndReset computes the list of
// interfaces to configure and the list of interfaces to reset, then each list
// is processed either sequentially or in parallel depending on
// vars.ParallelNicConfig. The reset phase only runs when the configure phase
// succeeded.
//
// It returns an error when Mellanox devices are requested while the kernel is
// in lockdown mode, or when any phase fails. Phase errors wrap the underlying
// cause so callers can inspect it with errors.Is / errors.As.
func (s *sriov) ConfigSriovInterfaces(storeManager store.ManagerInterface, interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
	if s.kernelHelper.IsKernelLockdownMode() && mlx.HasMellanoxInterfacesInSpec(ifaceStatuses, interfaces) {
		log.Log.Error(nil, "cannot use mellanox devices when in kernel lockdown mode")
		return fmt.Errorf("cannot use mellanox devices when in kernel lockdown mode")
	}

	toBeConfigured, toBeResetted, err := s.getConfigureAndReset(storeManager, interfaces, ifaceStatuses, pfsToConfig)
	if err != nil {
		log.Log.Error(err, "cannot get a list of interfaces to configure")
		return fmt.Errorf("cannot get a list of interfaces to configure: %w", err)
	}

	if vars.ParallelNicConfig {
		err = s.configSriovInterfacesInParallel(storeManager, toBeConfigured)
	} else {
		err = s.configSriovInterfaces(storeManager, toBeConfigured)
	}
	if err != nil {
		log.Log.Error(err, "cannot configure sriov interfaces")
		return fmt.Errorf("cannot configure sriov interfaces: %w", err)
	}

	if vars.ParallelNicConfig {
		err = s.resetSriovInterfacesInParallel(storeManager, toBeResetted)
	} else {
		err = s.resetSriovInterfaces(storeManager, toBeResetted)
	}
	if err != nil {
		log.Log.Error(err, "cannot reset sriov interfaces")
		return fmt.Errorf("cannot reset sriov interfaces: %w", err)
	}
	return nil
}

func (s *sriov) getConfigureAndReset(storeManager store.ManagerInterface, interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) ([]interfaceToConfigure, []sriovnetworkv1.InterfaceExt, error) {
toBeConfigured := []interfaceToConfigure{}
toBeResetted := []sriovnetworkv1.InterfaceExt{}
for _, ifaceStatus := range ifaceStatuses {
configured := false
for _, iface := range interfaces {
Expand All @@ -467,76 +502,186 @@ func (s *sriov) ConfigSriovInterfaces(storeManager store.ManagerInterface,
break
}

if !sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) {
log.Log.V(2).Info("syncNodeState(): no need update interface", "address", iface.PciAddress)

// Save the PF status to the host
err := storeManager.SaveLastPfAppliedStatus(&iface)
if err != nil {
log.Log.Error(err, "SyncNodeState(): failed to save PF applied config to host")
return err
}

break
}
if err := s.ConfigSriovDevice(&iface, &ifaceStatus); err != nil {
log.Log.Error(err, "SyncNodeState(): fail to configure sriov interface. resetting interface.", "address", iface.PciAddress)
if iface.ExternallyManaged {
log.Log.Info("SyncNodeState(): skipping device reset as the nic is marked as externally created")
} else {
if resetErr := s.ResetSriovDevice(ifaceStatus); resetErr != nil {
log.Log.Error(resetErr, "SyncNodeState(): failed to reset on error SR-IOV interface")
}
}
return err
}

// Save the PF status to the host
err := storeManager.SaveLastPfAppliedStatus(&iface)
skip, err := skipSriovConfig(&iface, &ifaceStatus, storeManager)
if err != nil {
log.Log.Error(err, "SyncNodeState(): failed to save PF applied config to host")
return err
log.Log.Error(err, "getConfigureAndReset(): failed to check interface")
return nil, nil, err
}
break
if skip {
break
}
toBeConfigured = append(toBeConfigured, interfaceToConfigure{iface: iface, ifaceStatus: ifaceStatus})
}
}

if !configured && ifaceStatus.NumVfs > 0 {
if skip := pfsToConfig[ifaceStatus.PciAddress]; skip {
continue
if skip := pfsToConfig[ifaceStatus.PciAddress]; !skip {
toBeResetted = append(toBeResetted, ifaceStatus)
}
}
}
return toBeConfigured, toBeResetted, nil
}

// load the PF info
pfStatus, exist, err := storeManager.LoadPfsStatus(ifaceStatus.PciAddress)
if err != nil {
log.Log.Error(err, "SyncNodeState(): failed to load info about PF status for device",
"address", ifaceStatus.PciAddress)
return err
func (s *sriov) configSriovInterfacesInParallel(storeManager store.ManagerInterface, interfaces []interfaceToConfigure) error {
log.Log.V(2).Info("configSriovInterfacesInParallel(): start sriov configuration")

var result error
errChannel := make(chan error)
interfacesToConfigure := 0
for ifaceIndex, iface := range interfaces {
interfacesToConfigure += 1
go func(iface *interfaceToConfigure) {
if err := s.ConfigSriovDevice(&iface.iface, &iface.ifaceStatus); err != nil {
log.Log.Error(err, "configSriovInterfacesInParallel(): fail to configure sriov interface. resetting interface.", "address", iface.iface.PciAddress)
if iface.iface.ExternallyManaged {
log.Log.Info("configSriovInterfacesInParallel(): skipping device reset as the nic is marked as externally created")
} else {
if resetErr := s.ResetSriovDevice(iface.ifaceStatus); resetErr != nil {
log.Log.Error(resetErr, "configSriovInterfacesInParallel(): failed to reset on error SR-IOV interface")
}
}
errChannel <- err
} else {
errChannel <- nil
}
}(&interfaces[ifaceIndex])
// Save the PF status to the host
err := storeManager.SaveLastPfAppliedStatus(&iface.iface)
if err != nil {
log.Log.Error(err, "configSriovInterfacesInParallel(): failed to save PF applied config to host")
return err
}
break
}

if !exist {
log.Log.Info("SyncNodeState(): PF name with pci address has VFs configured but they weren't created by the sriov operator. Skipping the device reset",
"pf-name", ifaceStatus.Name,
"address", ifaceStatus.PciAddress)
continue
for i := 0; i < interfacesToConfigure; i++ {
errMsg := <-errChannel
result = errors.Join(result, errMsg)
}
if result != nil {
log.Log.Error(result, "ConfigSriovInterfacesInParallel(): fail to configure sriov interface")
return result
}
log.Log.V(2).Info("ConfigSriovInterfacesInParallel(): sriov configuration finished")
return nil
}

func (s *sriov) resetSriovInterfacesInParallel(storeManager store.ManagerInterface, interfaces []sriovnetworkv1.InterfaceExt) error {
var result error
errChannel := make(chan error)
interfacesToReset := 0
for ifaceIndex := range interfaces {
interfacesToReset += 1
go func(iface *sriovnetworkv1.InterfaceExt) {
if err := s.checkForConfigAndReset(*iface, storeManager); err != nil {
log.Log.Error(err, "resetSriovInterfacesInParallel(): fail to configure sriov interface. resetting interface.", "address", iface.PciAddress)
errChannel <- err
} else {
errChannel <- nil
}
}(&interfaces[ifaceIndex])
}

if pfStatus.ExternallyManaged {
log.Log.Info("SyncNodeState(): PF name with pci address was externally created skipping the device reset",
"pf-name", ifaceStatus.Name,
"address", ifaceStatus.PciAddress)
continue
for i := 0; i < interfacesToReset; i++ {
errMsg := <-errChannel
result = errors.Join(result, errMsg)
}
if result != nil {
log.Log.Error(result, "resetSriovInterfacesInParallel(): fail to reset sriov interface")
return result
}
log.Log.V(2).Info("resetSriovInterfacesInParallel(): sriov reset finished")

return nil
}

func (s *sriov) configSriovInterfaces(storeManager store.ManagerInterface, interfaces []interfaceToConfigure) error {
log.Log.V(2).Info("configSriovInterfaces(): start sriov configuration")
for _, iface := range interfaces {
if err := s.ConfigSriovDevice(&iface.iface, &iface.ifaceStatus); err != nil {
log.Log.Error(err, "configSriovInterfaces(): fail to configure sriov interface. resetting interface.", "address", iface.iface.PciAddress)
if iface.iface.ExternallyManaged {
log.Log.Info("configSriovInterfaces(): skipping device reset as the nic is marked as externally created")
} else {
err = s.udevHelper.RemoveUdevRule(ifaceStatus.PciAddress)
if err != nil {
return err
if resetErr := s.ResetSriovDevice(iface.ifaceStatus); resetErr != nil {
log.Log.Error(resetErr, "configSriovInterfaces(): failed to reset on error SR-IOV interface")
}
}
return err
}

if err = s.ResetSriovDevice(ifaceStatus); err != nil {
return err
}
// Save the PF status to the host
err := storeManager.SaveLastPfAppliedStatus(&iface.iface)
if err != nil {
log.Log.Error(err, "configSriovInterfaces(): failed to save PF applied config to host")
return err
}
}
log.Log.V(2).Info("configSriovInterfaces(): sriov configuration finished")
return nil
}

// resetSriovInterfaces runs checkForConfigAndReset on each interface in order
// and returns the first error encountered, leaving the remaining interfaces
// untouched.
func (s *sriov) resetSriovInterfaces(storeManager store.ManagerInterface, interfaces []sriovnetworkv1.InterfaceExt) error {
	for idx := range interfaces {
		status := interfaces[idx]
		resetErr := s.checkForConfigAndReset(status, storeManager)
		if resetErr == nil {
			continue
		}
		log.Log.Error(resetErr, "resetSriovInterfaces(): failed to reset sriov interface. resetting interface.", "address", status.PciAddress)
		return resetErr
	}
	log.Log.V(2).Info("resetSriovInterfaces(): sriov reset finished")
	return nil
}

// skipSriovConfig reports whether SR-IOV configuration can be skipped for the
// given interface.
//
// It returns true when NeedToUpdateSriov says no update is required; in that
// case the PF status is still saved through storeManager, and a failure to
// save is returned as (false, err). It returns (false, nil) when the interface
// needs to be configured.
func skipSriovConfig(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetworkv1.InterfaceExt, storeManager store.ManagerInterface) (bool, error) {
	if sriovnetworkv1.NeedToUpdateSriov(iface, ifaceStatus) {
		return false, nil
	}

	log.Log.V(2).Info("ConfigSriovInterfaces(): no need update interface", "address", iface.PciAddress)

	// Save the PF status to the host
	if saveErr := storeManager.SaveLastPfAppliedStatus(iface); saveErr != nil {
		log.Log.Error(saveErr, "ConfigSriovInterfaces(): failed to save PF applied config to host")
		return false, saveErr
	}
	return true, nil
}

// checkForConfigAndReset resets the device described by ifaceStatus when the
// stored PF status allows it.
//
// The reset is skipped (nil is returned) when no PF status is stored for the
// PCI address or when the stored status is marked as externally managed.
// Otherwise the udev rule for the PCI address is removed and the device is
// reset. Errors from loading the status, removing the udev rule, or resetting
// the device are returned as-is.
func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, storeManager store.ManagerInterface) error {
	// load the PF info
	pfStatus, found, loadErr := storeManager.LoadPfsStatus(ifaceStatus.PciAddress)
	switch {
	case loadErr != nil:
		log.Log.Error(loadErr, "checkForConfigAndReset(): failed to load info about PF status for device",
			"address", ifaceStatus.PciAddress)
		return loadErr
	case !found:
		log.Log.Info("checkForConfigAndReset(): PF name with pci address has VFs configured but they weren't created by the sriov operator. Skipping the device reset",
			"pf-name", ifaceStatus.Name,
			"address", ifaceStatus.PciAddress)
		return nil
	case pfStatus.ExternallyManaged:
		log.Log.Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset",
			"pf-name", ifaceStatus.Name,
			"address", ifaceStatus.PciAddress)
		return nil
	}

	if udevErr := s.udevHelper.RemoveUdevRule(ifaceStatus.PciAddress); udevErr != nil {
		return udevErr
	}

	return s.ResetSriovDevice(ifaceStatus)
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/vars/vars.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ var (
// UsingSystemdMode global variable to mark the config-daemon is running on systemd mode
UsingSystemdMode = false

// ParallelNicConfig global variable to perform NIC configuration in parallel
ParallelNicConfig = false

// FilesystemRoot used by test to mock interactions with filesystem
FilesystemRoot = ""

Expand Down

0 comments on commit aafe487

Please sign in to comment.