Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add waitForDevicesInitialization to systemd service #808

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"os"
"time"

"github.com/go-logr/logr"
"github.com/spf13/cobra"
Expand All @@ -40,6 +41,12 @@ import (
const (
PhasePre = "pre"
PhasePost = "post"

// InitializationDeviceDiscoveryTimeoutSec constant defines the number of
// seconds to wait for devices to be registered in the system with the expected name.
InitializationDeviceDiscoveryTimeoutSec = 60
// InitializationDeviceUdevProcessingTimeoutSec constant defines the number of seconds to wait for udev rules to process
InitializationDeviceUdevProcessingTimeoutSec = 60
)

var (
Expand Down Expand Up @@ -104,6 +111,8 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to create hostHelpers: %v", err))
}

waitForDevicesInitialization(setupLog, sriovConf, hostHelpers)

if phaseArg == PhasePre {
err = phasePre(setupLog, sriovConf, hostHelpers)
} else {
Expand Down Expand Up @@ -303,3 +312,51 @@ func updateResult(setupLog logr.Logger, result, msg string) error {
setupLog.V(0).Info("result file updated", "SyncStatus", sriovResult.SyncStatus, "LastSyncError", msg)
return nil
}

// waitForDevicesInitialization should be executed in both the pre and post-networking stages.
// This function ensures that the network devices specified in the configuration are registered
// and handled by UDEV. Sometimes, the initialization of network devices might take a significant
// amount of time, and the sriov-config systemd service may start before the devices are fully
// processed, leading to failure.
//
// To address this, we not only check if the devices are registered with the correct name but also
// wait for the udev event queue to empty. This increases the likelihood that the service will start
// only when the devices are fully initialized. It is required to call this function in the
// "post-networking" phase as well because the OS network manager might change device configurations,
// and we need to ensure these changes are fully processed before starting the post-networking part.
//
// The timeouts used in this function are intentionally kept low to avoid blocking the OS loading
// process for too long in case of any issues.
//
// Note: Currently, this function handles only Baremetal clusters. We do not have evidence that
// this logic is required for virtual clusters.
func waitForDevicesInitialization(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) {
if conf.PlatformType != consts.Baremetal {
// skip waiting on virtual cluster
return
}
// wait for devices from the spec to be registered in the system with expected names
devicesToWait := make(map[string]string, len(conf.Spec.Interfaces))
for _, d := range conf.Spec.Interfaces {
devicesToWait[d.PciAddress] = d.Name
}
deadline := time.Now().Add(time.Second * time.Duration(InitializationDeviceDiscoveryTimeoutSec))
for time.Now().Before(deadline) {
for pciAddr, name := range devicesToWait {
if hostHelpers.TryGetInterfaceName(pciAddr) == name {
delete(devicesToWait, pciAddr)
}
}
if len(devicesToWait) == 0 {
break
}
time.Sleep(time.Second)
}
if len(devicesToWait) != 0 {
setupLog.Info("WARNING: some devices were not initialized", "devices", devicesToWait, "timeout", InitializationDeviceDiscoveryTimeoutSec)
}
if err := hostHelpers.WaitUdevEventsProcessed(InitializationDeviceUdevProcessingTimeoutSec); err != nil {
setupLog.Info("WARNING: failed to wait for udev events processing", "reason", err.Error(),
"timeout", InitializationDeviceUdevProcessingTimeoutSec)
}
}
20 changes: 20 additions & 0 deletions cmd/sriov-network-config-daemon/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"fmt"

"github.com/go-logr/logr"
"github.com/golang/mock/gomock"
"github.com/spf13/cobra"
"gopkg.in/yaml.v3"
Expand Down Expand Up @@ -158,6 +159,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand Down Expand Up @@ -211,6 +214,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand All @@ -236,6 +241,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": getTestResultFileContent("InProgress", ""),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{
Name: "enp216s0f0np0",
}}, nil)
Expand Down Expand Up @@ -276,4 +283,17 @@ var _ = Describe("Service", func() {
testHelpers.GinkgoAssertFileContentsEquals("/etc/sriov-operator/sriov-interface-result.yaml",
string(getTestResultFileContent("Failed", "post: unexpected result of the pre phase: Failed, syncError: pretest")))
})
It("waitForDevicesInitialization", func() {
cfg := &systemd.SriovConfig{Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: []sriovnetworkv1.Interface{
{Name: "name1", PciAddress: "0000:d8:00.0"},
{Name: "name2", PciAddress: "0000:d8:00.1"}}}}
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("other")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("name1")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("name2")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
waitForDevicesInitialization(logr.Discard(), cfg, hostHelpers)
})
})
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/udev/udev.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"path"
"path/filepath"
"strconv"
"strings"

"sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -126,6 +127,18 @@ func (u *udev) LoadUdevRules() error {
return nil
}

// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
func (u *udev) WaitUdevEventsProcessed(timeout int) error {
log.Log.V(2).Info("WaitUdevEventsProcessed()")
_, stderr, err := u.utilsHelper.RunCommand("udevadm", "settle", "-t", strconv.Itoa(timeout))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this supposed to run in the host chroot? I tried running it in the config-daemon pod:

[root@cnfdr22 /]# udevadm settle -t 10
Running in chroot, ignoring request.

Copy link
Collaborator Author

@ykulazhenkov ykulazhenkov Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently this function will never run inside the daemon Pod. The function is executed by the sriov-config and sriov-config-post-networking systemd services on boot. When running as a part of systemd service the functions have direct access to the host's tools and there is no need to use chroot

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I wrongly thought the systemd service was running the config-daemon via podman run ... .
thanks for explaining

if err != nil {
log.Log.Error(err, "WaitUdevEventsProcessed(): failed to wait for udev rules to process", "error", stderr, "timeout", timeout)
return err
}
return nil
}

func (u *udev) addUdevRule(pfPciAddress, ruleName, ruleContent string) error {
log.Log.V(2).Info("addUdevRule()", "device", pfPciAddress, "rule", ruleName)
rulePath := u.getRuleFolderPath()
Expand Down
10 changes: 10 additions & 0 deletions pkg/host/internal/udev/udev_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,14 @@ var _ = Describe("UDEV", func() {
Expect(s.LoadUdevRules()).To(MatchError(testError))
})
})
Context("WaitUdevEventsProcessed", func() {
It("Succeed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "10").Return("", "", nil)
Expect(s.WaitUdevEventsProcessed(10)).NotTo(HaveOccurred())
})
It("Command Failed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "20").Return("", "", testError)
Expect(s.WaitUdevEventsProcessed(20)).To(MatchError(testError))
})
})
})
14 changes: 14 additions & 0 deletions pkg/host/mock/mock_host.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkg/host/types/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ type UdevInterface interface {
RemoveVfRepresentorUdevRule(pfPciAddress string) error
// LoadUdevRules triggers udev rules for network subsystem
LoadUdevRules() error
// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
WaitUdevEventsProcessed(timeout int) error
}

type VdpaInterface interface {
Expand Down
Loading