Skip to content

Commit

Permalink
Merge branch 'k8snetworkplumbingwg:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
nvidia-ci-cd authored Oct 10, 2024
2 parents a35ffed + a85ab70 commit cc01d2c
Show file tree
Hide file tree
Showing 12 changed files with 211 additions and 90 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ jobs:
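# Required: the golangci-lint version to use. The action's guidance is to give it without the patch component, so that the latest patch release is picked up.
# NOTE(review): the value below pins a full patch version — confirm this is intended.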
version: v1.55.2

# CI job that checks out the repository and runs the ShellCheck action on it.
shellcheck:
name: Shellcheck
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run ShellCheck
# NOTE(review): the action is referenced by the moving "master" ref rather than a pinned tag or SHA.
uses: ludeeus/action-shellcheck@master
with:
# Presumably limits reported findings to "error" severity — confirm against the action's documentation.
severity: error

test-coverage:
name: test-coverage
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion bindata/manifests/webhook/002-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ rules:
- apiGroups:
- ""
resources:
- configmap
- configmaps
verbs:
- 'watch'
- 'list'
Expand Down
2 changes: 1 addition & 1 deletion hack/deploy-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ load_manifest() {
fi
files="service_account.yaml role.yaml role_binding.yaml clusterrole.yaml clusterrolebinding.yaml configmap.yaml sriovoperatorconfig.yaml operator.yaml"
for m in ${files}; do
if [ "$(echo ${EXCLUSIONS[@]} | grep -o ${m} | wc -w | xargs)" == "0" ] ; then
if [ "$(echo "${EXCLUSIONS[@]}" | grep -o ${m} | wc -w | xargs)" == "0" ] ; then
envsubst< ${m} | ${OPERATOR_EXEC} apply ${namespace:-} --validate=false -f -
fi
done
Expand Down
2 changes: 2 additions & 0 deletions hack/env.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

if [ -z $SKIP_VAR_SET ]; then
export SRIOV_CNI_IMAGE=${SRIOV_CNI_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-cni}
export SRIOV_INFINIBAND_CNI_IMAGE=${SRIOV_INFINIBAND_CNI_IMAGE:-ghcr.io/k8snetworkplumbingwg/ib-sriov-cni}
Expand Down
6 changes: 3 additions & 3 deletions hack/run-e2e-test-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export SRIOV_NETWORK_OPERATOR_IMAGE="${SRIOV_NETWORK_OPERATOR_IMAGE:-sriov-netwo
export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE:-origin-sriov-network-config-daemon:e2e-test}"
export KUBECONFIG="${KUBECONFIG:-${HOME}/.kube/config}"
INTERFACES_SWITCHER="${INTERFACES_SWITCHER:-"test-suite"}"
SUPPORTED_INTERFACE_SWTICHER_MODES=("test-suite", "system-service")
SUPPORTED_INTERFACE_SWITCHER_MODES=("test-suite", "system-service")
RETRY_MAX=10
INTERVAL=10
TIMEOUT=300
Expand All @@ -16,9 +16,9 @@ while test $# -gt 0; do
case "$1" in
--device-netns-switcher)
INTERFACES_SWITCHER="$2"
if [[ ! "${SUPPORTED_INTERFACE_SWTICHER_MODES[@]}" =~ "${INTERFACES_SWITCHER}" ]]; then
if [[ ! "${SUPPORTED_INTERFACE_SWITCHER_MODES[*]}" =~ "${INTERFACES_SWITCHER}" ]]; then
echo "Error: unsupported interface switching mode: ${INTERFACES_SWITCHER}!"
echo "Supported modes are: ${SUPPORTED_INTERFACE_SWTICHER_MODES[@]}"
echo "Supported modes are: ${SUPPORTED_INTERFACE_SWITCHER_MODES[*]}"
exit 1
fi
shift
Expand Down
23 changes: 12 additions & 11 deletions hack/vf-netns-switcher.sh
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ It must be of the form <netns>:<pf1>,<pf2>. This flag can be repeated to specify
done

return_interfaces_to_default_namespace(){
for netns in ${netnses[@]};do
for netns in "${netnses[@]}";do
for pf in ${pfs[$netns]};do
return_interface_to_default_namespace "${netns}" "${pf}"
done
Expand Down Expand Up @@ -277,19 +277,20 @@ switch_interface_vf_representors(){
return 0
fi

for interface in $(ls /sys/class/net);do
phys_switch_id=$(cat /sys/class/net/$interface/phys_switch_id)
for interface in /sys/class/net/*;do
phys_switch_id=$(cat $interface/phys_switch_id)
if [[ "$phys_switch_id" != "${pf_switch_ids[$pf_name]}" ]]; then
continue
fi
phys_port_name=$(cat /sys/class/net/$interface/phys_port_name)
phys_port_name=$(cat $interface/phys_port_name)
phys_port_name_pf_index=${phys_port_name%vf*}
phys_port_name_pf_index=${phys_port_name_pf_index#pf}
if [[ "$phys_port_name_pf_index" != "${pf_port_names[$pf_name]:1}" ]]; then
continue
fi
echo "Switching VF representor $interface of PF $pf_name to netns $worker_netns"
switch_vf $interface $worker_netns
interface_name=${interface##*/}
echo "Switching VF representor $interface_name of PF $pf_name to netns $worker_netns"
switch_vf $interface_name $worker_netns
done
}

Expand Down Expand Up @@ -348,7 +349,7 @@ variables_check(){
check_empty_var(){
local var_name="$1"

if [[ -z "${!var_name[@]}" ]];then
if [[ -z "${!var_name[*]}" ]];then
echo "Error: $var_name is empty..."
return 1
fi
Expand All @@ -360,7 +361,7 @@ main(){
trap return_interfaces_to_default_namespace INT EXIT TERM

while true;do
for netns in ${netnses[@]};do
for netns in "${netnses[@]}";do
switch_pfs "$netns" "${pfs[$netns]}"
sleep 2
switch_netns_vfs "$netns"
Expand Down Expand Up @@ -388,7 +389,7 @@ if [[ "$status" != "0" ]];then
exit $status
fi

for netns in ${netnses[@]};do
for netns in "${netnses[@]}";do
netns_create "$netns"
let status=$status+$?
if [[ "$status" != "0" ]];then
Expand All @@ -397,13 +398,13 @@ for netns in ${netnses[@]};do
fi
done

for netns in ${netnses[@]};do
for netns in "${netnses[@]}";do
get_pcis_from_pfs "$netns" "${pfs[$netns]}"
get_pf_switch_dev_info "$netns" "${pfs[$netns]}"
done

if [[ "${#pcis[@]}" == "0" ]];then
echo "Error: could not get pci addresses of interfaces ${pfs[@]}!!"
echo "Error: could not get pci addresses of interfaces ${pfs[*]}!!"
exit 1
fi

Expand Down
53 changes: 27 additions & 26 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func New(
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
mu: &sync.Mutex{},
}
}

Expand Down Expand Up @@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

var timeout int64 = 5
var metadataKey = "metadata.name"
dn.mu = &sync.Mutex{}
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
time.Second*15,
vars.Namespace,
Expand Down Expand Up @@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
defer dn.mu.Unlock()
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

var podToDelete string
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=" + vars.NodeName,
Expand All @@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return nil
}
podToDelete = pods.Items[0].Name

log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
return nil
}
if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
for _, pod := range pods.Items {
podToDelete := pod.Name
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
continue
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}

return nil
Expand Down
61 changes: 47 additions & 14 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
)

// SriovDevicePluginPod is the device-plugin pod fixture. It is declared at
// package level so that it can be assigned in one closure of the suite and
// read (e.g. via DeepCopy) from another.
var SriovDevicePluginPod corev1.Pod

func TestConfigDaemon(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Config Daemon Suite")
Expand Down Expand Up @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
},
}

SriovDevicePluginPod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}

err = sriovnetworkv1.AddToScheme(scheme.Scheme)
Expect(err).ToNot(HaveOccurred())
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
Expand All @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
Namespace: vars.Namespace,
}}).Build()

kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
snclient := snclientset.NewSimpleClientset()
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() {
err := sut.Run(stopCh, exitCh)
Expect(err).ToNot(HaveOccurred())
}()

// Build the device-plugin pod fixture: named "sriov-device-plugin-xxxx",
// labelled app=sriov-device-plugin, in vars.Namespace, with its node name set
// to "test-node".
SriovDevicePluginPod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}
// Create the fixture pod through sut.kubeClient and require that creation succeeds.
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

})

AfterEach(func() {
Expand Down Expand Up @@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() {

Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
})

// Spec: after restartDevicePluginPod returns, no pod matching the
// device-plugin label selector on test-node may remain, even when several
// such pods exist.
It("restart all the sriov-device-plugin pods present on the node", func() {
// Add two extra pods cloned from the SriovDevicePluginPod fixture, differing only in name.
otherPod1 := SriovDevicePluginPod.DeepCopy()
otherPod1.Name = "sriov-device-plugin-xxxa"
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

otherPod2 := SriovDevicePluginPod.DeepCopy()
otherPod2.Name = "sriov-device-plugin-xxxz"
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

// Exercise the code under test; it must not return an error.
err = sut.restartDevicePluginPod()
Expect(err).ToNot(HaveOccurred())

// Poll (Eventually, "1s" timeout) the number of pods matching the
// device-plugin label and node-name selectors until it is zero.
Eventually(func() (int, error) {
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=test-node",
})

// A List error is returned to Eventually instead of being swallowed.
if err != nil {
return 0, err
}

return len(podList.Items), nil
}, "1s").Should(BeZero())
})
})
})

Expand Down
Loading

0 comments on commit cc01d2c

Please sign in to comment.