From 2e983e7c5d48a063ee892056588bbbd05bdafd31 Mon Sep 17 00:00:00 2001 From: Hyeongju Johannes Lee Date: Thu, 19 Sep 2024 07:58:49 -0700 Subject: [PATCH] qat,e2e: add heartbeat and auto-reset validations Signed-off-by: Hyeongju Johannes Lee --- test/e2e/qat/qatplugin_dpdk.go | 107 +++++++++++++++++++++++++++++++-- test/e2e/utils/utils.go | 16 +++++ 2 files changed, 119 insertions(+), 4 deletions(-) diff --git a/test/e2e/qat/qatplugin_dpdk.go b/test/e2e/qat/qatplugin_dpdk.go index 8d513041c..f0072608f 100644 --- a/test/e2e/qat/qatplugin_dpdk.go +++ b/test/e2e/qat/qatplugin_dpdk.go @@ -29,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/kubernetes/test/e2e/framework" e2edebug "k8s.io/kubernetes/test/e2e/framework/debug" + e2ejob "k8s.io/kubernetes/test/e2e/framework/job" e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" admissionapi "k8s.io/pod-security-admission/api" @@ -38,6 +39,8 @@ const ( qatPluginKustomizationYaml = "deployments/qat_plugin/overlays/e2e/kustomization.yaml" cryptoTestYaml = "deployments/qat_dpdk_app/crypto-perf/crypto-perf-dpdk-pod-requesting-qat-cy.yaml" compressTestYaml = "deployments/qat_dpdk_app/compress-perf/compress-perf-dpdk-pod-requesting-qat-dc.yaml" + cyResource = "qat.intel.com/cy" + dcResource = "qat.intel.com/dc" ) const ( @@ -77,6 +80,7 @@ func describeQatDpdkPlugin() { var dpPodName string var resourceName v1.ResourceName + var nodeName string ginkgo.JustBeforeEach(func(ctx context.Context) { ginkgo.By("deploying QAT plugin in DPDK mode") @@ -97,6 +101,18 @@ func describeQatDpdkPlugin() { framework.Failf("container filesystem info checks failed: %v", err) } + // when running [Functionality] e2e tests + if resourceName == "" { + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, cyResource, 100*time.Second, utils.WaitForPositiveResource); err != nil { + resourceName = dcResource + nodeName, _ = utils.FindNodeAndResourceCapacity(f, ctx, dcResource) + return 
+ } + resourceName = cyResource + nodeName, _ = utils.FindNodeAndResourceCapacity(f, ctx, cyResource) + return + } + ginkgo.By("checking if the resource is allocatable") if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second, utils.WaitForPositiveResource); err != nil { framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) @@ -111,14 +127,14 @@ func describeQatDpdkPlugin() { } }) - ginkgo.Context("When QAT resources are available with crypto (cy) services enabled [Resource:cy]", func() { + ginkgo.Context("When QAT resources are continuously available with crypto (cy) services enabled [Resource:cy]", func() { // This BeforeEach runs even before the JustBeforeEach above. ginkgo.BeforeEach(func() { ginkgo.By("creating a configMap before plugin gets deployed") e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "create", "configmap", "--from-literal", "qat.conf=ServicesEnabled=sym;asym", "qat-config") ginkgo.By("setting resourceName for cy services") - resourceName = "qat.intel.com/cy" + resourceName = cyResource }) ginkgo.It("deploys a crypto pod (openssl) requesting QAT resources [App:openssl]", func(ctx context.Context) { @@ -139,13 +155,13 @@ func describeQatDpdkPlugin() { }) }) - ginkgo.Context("When QAT resources are available with compress (dc) services enabled [Resource:dc]", func() { + ginkgo.Context("When QAT resources are continuously available with compress (dc) services enabled [Resource:dc]", func() { ginkgo.BeforeEach(func() { ginkgo.By("creating a configMap before plugin gets deployed") e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "create", "configmap", "--from-literal", "qat.conf=ServicesEnabled=dc", "qat-config") ginkgo.By("setting resourceName for dc services") - resourceName = "qat.intel.com/dc" + resourceName = dcResource }) ginkgo.It("deploys a compress pod (openssl) requesting QAT resources [App:openssl]", func(ctx context.Context) { @@ -165,6 +181,57 @@ func describeQatDpdkPlugin() { 
ginkgo.It("does nothing", func() {}) }) }) + + ginkgo.Context("When a QAT device goes unresponsive", func() { + ginkgo.When("QAT's auto-reset is off", func() { + ginkgo.BeforeEach(func() { + ginkgo.By("creating a configMap before plugin gets deployed") + e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "create", "configmap", "--from-literal", "qat.conf=AutoresetEnabled=off", "qat-config") + }) + + ginkgo.It("checks if the heartbeat is read correctly [Functionality:heartbeat]", func(ctx context.Context) { + injectError(ctx, f, nodeName) + + ginkgo.By("seeing if there is zero resource") + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 100*time.Second, utils.WaitForZeroResource); err != nil { + framework.Failf("unable to wait for nodes to have no resource: %v", err) + } + }) + }) + + ginkgo.When("QAT's autoreset is on", func() { + ginkgo.BeforeEach(func() { + ginkgo.By("creating a configMap before plugin gets deployed") + e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "create", "configmap", "--from-literal", "qat.conf=AutoresetEnabled=on", "qat-config") + }) + + ginkgo.It("checks if an injected error gets solved [Functionality:auto-reset]", func(ctx context.Context) { + injectError(ctx, f, nodeName) + + ginkgo.By("seeing if there is zero resource") + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 100*time.Second, utils.WaitForZeroResource); err != nil { + framework.Logf("unable to wait for nodes to have no resource: %v", err) + // This should not be Failf since there can be a case where auto-reset completed before the plugin noticed the error. + // It is still necessary to call utils.WaitForNodesWithResource with WaitForZeroResource here, + // because without this wait the following check using WaitForPositiveResource could run + // before the injected error has been recognized by the plugin. + // In other words, this wait ensures that the injected error does not remain after this It() spec. 
+ } + + ginkgo.By("seeing if there is positive allocatable resource") + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 300*time.Second, utils.WaitForPositiveResource); err != nil { + framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) + } + + ginkgo.By("checking if openssl pod runs successfully") + if resourceName == cyResource { + runCpaSampleCode(ctx, f, symmetric, resourceName) + } else { + runCpaSampleCode(ctx, f, compression, resourceName) + } + }) + }) + }) } func runCpaSampleCode(ctx context.Context, f *framework.Framework, runTests int, resourceName v1.ResourceName) { @@ -199,3 +266,35 @@ func runCpaSampleCode(ctx context.Context, f *framework.Framework, runTests int, err = e2epod.WaitForPodSuccessInNamespaceTimeout(ctx, f.ClientSet, pod.ObjectMeta.Name, f.Namespace.Name, 300*time.Second) gomega.Expect(err).To(gomega.BeNil(), utils.GetPodLogs(ctx, f, pod.ObjectMeta.Name, pod.Spec.Containers[0].Name)) } + +func injectError(ctx context.Context, f *framework.Framework, nodeName string) { + job := e2ejob.NewTestJobOnNode("success", "qat-inject-error", v1.RestartPolicyNever, 1, 1, nil, 0, nodeName) + + job.Spec.Template.Spec.Containers[0].Command = []string{ + "/bin/sh", + "-c", + "find /sys/kernel/debug/qat_*/heartbeat/ -name inject_error -exec sh -c 'echo 1 > {}' \\;", + } + job.Spec.Template.Spec.Containers[0].VolumeMounts = []v1.VolumeMount{{ + Name: "debugfs", + MountPath: "/sys/kernel/debug/", + }} + job.Spec.Template.Spec.Volumes = []v1.Volume{{ + Name: "debugfs", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/sys/kernel/debug/", + }, + }, + }} + yes := true + job.Spec.Template.Spec.Containers[0].SecurityContext = &v1.SecurityContext{ + Privileged: &yes, + } + + job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job) + framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name) + + err = 
e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, nil, 1) + framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name) +} diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index d33f5af53..aa33724a5 100644 --- a/test/e2e/utils/utils.go +++ b/test/e2e/utils/utils.go @@ -365,3 +365,19 @@ func Kubectl(ns string, cmd string, opt string, file string) { msg := e2ekubectl.RunKubectlOrDie(ns, cmd, opt, path) framework.Logf("%s", msg) } + +func FindNodeAndResourceCapacity(f *framework.Framework, ctx context.Context, resourceName string) (string, int64) { + nodelist, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + framework.Failf("failed to list Nodes: %v", err) + } + + // we have at least one node with resource capacity + for _, item := range nodelist.Items { + if q, ok := item.Status.Allocatable[v1.ResourceName(resourceName)]; ok && q.Value() > 0 { + return item.Name, q.Value() + } + } + + return "", 0 +}