From c732be27a0d3bf8cf22a923615ad088a6d4604fe Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Wed, 27 Nov 2024 12:39:09 +0100 Subject: [PATCH] Improve the reliability of the pod pending test case --- e2e/fixtures/fdb_cluster.go | 25 +++++++++++-------- e2e/test_operator/operator_test.go | 12 +++++---- .../operator_maintenance_mode_test.go | 2 +- .../operator_upgrades_test.go | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/e2e/fixtures/fdb_cluster.go b/e2e/fixtures/fdb_cluster.go index c2485301..f4956328 100644 --- a/e2e/fixtures/fdb_cluster.go +++ b/e2e/fixtures/fdb_cluster.go @@ -776,27 +776,22 @@ func (fdbCluster *FdbCluster) UpdateLogProcessCount(newLogProcessCount int) erro // SetPodAsUnschedulable sets the provided Pod on the NoSchedule list of the current FoundationDBCluster. This will make // sure that the Pod is stuck in Pending. -func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) error { +func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) { fdbCluster.SetProcessGroupsAsUnschedulable([]fdbv1beta2.ProcessGroupID{GetProcessGroupID(pod)}) - fetchedPod := &corev1.Pod{} - return wait.PollImmediate(2*time.Second, 5*time.Minute, func() (bool, error) { + gomega.Eventually(func(g gomega.Gomega) string { + fetchedPod := &corev1.Pod{} err := fdbCluster.getClient(). Get(context.Background(), client.ObjectKeyFromObject(&pod), fetchedPod) - if err != nil { - if kubeErrors.IsNotFound(err) { - return false, nil - } - return false, err - } + g.Expect(err).NotTo(gomega.HaveOccurred()) // Try deleting the Pod as a workaround until the operator handle all cases. 
if fetchedPod.Spec.NodeName != "" && fetchedPod.DeletionTimestamp.IsZero() { _ = fdbCluster.getClient().Delete(context.Background(), &pod) } - return fetchedPod.Spec.NodeName == "", nil - }) + return fetchedPod.Spec.NodeName + }).WithTimeout(5*time.Minute).WithPolling(2*time.Second).MustPassRepeatedly(5).Should(gomega.BeEmpty(), "Not able to set pod as unschedulable") } // SetProcessGroupsAsUnschedulable sets the provided process groups on the NoSchedule list of the current FoundationDBCluster. This will make @@ -1219,7 +1214,15 @@ func (fdbCluster *FdbCluster) CheckPodIsDeleted(podName string) bool { // EnsurePodIsDeletedWithCustomTimeout validates that a Pod is either not existing or is marked as deleted with a non-zero deletion timestamp. // It times out after timeoutMinutes. func (fdbCluster *FdbCluster) EnsurePodIsDeletedWithCustomTimeout(podName string, timeoutMinutes int) { + lastForceReconcile := time.Now() gomega.Eventually(func() bool { + // Force a reconciliation every minute to ensure the deletion will be done in a more timely manner (without + // the reconciliation getting delayed by the requeue mechanism). 
+ if time.Since(lastForceReconcile) > 1*time.Minute { + fdbCluster.ForceReconcile() + lastForceReconcile = time.Now() + } + return fdbCluster.CheckPodIsDeleted(podName) }).WithTimeout(time.Duration(timeoutMinutes) * time.Minute).WithPolling(1 * time.Second).Should(gomega.BeTrue()) } diff --git a/e2e/test_operator/operator_test.go b/e2e/test_operator/operator_test.go index 8f215a88..5210bf80 100644 --- a/e2e/test_operator/operator_test.go +++ b/e2e/test_operator/operator_test.go @@ -580,7 +580,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() { initialPods := fdbCluster.GetStatelessPods() failedPod = factory.RandomPickOnePod(initialPods.Items) log.Printf("Setting pod %s to unschedulable.", failedPod.Name) - Expect(fdbCluster.SetPodAsUnschedulable(failedPod)).NotTo(HaveOccurred()) + fdbCluster.SetPodAsUnschedulable(failedPod) fdbCluster.ReplacePod(failedPod, true) }) @@ -590,7 +590,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() { }) It("should remove the targeted Pod", func() { - fdbCluster.EnsurePodIsDeletedWithCustomTimeout(failedPod.Name, 10) + fdbCluster.EnsurePodIsDeletedWithCustomTimeout(failedPod.Name, 15) }) }) @@ -599,6 +599,8 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() { var podToReplace *corev1.Pod BeforeEach(func() { + // We bring down two pods, which could cause some recoveries. 
+ availabilityCheck = false failedPod = factory.ChooseRandomPod(fdbCluster.GetStatelessPods()) podToReplace = factory.ChooseRandomPod(fdbCluster.GetStatelessPods()) log.Println( @@ -607,17 +609,17 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() { ", Pod to replace:", podToReplace.Name, ) - Expect(fdbCluster.SetPodAsUnschedulable(*failedPod)).NotTo(HaveOccurred()) + fdbCluster.SetPodAsUnschedulable(*failedPod) fdbCluster.ReplacePod(*podToReplace, false) }) - + // Example output of the log statement above: 2024/11/26 01:41:00 Failed (unscheduled) Pod: operator-test-fdnhokqf-stateless-42898 , Pod to replace: operator-test-fdnhokqf-stateless-93723 AfterEach(func() { Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred()) Expect(fdbCluster.ClearProcessGroupsToRemove()).NotTo(HaveOccurred()) }) It("should remove the targeted Pod", func() { - fdbCluster.EnsurePodIsDeletedWithCustomTimeout(podToReplace.Name, 10) + fdbCluster.EnsurePodIsDeletedWithCustomTimeout(podToReplace.Name, 15) }) }) diff --git a/e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go b/e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go index effe8b60..85251e51 100644 --- a/e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go +++ b/e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go @@ -103,7 +103,7 @@ var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() { fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 240", faultDomain), false, 60) // Set this Pod as unschedulable to keep it pending. 
- Expect(fdbCluster.SetPodAsUnschedulable(failingStoragePod)).NotTo(HaveOccurred()) + fdbCluster.SetPodAsUnschedulable(failingStoragePod) }) AfterEach(func() { diff --git a/e2e/test_operator_upgrades/operator_upgrades_test.go b/e2e/test_operator_upgrades/operator_upgrades_test.go index e333d94a..937f3d53 100644 --- a/e2e/test_operator_upgrades/operator_upgrades_test.go +++ b/e2e/test_operator_upgrades/operator_upgrades_test.go @@ -595,7 +595,7 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() { clusterSetup(beforeVersion, true) pendingPod := factory.RandomPickOnePod(fdbCluster.GetPods().Items) // Set the pod in pending state. - Expect(fdbCluster.SetPodAsUnschedulable(pendingPod)).NotTo(HaveOccurred()) + fdbCluster.SetPodAsUnschedulable(pendingPod) fdbCluster.UpgradeAndVerify(targetVersion) }, EntryDescription("Upgrade from %[1]s to %[2]s with a pending pod"),