Skip to content

Commit

Permalink
Improve the reliability of the pod pending test case (#2176)
Browse files Browse the repository at this point in the history
  • Loading branch information
johscheuer authored Nov 29, 2024
1 parent 0ef7e5e commit 95f0e12
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 18 deletions.
25 changes: 14 additions & 11 deletions e2e/fixtures/fdb_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -776,27 +776,22 @@ func (fdbCluster *FdbCluster) UpdateLogProcessCount(newLogProcessCount int) erro

// SetPodAsUnschedulable sets the provided Pod on the NoSchedule list of the current FoundationDBCluster. This will make
// sure that the Pod is stuck in Pending.
func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) error {
func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) {
fdbCluster.SetProcessGroupsAsUnschedulable([]fdbv1beta2.ProcessGroupID{GetProcessGroupID(pod)})

fetchedPod := &corev1.Pod{}
return wait.PollImmediate(2*time.Second, 5*time.Minute, func() (bool, error) {
gomega.Eventually(func(g gomega.Gomega) string {
fetchedPod := &corev1.Pod{}
err := fdbCluster.getClient().
Get(context.Background(), client.ObjectKeyFromObject(&pod), fetchedPod)
if err != nil {
if kubeErrors.IsNotFound(err) {
return false, nil
}
return false, err
}
g.Expect(err).NotTo(gomega.HaveOccurred())

// Try deleting the Pod as a workaround until the operator handles all cases.
if fetchedPod.Spec.NodeName != "" && fetchedPod.DeletionTimestamp.IsZero() {
_ = fdbCluster.getClient().Delete(context.Background(), &pod)
}

return fetchedPod.Spec.NodeName == "", nil
})
return fetchedPod.Spec.NodeName
}).WithTimeout(5*time.Minute).WithPolling(2*time.Second).MustPassRepeatedly(5).Should(gomega.BeEmpty(), "Not able to set pod as unschedulable")
}

// SetProcessGroupsAsUnschedulable sets the provided process groups on the NoSchedule list of the current FoundationDBCluster. This will make
Expand Down Expand Up @@ -1219,7 +1214,15 @@ func (fdbCluster *FdbCluster) CheckPodIsDeleted(podName string) bool {
// EnsurePodIsDeletedWithCustomTimeout asserts that the Pod with the provided name eventually either no longer exists
// or is marked as deleted with a non-zero deletion timestamp. The check is polled once per second and the assertion
// fails if it has not succeeded after timeoutMinutes minutes.
func (fdbCluster *FdbCluster) EnsurePodIsDeletedWithCustomTimeout(podName string, timeoutMinutes int) {
	timeout := time.Duration(timeoutMinutes) * time.Minute
	lastReconcile := time.Now()

	podIsDeleted := func() bool {
		// Once more than a minute has passed since the previous forced reconciliation, trigger another one so
		// that the deletion is not held back by the requeue mechanism.
		if time.Since(lastReconcile) > time.Minute {
			fdbCluster.ForceReconcile()
			lastReconcile = time.Now()
		}

		return fdbCluster.CheckPodIsDeleted(podName)
	}

	gomega.Eventually(podIsDeleted).WithTimeout(timeout).WithPolling(time.Second).Should(gomega.BeTrue())
}
Expand Down
12 changes: 7 additions & 5 deletions e2e/test_operator/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
initialPods := fdbCluster.GetStatelessPods()
failedPod = factory.RandomPickOnePod(initialPods.Items)
log.Printf("Setting pod %s to unschedulable.", failedPod.Name)
Expect(fdbCluster.SetPodAsUnschedulable(failedPod)).NotTo(HaveOccurred())
fdbCluster.SetPodAsUnschedulable(failedPod)
fdbCluster.ReplacePod(failedPod, true)
})

Expand All @@ -590,7 +590,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
})

It("should remove the targeted Pod", func() {
fdbCluster.EnsurePodIsDeletedWithCustomTimeout(failedPod.Name, 10)
fdbCluster.EnsurePodIsDeletedWithCustomTimeout(failedPod.Name, 15)
})
})

Expand All @@ -599,6 +599,8 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
var podToReplace *corev1.Pod

BeforeEach(func() {
// We bring down two pods, which could cause some recoveries.
availabilityCheck = false
failedPod = factory.ChooseRandomPod(fdbCluster.GetStatelessPods())
podToReplace = factory.ChooseRandomPod(fdbCluster.GetStatelessPods())
log.Println(
Expand All @@ -607,17 +609,17 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
", Pod to replace:",
podToReplace.Name,
)
Expect(fdbCluster.SetPodAsUnschedulable(*failedPod)).NotTo(HaveOccurred())
fdbCluster.SetPodAsUnschedulable(*failedPod)
fdbCluster.ReplacePod(*podToReplace, false)
})

// 2024/11/26 01:41:00 Failed (unscheduled) Pod: operator-test-fdnhokqf-stateless-42898 , Pod to replace: operator-test-fdnhokqf-stateless-93723
AfterEach(func() {
Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred())
Expect(fdbCluster.ClearProcessGroupsToRemove()).NotTo(HaveOccurred())
})

It("should remove the targeted Pod", func() {
fdbCluster.EnsurePodIsDeletedWithCustomTimeout(podToReplace.Name, 10)
fdbCluster.EnsurePodIsDeletedWithCustomTimeout(podToReplace.Name, 15)
})
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 240", faultDomain), false, 60)

// Set this Pod as unschedulable to keep it pending.
Expect(fdbCluster.SetPodAsUnschedulable(failingStoragePod)).NotTo(HaveOccurred())
fdbCluster.SetPodAsUnschedulable(failingStoragePod)
})

AfterEach(func() {
Expand Down
2 changes: 1 addition & 1 deletion e2e/test_operator_upgrades/operator_upgrades_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() {
clusterSetup(beforeVersion, true)
pendingPod := factory.RandomPickOnePod(fdbCluster.GetPods().Items)
// Set the pod in pending state.
Expect(fdbCluster.SetPodAsUnschedulable(pendingPod)).NotTo(HaveOccurred())
fdbCluster.SetPodAsUnschedulable(pendingPod)
fdbCluster.UpgradeAndVerify(targetVersion)
},
EntryDescription("Upgrade from %[1]s to %[2]s with a pending pod"),
Expand Down

0 comments on commit 95f0e12

Please sign in to comment.