diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index 6d7e7f6..b78f52b 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -65,14 +65,21 @@ func IsEtcdDisruptionAllowed(ctx context.Context, cl client.Client, log logr.Log } for _, pod := range podList.Items { if pod.Spec.NodeName == node.Name { + // Track if the pod has a ready condition with status True or Unknown + // Consider Unknown as being ready to be on the safe side and prevent disruption of nodes which are about to be ready + // Missing ready condition also means the pod is not ready + isPodReady := false for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionFalse { - log.Info("Node is already disrupted, thus disrupting it (again) won't violate the etcd quorum", "Node", node.Name, "Guard pod", pod.Name) - return true, nil + if condition.Type == corev1.PodReady && (condition.Status == corev1.ConditionTrue || condition.Status == corev1.ConditionUnknown) { + isPodReady = true } } - log.Info("Node is not disrupted, and disrupting it will violate the etcd quorum", "Node", node.Name, "Guard pod", pod.Name) - return false, nil + if isPodReady { + log.Info("Node is not disrupted, and disrupting it will violate the etcd quorum", "Node", node.Name, "Guard pod", pod.Name) + return false, nil + } + log.Info("Node is already disrupted, thus disrupting it (again) won't violate the etcd quorum", "Node", node.Name, "Guard pod", pod.Name) + return true, nil } } // Node is either already disrupted (and guard pod is missing) or it wasn't configured with etcd (to have a guard pod) diff --git a/pkg/etcd/etcd_test.go b/pkg/etcd/etcd_test.go index 1136016..c703a79 100644 --- a/pkg/etcd/etcd_test.go +++ b/pkg/etcd/etcd_test.go @@ -107,13 +107,20 @@ var _ = Describe("Check if etcd disruption is allowed", func() { DeferCleanup(fakeClient.Delete, context.Background(), unguardedNode) Expect(IsEtcdDisruptionAllowed(context.Background(), fakeClient, log, unguardedNode)).To(BeTrue()) }) - It("should allow disruption for already disrupted control plane node", func() { + It("should allow disruption for already disrupted control plane node (guard pod ready condition = false)", func() { podGuard := &corev1.Pod{} Expect(fakeClient.Get(context.Background(), client.ObjectKeyFromObject(getPodGuard(nodeGuardDown.Name)), podGuard)).To(Succeed()) podGuard.Status.Conditions[0].Status = corev1.ConditionFalse Expect(fakeClient.Status().Update(context.Background(), podGuard)).To(Succeed()) Expect(IsEtcdDisruptionAllowed(context.Background(), fakeClient, log, nodeGuardDown)).To(BeTrue()) }) + It("should allow disruption for already disrupted control plane node (guard pod ready condition missing)", func() { + podGuard := &corev1.Pod{} + Expect(fakeClient.Get(context.Background(), client.ObjectKeyFromObject(getPodGuard(nodeGuardDown.Name)), podGuard)).To(Succeed()) + podGuard.Status.Conditions = []corev1.PodCondition{} + Expect(fakeClient.Status().Update(context.Background(), podGuard)).To(Succeed()) + Expect(IsEtcdDisruptionAllowed(context.Background(), fakeClient, log, nodeGuardDown)).To(BeTrue()) + }) It("should reject disruptions for any non-disrupted control plane node", func() { for _, node := range controlPlaneNodes { if node.Name != nodeGuardDown.Name {