From 46ca63dd8d2da6165e9c4e6792561606c7f61def Mon Sep 17 00:00:00 2001
From: Thomas Jungblut
Date: Wed, 16 Oct 2024 10:02:06 +0200
Subject: [PATCH 1/2] NO-JIRA: fix exec issue on etcd dr suite

---
 test/extended/dr/common.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go
index d29ced817275..4592d69b1ed2 100644
--- a/test/extended/dr/common.go
+++ b/test/extended/dr/common.go
@@ -663,7 +663,7 @@ func removeMember(oc *exutil.CLI, memberID string) {
 		_, err = utils.PodRunningReady(&pod)
 		if err == nil {
 			framework.Logf("found running etcd pod to exec member removal: %s", pod.Name)
-			member, err := oc.AsAdmin().Run("exec").Args("-n", openshiftEtcdNamespace, pod.Name, "-c", "etcdctl", "--", "etcdctl", "member", "remove", memberID).Output()
+			member, err := oc.AsAdmin().Run("exec").Args("-n", openshiftEtcdNamespace, pod.Name, "--", "etcdctl", "member", "remove", memberID).Output()
 			o.Expect(err).NotTo(o.HaveOccurred())
 			o.Expect(member).To(o.ContainSubstring("removed from cluster"))
 			return
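The fix drops the explicit "-c etcdctl" container selector, so the command after "--" runs in the pod's default container. For reference, the corrected Args chain corresponds to the following CLI call; the pod name and member ID are illustrative placeholders:

    oc exec -n openshift-etcd etcd-master-0 -- etcdctl member remove 6f23bf5f8f2a4c6d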
From 2829d3c62dbada0edb11ca0c62c8cb1d36dbb696 Mon Sep 17 00:00:00 2001
From: Thomas Jungblut
Date: Tue, 8 Oct 2024 10:33:10 +0200
Subject: [PATCH 2/2] NO-JIRA: add simple quorum restore test

Signed-off-by: Thomas Jungblut
---
 test/extended/dr/common.go                    | 47 ++++++++++++-
 test/extended/dr/recovery.go                  | 68 +++++++++++++++++--
 .../generated/zz_generated.annotations.go     |  2 +
 3 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go
index 4592d69b1ed2..f72164c4fd54 100644
--- a/test/extended/dr/common.go
+++ b/test/extended/dr/common.go
@@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI
 	return runPod(oc, pod)
 }
 
+func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error {
+	const name = "quorum-repair-etcd-pod"
+	framework.Logf("running quorum restore script on node: %v", restoreNode.Name)
+
+	restoreScript := fmt.Sprintf(`
+	#!/bin/bash
+	set -exuo pipefail
+
+	# ssh key dance
+	%s
+
+	TARGET_NODE_NAME=%s
+	ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <<EOF
+	sudo -E /usr/local/bin/quorum-restore.sh
+EOF`, sshKeyDance, restoreNode.Name)

diff --git a/test/extended/dr/recovery.go b/test/extended/dr/recovery.go
--- a/test/extended/dr/recovery.go
+++ b/test/extended/dr/recovery.go
@@ ... @@
 	masters := masterNodes(oc)
-	o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+	o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 
 	// Pick one node to back up on
 	backupNode := masters[0]
@@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		o.Expect(err).ToNot(o.HaveOccurred())
 
 		masters := masterNodes(oc)
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 		backupNode := masters[0]
 		framework.Logf("Selecting node %q as the backup host", backupNode.Name)
 		recoveryNode := masters[1]
@@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		// we should come back with a single etcd static pod
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)
-
-		// TODO(thomas): since we're bumping resources, that should not be necessary anymore
-		// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
-		// o.Expect(err).ToNot(o.HaveOccurred())
-
 		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
 		// CEO will bring back the other etcd static pods again
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
@@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		waitForOperatorsToSettle()
 		assertPostBackupResourcesAreNotFound(oc)
 	})
 })
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": false}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure each test starts with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "setup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "setup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		err = InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})
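For context, the new test exercises OpenShift's quorum restore flow end to end. A rough sketch of the equivalent manual procedure, which the helper pod's ssh script automates, is below; the node name is illustrative, and the script path is an assumption based on what recent OpenShift control-plane nodes ship for this flow:

    # let the etcd operator act without quorum (quorum guard off)
    oc patch etcd cluster --type=merge \
      -p '{"spec":{"unsupportedConfigOverrides":{"useUnsupportedUnsafeNonHANonProductionUnstableEtcd":true}}}'

    # run the quorum restore script on the chosen surviving control-plane node
    ssh core@master-2 'sudo -E /usr/local/bin/quorum-restore.sh'

    # force new revisions so the remaining etcd members are recreated and rejoin
    oc patch etcd cluster --type=merge \
      -p '{"spec":{"forceRedeploymentReason":"quorum-restore-'"$(date +%s)"'"}}'

The test performs the same steps through Etcds().Patch(...), runQuorumRestoreScript, and forceOperandRedeployment, then waits until every control-plane node reports a ready etcd static pod.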
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index a7a4c6a72484..e8c8c19c2069 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -1137,6 +1137,8 @@ var Annotations = map[string]string{
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Disruptive] etcd is able to block the rollout of a revision when the quorum is not safe": " [Serial]",
 
+	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h] [Feature:EtcdRecovery][Disruptive] Recover with quorum restore": " [Serial]",
+
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:2h] [Feature:EtcdRecovery][Disruptive] Recover with snapshot with two unhealthy nodes and lost quorum": " [Serial]",
 
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:30m] [Feature:EtcdRecovery][Disruptive] Restore snapshot from node on another single unhealthy node": " [Serial]",
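After a run of the new test, the recovery can be sanity-checked from the CLI; pod names below are illustrative:

    # one Running etcd static pod per control-plane node
    oc -n openshift-etcd get pods -l app=etcd

    # membership and health as reported from inside a member pod
    oc -n openshift-etcd rsh etcd-master-0 etcdctl member list -w table
    oc -n openshift-etcd rsh etcd-master-0 etcdctl endpoint health --cluster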