diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go index d29ced817275..f72164c4fd54 100644 --- a/test/extended/dr/common.go +++ b/test/extended/dr/common.go @@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI return runPod(oc, pod) } +func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error { + const name = "quorum-repair-etcd-pod" + framework.Logf("running quorum restore script on node: %v", restoreNode.Name) + + restoreScript := fmt.Sprintf(` + #!/bin/bash + set -exuo pipefail + + # ssh key dance + %s + + TARGET_NODE_NAME=%s + ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <=", 2)) + o.Expect(len(masters)).To(o.BeNumerically(">=", 3)) // Pick one node to back up on backupNode := masters[0] @@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re o.Expect(err).ToNot(o.HaveOccurred()) masters := masterNodes(oc) - o.Expect(len(masters)).To(o.BeNumerically(">=", 2)) + o.Expect(len(masters)).To(o.BeNumerically(">=", 3)) backupNode := masters[0] framework.Logf("Selecting node %q as the backup host", backupNode.Name) recoveryNode := masters[1] @@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re // we should come back with a single etcd static pod waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1) - - // TODO(thomas): since we're bumping resources, that should not be necessary anymore - // err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes) - // o.Expect(err).ToNot(o.HaveOccurred()) - forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1()) // CEO will bring back the other etcd static pods again waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters)) @@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re assertPostBackupResourcesAreNotFound(oc) }) }) + +var _ = 
g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	// Disruptive recovery suite: runs the quorum restore script on one control
+	// plane node and then waits for etcd static pods and operators to come back.
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	// setUnsafeEtcdOverride merge-patches the "cluster" Etcd operator resource so that
+	// unsupportedConfigOverrides.useUnsupportedUnsafeNonHANonProductionUnstableEtcd equals enabled.
+	// It fails the current spec if the patch is rejected.
+	setUnsafeEtcdOverride := func(enabled bool) {
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": %t}}}`, enabled)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+	}
+
+	// waitForStableRevisions waits first for the API servers and then for etcd to
+	// stabilize on the same revision. phase ("setup" or "cleanup") prefixes the
+	// wrapped error so a failure report says which stage of the spec timed out.
+	waitForStableRevisions := func(phase string) {
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err := waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, phase+" timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, phase+" timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	}
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		setUnsafeEtcdOverride(false)
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		waitForStableRevisions("cleanup")
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		setUnsafeEtcdOverride(true)
+
+		// we need to ensure each test starts with a stable revision for api and etcd;
+		// failures here are reported as "setup", not "cleanup"
+		waitForStableRevisions("setup")
+
+		err := InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// the restore runs on the third node returned by masterNodes, so at
+		// least three control plane nodes are required before indexing
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go index 50902728bf3d..fbbe6fd6eae8 100644 --- a/test/extended/util/annotate/generated/zz_generated.annotations.go +++ b/test/extended/util/annotate/generated/zz_generated.annotations.go @@ -1137,6 +1137,8 @@ var Annotations = map[string]string{ "[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Disruptive] etcd is able to block the rollout of a revision when the quorum is not safe": " [Serial]", + "[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h] [Feature:EtcdRecovery][Disruptive] Recover with quorum restore": " [Serial]", + "[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:2h] [Feature:EtcdRecovery][Disruptive] Recover with snapshot with two unhealthy nodes and lost quorum": " [Serial]", 
"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:30m] [Feature:EtcdRecovery][Disruptive] Restore snapshot from node on another single unhealthy node": " [Serial]",