From 46ca63dd8d2da6165e9c4e6792561606c7f61def Mon Sep 17 00:00:00 2001
From: Thomas Jungblut
Date: Wed, 16 Oct 2024 10:02:06 +0200
Subject: [PATCH 1/2] NO-JIRA: fix exec issue on etcd dr suite

---
 test/extended/dr/common.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go
index d29ced817275..4592d69b1ed2 100644
--- a/test/extended/dr/common.go
+++ b/test/extended/dr/common.go
@@ -663,7 +663,7 @@ func removeMember(oc *exutil.CLI, memberID string) {
 		_, err = utils.PodRunningReady(&pod)
 		if err == nil {
 			framework.Logf("found running etcd pod to exec member removal: %s", pod.Name)
-			member, err := oc.AsAdmin().Run("exec").Args("-n", openshiftEtcdNamespace, pod.Name, "-c", "etcdctl", "--", "etcdctl", "member", "remove", memberID).Output()
+			member, err := oc.AsAdmin().Run("exec").Args("-n", openshiftEtcdNamespace, pod.Name, "--", "etcdctl", "member", "remove", memberID).Output()
 			o.Expect(err).NotTo(o.HaveOccurred())
 			o.Expect(member).To(o.ContainSubstring("removed from cluster"))
 			return
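The fix drops the explicit "-c etcdctl" container selector, so the command after "--" runs in the pod's default container. For reference, the corrected Args chain corresponds to the following CLI call; the pod name and member ID are illustrative placeholders:

    oc exec -n openshift-etcd etcd-master-0 -- etcdctl member remove 6f23bf5f8f2a4c6d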
From 2829d3c62dbada0edb11ca0c62c8cb1d36dbb696 Mon Sep 17 00:00:00 2001
From: Thomas Jungblut
Date: Tue, 8 Oct 2024 10:33:10 +0200
Subject: [PATCH 2/2] NO-JIRA: add simple quorum restore test

Signed-off-by: Thomas Jungblut
---
 test/extended/dr/common.go                    | 47 ++++++++++++-
 test/extended/dr/recovery.go                  | 68 +++++++++++++++++--
 .../generated/zz_generated.annotations.go     |  2 +
 3 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go
index 4592d69b1ed2..f72164c4fd54 100644
--- a/test/extended/dr/common.go
+++ b/test/extended/dr/common.go
@@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI
 	return runPod(oc, pod)
 }
 
+func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error {
+	const name = "quorum-repair-etcd-pod"
+	framework.Logf("running quorum restore script on node: %v", restoreNode.Name)
+
+	restoreScript := fmt.Sprintf(`
+	#!/bin/bash
+	set -exuo pipefail
+
+	# ssh key dance
+	%s
+
+	TARGET_NODE_NAME=%s
+	ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <<EOF
+	sudo -E /usr/local/bin/quorum-restore.sh
+EOF`, sshKeyDance, restoreNode.Name)

diff --git a/test/extended/dr/recovery.go b/test/extended/dr/recovery.go
--- a/test/extended/dr/recovery.go
+++ b/test/extended/dr/recovery.go
@@ ... @@
 	masters := masterNodes(oc)
-	o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+	o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 
 	// Pick one node to back up on
 	backupNode := masters[0]
@@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		o.Expect(err).ToNot(o.HaveOccurred())
 
 		masters := masterNodes(oc)
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 		backupNode := masters[0]
 		framework.Logf("Selecting node %q as the backup host", backupNode.Name)
 		recoveryNode := masters[1]
@@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		// we should come back with a single etcd static pod
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)
-
-		// TODO(thomas): since we're bumping resources, that should not be necessary anymore
-		// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
-		// o.Expect(err).ToNot(o.HaveOccurred())
-
 		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
 		// CEO will bring back the other etcd static pods again
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
@@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		waitForOperatorsToSettle()
 		assertPostBackupResourcesAreNotFound(oc)
 	})
 })
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": false}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure each test starts with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "setup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "setup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		err = InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})
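For context, the new test exercises OpenShift's quorum restore flow end to end. A rough sketch of the equivalent manual procedure, which the helper pod's ssh script automates, is below; the node name is illustrative, and the script path is an assumption based on what recent OpenShift control-plane nodes ship for this flow:

    # let the etcd operator act without quorum (quorum guard off)
    oc patch etcd cluster --type=merge \
      -p '{"spec":{"unsupportedConfigOverrides":{"useUnsupportedUnsafeNonHANonProductionUnstableEtcd":true}}}'

    # run the quorum restore script on the chosen surviving control-plane node
    ssh core@master-2 'sudo -E /usr/local/bin/quorum-restore.sh'

    # force new revisions so the remaining etcd members are recreated and rejoin
    oc patch etcd cluster --type=merge \
      -p '{"spec":{"forceRedeploymentReason":"quorum-restore-'"$(date +%s)"'"}}'

The test performs the same steps through Etcds().Patch(...), runQuorumRestoreScript, and forceOperandRedeployment, then waits until every control-plane node reports a ready etcd static pod.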
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index a7a4c6a72484..e8c8c19c2069 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -1137,6 +1137,8 @@ var Annotations = map[string]string{
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Disruptive] etcd is able to block the rollout of a revision when the quorum is not safe": " [Serial]",
 
+	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h] [Feature:EtcdRecovery][Disruptive] Recover with quorum restore": " [Serial]",
+
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:2h] [Feature:EtcdRecovery][Disruptive] Recover with snapshot with two unhealthy nodes and lost quorum": " [Serial]",
 
 	"[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:30m] [Feature:EtcdRecovery][Disruptive] Restore snapshot from node on another single unhealthy node": " [Serial]",
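After a run of the new test, the recovery can be sanity-checked from the CLI; pod names below are illustrative:

    # one Running etcd static pod per control-plane node
    oc -n openshift-etcd get pods -l app=etcd

    # membership and health as reported from inside a member pod
    oc -n openshift-etcd rsh etcd-master-0 etcdctl member list -w table
    oc -n openshift-etcd rsh etcd-master-0 etcdctl endpoint health --cluster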