Skip to content

Commit

Permalink
Merge pull request #29169 from tjungblu/quorum_restore_test
Browse files Browse the repository at this point in the history
ETCD-657: add simple quorum restore test
  • Loading branch information
openshift-merge-bot[bot] authored Nov 15, 2024
2 parents 92addf5 + 2829d3c commit f355db7
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 7 deletions.
45 changes: 45 additions & 0 deletions test/extended/dr/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI
return runPod(oc, pod)
}

// runQuorumRestoreScript launches a privileged, host-networked pod in
// openshiftEtcdNamespace that is pinned (via the hostname label) to restoreNode.
// The pod mounts the "dr-ssh" secret at sshPath and runs a bash script which
// connects over ssh to the address returned by internalIP(restoreNode), invokes
// /usr/local/bin/quorum-restore.sh there and finally restarts the kubelet.
//
// The function only submits the pod through runPod and returns its error; it
// does not wait for the pod to complete.
func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error {
	const (
		podName       = "quorum-repair-etcd-pod"
		hostnameLabel = "kubernetes.io/hostname"
		keysVolume    = "keys"
	)
	framework.Logf("running quorum restore script on node: %v", restoreNode.Name)

	// The raw string is passed verbatim to "bash -c"; the heredoc terminator
	// must stay at the start of its line.
	script := fmt.Sprintf(`
#!/bin/bash
set -exuo pipefail
# ssh key dance
%s
TARGET_NODE_NAME=%s
ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <<EOF
sudo /usr/local/bin/quorum-restore.sh
# this will cause the pod to disappear effectively, must be the last statement
sudo systemctl restart kubelet.service
EOF`, sshKeyDance, internalIP(restoreNode))

	// Single privileged container that executes the script with the ssh keys mounted.
	restoreContainer := applycorev1.Container().
		WithName("cluster-restore").
		WithSecurityContext(applycorev1.SecurityContext().WithPrivileged(true)).
		WithImage(image.ShellImage()).
		WithVolumeMounts(applycorev1.VolumeMount().WithName(keysVolume).WithMountPath(sshPath)).
		WithCommand("/bin/bash", "-c", script)

	// Allow scheduling onto control plane nodes carrying the master NoSchedule taint.
	masterToleration := applycorev1.Toleration().
		WithKey("node-role.kubernetes.io/master").
		WithOperator(corev1.TolerationOpExists).
		WithEffect(corev1.TaintEffectNoSchedule)

	sshKeys := applycorev1.Volume().
		WithName(keysVolume).
		WithSecret(applycorev1.SecretVolumeSource().WithSecretName("dr-ssh"))

	spec := applycorev1.PodSpec().
		WithHostNetwork(true).
		WithRestartPolicy(corev1.RestartPolicyOnFailure).
		WithContainers(restoreContainer).
		WithNodeSelector(map[string]string{hostnameLabel: restoreNode.Labels[hostnameLabel]}).
		WithTolerations(masterToleration).
		WithVolumes(sshKeys)

	// we only run the pod and not wait for it, as it will not be tracked after the control plane comes back
	return runPod(oc, applycorev1.Pod(podName, openshiftEtcdNamespace).WithSpec(spec))
}

func runPodAndWaitForSuccess(oc *exutil.CLI, pod *applycorev1.PodApplyConfiguration) error {
err := runPod(oc, pod)
if err != nil {
Expand Down
68 changes: 61 additions & 7 deletions test/extended/dr/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re

masters := masterNodes(oc)
// Need one node to back up from and another to restore to
o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
o.Expect(len(masters)).To(o.BeNumerically(">=", 3))

// Pick one node to back up on
backupNode := masters[0]
Expand Down Expand Up @@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
o.Expect(err).ToNot(o.HaveOccurred())

masters := masterNodes(oc)
o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
backupNode := masters[0]
framework.Logf("Selecting node %q as the backup host", backupNode.Name)
recoveryNode := masters[1]
Expand All @@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re

// we should come back with a single etcd static pod
waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)

// TODO(thomas): since we're bumping resources, that should not be necessary anymore
// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
// o.Expect(err).ToNot(o.HaveOccurred())

forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
// CEO will bring back the other etcd static pods again
waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
Expand All @@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
assertPostBackupResourcesAreNotFound(oc)
})
})

// Disaster-recovery suite covering the quorum-restore path: the test sets the
// unsupported non-HA etcd override, runs the quorum restore script on one
// control plane node and then waits for the etcd static pods and operators to
// come back. AfterEach reverts the override and waits for stable revisions.
var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
	defer g.GinkgoRecover()

	f := framework.NewDefaultFramework("recovery")
	f.SkipNamespaceCreation = true
	oc := exutil.NewCLIWithoutNamespace("recovery")

	// setUnsupportedNonHAEtcd merge-patches the "cluster" Etcd operator
	// resource, setting the useUnsupportedUnsafeNonHANonProductionUnstableEtcd
	// override to the given value, and returns the patch error (if any).
	setUnsupportedNonHAEtcd := func(enabled bool) error {
		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": %t}}}`, enabled)
		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
		return err
	}

	// waitForStableRevisions waits first for the API servers and then for etcd
	// to stabilize on the same revision. phase ("setup" or "cleanup") prefixes
	// the wrapped error so a failure names the stage that actually timed out.
	// errors.Wrap yields nil for a nil error, so success returns nil.
	waitForStableRevisions := func(phase string) error {
		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
		if err := waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc); err != nil {
			return errors.Wrap(err, phase+" timed out waiting for APIServer pods to stabilize on the same revision")
		}

		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
		err := waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
		return errors.Wrap(err, phase+" timed out waiting for etcd pods to stabilize on the same revision")
	}

	g.AfterEach(func() {
		g.GinkgoT().Log("turning the quorum guard back on")
		err := setUnsupportedNonHAEtcd(false)
		o.Expect(err).ToNot(o.HaveOccurred())

		// we need to ensure this test also ends with a stable revision for api and etcd
		err = waitForStableRevisions("cleanup")
		o.Expect(err).ToNot(o.HaveOccurred())
	})

	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
		err := setUnsupportedNonHAEtcd(true)
		o.Expect(err).ToNot(o.HaveOccurred())

		// we need to ensure each test starts with a stable revision for api and etcd
		err = waitForStableRevisions("setup")
		o.Expect(err).ToNot(o.HaveOccurred())

		err = InstallSSHKeyOnControlPlaneNodes(oc)
		o.Expect(err).ToNot(o.HaveOccurred())

		// The restore runs on the third control plane node, so at least three are required.
		masters := masterNodes(oc)
		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
		recoveryNode := masters[2]

		err = runQuorumRestoreScript(oc, recoveryNode)
		o.Expect(err).ToNot(o.HaveOccurred())

		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
		// CEO will bring back the other etcd static pods again
		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
		waitForOperatorsToSettle()
	})
})

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f355db7

Please sign in to comment.