Skip to content

Commit

Permalink
Issue 493 Allow operator to recover from FailedUpgrade (#496)
Browse files Browse the repository at this point in the history
* If a cluster upgrade ends in an UpgradeFailed state (because a node took longer than the timeout to rejoin the cluster), the operator cannot recover even if the cluster is healthy. This change adds steps to recover once the cluster is completely upgraded (potentially after manual work).

Signed-off-by: Frank Vissing <[email protected]>

* go fmt

Signed-off-by: Frank Vissing <[email protected]>

* add testcase

Signed-off-by: Frank Vissing <[email protected]>

---------

Signed-off-by: Frank Vissing <[email protected]>
Co-authored-by: anishakj <[email protected]>
  • Loading branch information
lunarfs and anishakj authored Dec 12, 2023
1 parent 88b9307 commit 074d8b0
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
27 changes: 27 additions & 0 deletions controllers/zookeepercluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,33 @@ func compareResourceVersion(zk *zookeeperv1beta1.ZookeeperCluster, sts *appsv1.S
func (r *ZookeeperClusterReconciler) reconcileStatefulSet(instance *zookeeperv1beta1.ZookeeperCluster) (err error) {

// we cannot upgrade if cluster is in UpgradeFailed
if instance.Status.IsClusterInUpgradeFailedState() {
sts := zk.MakeStatefulSet(instance)
if err = controllerutil.SetControllerReference(instance, sts, r.Scheme); err != nil {
return err
}
foundSts := &appsv1.StatefulSet{}
err = r.Client.Get(context.TODO(), types.NamespacedName{
Name: sts.Name,
Namespace: sts.Namespace,
}, foundSts)
if err == nil {
err = r.Client.Update(context.TODO(), foundSts)
if err != nil {
return err
}
if foundSts.Status.Replicas == foundSts.Status.ReadyReplicas && foundSts.Status.CurrentRevision == foundSts.Status.UpdateRevision {
r.Log.Info("failed upgrade completed", "upgrade from:", instance.Status.CurrentVersion, "upgrade to:", instance.Status.TargetVersion)
instance.Status.CurrentVersion = instance.Status.TargetVersion
instance.Status.SetErrorConditionFalse()
return r.clearUpgradeStatus(instance)
} else {
r.Log.Info("Unable to recover failed upgrade, make sure all nodes are running the target version")
}

}
}

if instance.Status.IsClusterInUpgradeFailedState() {
return nil
}
Expand Down
46 changes: 46 additions & 0 deletions controllers/zookeepercluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,52 @@ var _ = Describe("ZookeeperCluster Controller", func() {
})
})

// Scenario: a ZookeeperCluster whose status carries an "UpgradeFailed" error
// condition is reconciled once while its StatefulSet reports identical current
// and update revisions. The specs then check that Status.CurrentVersion was
// moved to the target version ("0.2.7").
// z, req, r, res, s and mockZkClient are not declared in this block; they are
// presumably shared variables of the enclosing Describe — confirm there.
Context("Checking for healing of upgrade failed for zookeepercluster", func() {
var (
// cl is the fake client that backs the reconciler and the lookups in the specs.
cl client.Client
// err is reassigned several times in BeforeEach; only the last assignment
// (from checkSyncTimeout) is visible to the specs below.
err error
)

BeforeEach(func() {
z.WithDefaults()
z.Status.Init()
// next is the cluster object stored in the fake client: a copy of z marked as
// UpgradeFailed, with current version 0.2.6 and target version 0.2.7.
next := z.DeepCopy()
next.Status.SetErrorConditionTrue("UpgradeFailed", " ")
next.Status.TargetVersion = "0.2.7"
next.Status.CurrentVersion = "0.2.6"
next.Status.ReadyReplicas = 3
next.Spec.Replicas = 3
next.Spec.Image.Tag = "0.2.7"
// NOTE(review): the StatefulSet is built from z, not from next, so it does not
// reflect the Spec changes made to next above — confirm this is intended.
st := zk.MakeStatefulSet(z)
cl = fake.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(next, st).Build()
// Re-read the StatefulSet from the fake client so its status can be edited.
// NOTE(review): the error from this Get is overwritten below without being checked.
st = &appsv1.StatefulSet{}
err = cl.Get(context.TODO(), req.NamespacedName, st)
// changing the Revision value to simulate the upgrade scenario
// (both revisions are set to the same value)
st.Status.CurrentRevision = "updateRevision"
st.Status.UpdateRevision = "updateRevision"
st.Status.UpdatedReplicas = 2
// NOTE(review): the return value of this status update is discarded, so a
// failed update would go unnoticed here.
cl.Status().Update(context.TODO(), st)
r = &ZookeeperClusterReconciler{Client: cl, Scheme: s, ZkClient: mockZkClient}
// Run a single reconcile pass. res is not read in this block, and err is
// overwritten by checkSyncTimeout before any spec inspects it.
res, err = r.Reconcile(context.TODO(), req)
// sleeping for 3 seconds
time.Sleep(3 * time.Second)
// checking if more than 2 secs have passed from the last update time
// NOTE(review): this runs against the local copy next, not the object stored
// in the fake client; checkSyncTimeout is defined elsewhere — verify what it
// returns for these arguments.
err = checkSyncTimeout(next, " ", 1, 2*time.Second)

})

// Despite the spec title, this compares Status.CurrentVersion with "0.2.7";
// the local variable named condition holds that version string.
It("checking update replicas", func() {
foundZookeeper := &v1beta1.ZookeeperCluster{}
// The lookup error is deliberately dropped; a failed Get leaves foundZookeeper
// at its zero value, which would fail the comparison below.
_ = cl.Get(context.TODO(), req.NamespacedName, foundZookeeper)
condition := foundZookeeper.Status.CurrentVersion
Ω(condition).To(Equal("0.2.7"))
})

// err here is the value returned by checkSyncTimeout in BeforeEach, not the
// error returned by Reconcile.
It("should not raise an error", func() {
Ω(err).To(BeNil())
})
})

Context("Upgrading with Targetversion empty", func() {
var (
cl client.Client
Expand Down

0 comments on commit 074d8b0

Please sign in to comment.