From 7f63ad0a391dcf0901edf9009717fb57f503f652 Mon Sep 17 00:00:00 2001
From: Artem Chernyshev
Date: Thu, 16 Dec 2021 00:00:30 +0300
Subject: [PATCH] fix: avoid long backoff when trying to bootstrap the cluster

The long backoff makes the `Sidero` tests fail: the controller gets
too many errors during bootstrap, because the machine address is
populated before `apid` is ready, and each returned error pushes the
requeue interval further out.

Signed-off-by: Artem Chernyshev
---
 controllers/etcd.go                         | 10 ++---
 controllers/taloscontrolplane_controller.go | 46 +++++++++++----------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/controllers/etcd.go b/controllers/etcd.go
index c0a1afb..440d5c3 100644
--- a/controllers/etcd.go
+++ b/controllers/etcd.go
@@ -46,7 +46,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
 		params = append(params, "node", machine.Name)
 	}
 
-	r.Log.Info("Verifying etcd health on all nodes", params...)
+	r.Log.Info("verifying etcd health on all nodes", params...)
 
 	svcs, err := c.ServiceInfo(ctx, service)
 	if err != nil {
@@ -105,7 +105,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
 // gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
 // and issuing a "leave" request from the machine itself.
 func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave clusterv1.Machine) error {
-	r.Log.Info("Verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
+	r.Log.Info("verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
 
 	svcs, err := c.ServiceInfo(ctx, "etcd")
 	if err != nil {
@@ -114,14 +114,14 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
 
 	for _, svc := range svcs {
 		if svc.Service.State != "Finished" {
-			r.Log.Info("Forfeiting leadership", "machine", machineToLeave.Status.NodeRef.Name)
+			r.Log.Info("forfeiting leadership", "machine", machineToLeave.Status.NodeRef.Name)
 
 			_, err = c.EtcdForfeitLeadership(ctx, &machine.EtcdForfeitLeadershipRequest{})
 			if err != nil {
 				return err
 			}
 
-			r.Log.Info("Leaving etcd", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
+			r.Log.Info("leaving etcd", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
 
 			err = c.EtcdLeaveCluster(ctx, &machine.EtcdLeaveClusterRequest{})
 			if err != nil {
@@ -136,7 +136,7 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
 // forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
 // This is used in times when the machine was deleted out from under us.
 func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
-	r.Log.Info("Removing etcd member", "memberName", memberName)
+	r.Log.Info("removing etcd member", "memberName", memberName)
 
 	return c.EtcdRemoveMember(
 		ctx,
diff --git a/controllers/taloscontrolplane_controller.go b/controllers/taloscontrolplane_controller.go
index de2ae3d..93fda14 100644
--- a/controllers/taloscontrolplane_controller.go
+++ b/controllers/taloscontrolplane_controller.go
@@ -102,7 +102,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	cluster, err := util.GetOwnerCluster(ctx, r.Client, tcp.ObjectMeta)
 	if err != nil {
 		if !apierrors.IsNotFound(err) {
-			logger.Error(err, "Failed to retrieve owner Cluster from the API Server")
+			logger.Error(err, "failed to retrieve owner Cluster from the API Server")
 
 			return ctrl.Result{}, err
 		}
@@ -111,19 +111,19 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 
 	if cluster == nil {
-		logger.Info("Cluster Controller has not yet set OwnerRef")
+		logger.Info("cluster Controller has not yet set OwnerRef")
 		return ctrl.Result{Requeue: true}, nil
 	}
 
 	logger = logger.WithValues("cluster", cluster.Name)
 
 	if annotations.IsPaused(cluster, tcp) {
-		logger.Info("Reconciliation is paused for this object")
+		logger.Info("reconciliation is paused for this object")
 		return ctrl.Result{Requeue: true}, nil
 	}
 
 	// Wait for the cluster infrastructure to be ready before creating machines
 	if !cluster.Status.InfrastructureReady {
-		logger.Info("Cluster infra not ready")
+		logger.Info("cluster infra not ready")
 		return ctrl.Result{Requeue: true}, nil
 	}
@@ -131,7 +131,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	// Initialize the patch helper.
 	patchHelper, err := patch.NewHelper(tcp, r.Client)
 	if err != nil {
-		logger.Error(err, "Failed to configure the patch helper")
+		logger.Error(err, "failed to configure the patch helper")
 
 		return ctrl.Result{Requeue: true}, nil
 	}
@@ -143,7 +143,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 
 	// because the main defer may take too much time to get cluster status
 	if err := patchTalosControlPlane(ctx, patchHelper, tcp, patch.WithStatusObservedGeneration{}); err != nil {
-		logger.Error(err, "Failed to add finalizer to TalosControlPlane")
+		logger.Error(err, "failed to add finalizer to TalosControlPlane")
 
 		return ctrl.Result{}, err
 	}
@@ -158,18 +158,18 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 
 	defer func() {
-		r.Log.Info("Attempting to set control plane status")
+		r.Log.Info("attempting to set control plane status")
 
 		// Always attempt to update status.
 		if err := r.updateStatus(ctx, tcp, cluster); err != nil {
-			logger.Error(err, "Failed to update TalosControlPlane Status")
+			logger.Error(err, "failed to update TalosControlPlane Status")
 
 			reterr = kerrors.NewAggregate([]error{reterr, err})
 		}
 
 		// Always attempt to Patch the TalosControlPlane object and status after each reconciliation.
 		if err := patchTalosControlPlane(ctx, patchHelper, tcp, patch.WithStatusObservedGeneration{}); err != nil {
-			logger.Error(err, "Failed to patch TalosControlPlane")
+			logger.Error(err, "failed to patch TalosControlPlane")
 
 			reterr = kerrors.NewAggregate([]error{reterr, err})
 		}
@@ -182,7 +182,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 			}
 		}
 
-		r.Log.Info("Successfully updated control plane status")
+		r.Log.Info("successfully updated control plane status")
 	}()
 
 	// Update ownerrefs on infra templates
@@ -192,7 +192,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 
 	// If ControlPlaneEndpoint is not set, return early
 	if cluster.Spec.ControlPlaneEndpoint.IsZero() {
-		logger.Info("Cluster does not yet have a ControlPlaneEndpoint defined")
+		logger.Info("cluster does not yet have a ControlPlaneEndpoint defined")
 
 		return ctrl.Result{}, nil
 	}
@@ -251,7 +251,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	// We are creating the first replica
 	case numMachines < desiredReplicas && numMachines == 0:
 		// Create new Machine w/ init
-		logger.Info("Initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)
+		logger.Info("initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)
 		return r.bootControlPlane(ctx, cluster, tcp, controlPlane, true)
 
 	// We are scaling up
@@ -261,7 +261,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 			desiredReplicas, numMachines)
 
 		// Create a new Machine w/ join
-		logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
+		logger.Info("scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
 		return r.bootControlPlane(ctx, cluster, tcp, controlPlane, false)
 
 	// We are scaling down
@@ -279,23 +279,23 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 		}
 
 		if err := r.ensureNodesBooted(ctx, cluster, ownedMachines); err != nil {
-			logger.Info("Waiting for all nodes to finish boot sequence", "error", err)
+			logger.Info("waiting for all nodes to finish boot sequence", "error", err)
 
 			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
 		}
 
 		if !conditions.IsTrue(tcp, controlplanev1.EtcdClusterHealthyCondition) {
-			logger.Info("Waiting for etcd to become healthy before scaling down")
+			logger.Info("waiting for etcd to become healthy before scaling down")
 
 			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
 		}
 
-		logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
+		logger.Info("scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
 
 		res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name, ownedMachines)
 		if err != nil {
 			if res.Requeue || res.RequeueAfter > 0 {
-				logger.Info("Failed to scale down control plane", "error", err)
+				logger.Info("failed to scale down control plane", "error", err)
 
 				return res, nil
 			}
@@ -307,7 +307,9 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 	if err := r.bootstrapCluster(ctx, cluster, ownedMachines); err != nil {
 		conditions.MarkFalse(tcp, controlplanev1.MachinesBootstrapped, controlplanev1.WaitingForTalosBootReason, clusterv1.ConditionSeverityInfo, err.Error())
 
-		return ctrl.Result{}, err
+		logger.Info("bootstrap failed, retrying in 20 seconds", "error", err)
+
+		return ctrl.Result{RequeueAfter: time.Second * 20}, nil
 	}
 
 	conditions.MarkTrue(tcp, controlplanev1.MachinesBootstrapped)
@@ -336,7 +338,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 func (r *TalosControlPlaneReconciler) ClusterToTalosControlPlane(o client.Object) []ctrl.Request {
 	c, ok := o.(*clusterv1.Cluster)
 	if !ok {
-		r.Log.Error(nil, fmt.Sprintf("Expected a Cluster but got a %T", o))
+		r.Log.Error(nil, fmt.Sprintf("expected a Cluster but got a %T", o))
 
 		return nil
 	}
@@ -352,7 +354,7 @@ func (r *TalosControlPlaneReconciler) reconcileDelete(ctx context.Context, clust
 	// Get list of all control plane machines
 	ownedMachines, err := r.getControlPlaneMachinesForCluster(ctx, util.ObjectKey(cluster), tcp.Name)
 	if err != nil {
-		r.Log.Error(err, "Failed to retrieve control plane machines for cluster")
+		r.Log.Error(err, "failed to retrieve control plane machines for cluster")
 
 		return ctrl.Result{}, err
 	}
@@ -370,7 +372,7 @@ func (r *TalosControlPlaneReconciler) reconcileDelete(ctx context.Context, clust
 		}
 		// Submit deletion request
 		if err := r.Client.Delete(ctx, &ownedMachine); err != nil && !apierrors.IsNotFound(err) {
-			r.Log.Error(err, "Failed to cleanup owned machine")
+			r.Log.Error(err, "failed to cleanup owned machine")
 			return ctrl.Result{}, err
 		}
 	}
@@ -822,7 +824,7 @@ func (r *TalosControlPlaneReconciler) updateStatus(ctx context.Context, tcp *con
 			conditions.MarkTrue(tcp, controlplanev1.AvailableCondition)
 		}
 	} else {
-		r.Log.Error(err, "Failed attempt to contact workload cluster")
+		r.Log.Error(err, "failed attempt to contact workload cluster")
 	}
 
 	conditions.SetAggregate(tcp, controlplanev1.MachinesReadyCondition, conditionGetters, conditions.AddSourceRef(), conditions.WithStepCounterIf(false))
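
The functional change is the `@@ -307,7 +307,9 @@` hunk in `Reconcile`: a failed `bootstrapCluster` call is now logged and requeued after a fixed 20 seconds instead of being returned as an error, which keeps the object off controller-runtime's rate-limited (exponentially backed-off) queue while `apid` comes up. The sketch below shows that pattern in isolation; `ExampleReconciler`, `bootstrapCluster`, and `errNotReady` are illustrative stand-ins, not types from this repository.

```go
// Minimal sketch of retrying an expected-to-fail step on a fixed interval
// instead of returning the error to controller-runtime. This is not the
// provider's actual reconciler; names below are placeholders.
package controllers

import (
	"context"
	"errors"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

// errNotReady stands in for the transient failure seen while apid is still
// coming up on the machine being bootstrapped.
var errNotReady = errors.New("apid is not ready yet")

// ExampleReconciler is a stand-in for TalosControlPlaneReconciler.
type ExampleReconciler struct{}

// bootstrapCluster is a placeholder for the real bootstrap call; it fails
// until the node's API endpoint is actually reachable.
func (r *ExampleReconciler) bootstrapCluster(ctx context.Context) error {
	return errNotReady
}

// Reconcile logs the expected failure and schedules a retry itself rather
// than handing the error back to the controller's workqueue.
func (r *ExampleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	if err := r.bootstrapCluster(ctx); err != nil {
		// Returning `ctrl.Result{}, err` would put the request back on the
		// rate-limited workqueue, where repeated failures grow the delay
		// exponentially. A fixed RequeueAfter keeps the retry bounded at 20s.
		logger.Info("bootstrap failed, retrying in 20 seconds", "error", err)

		return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
	}

	return ctrl.Result{}, nil
}
```

Returning `ctrl.Result{RequeueAfter: ...}` with a `nil` error bypasses the per-item backoff entirely, so an error that is expected during bring-up produces a predictable retry cadence instead of an ever-growing delay.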