Skip to content

Commit

Permalink
self-healing machine pools for rke2
Browse files Browse the repository at this point in the history
  • Loading branch information
paynejacob committed Jan 21, 2022
1 parent ae1ecae commit cf64177
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 16 deletions.
5 changes: 5 additions & 0 deletions pkg/apis/provisioning.cattle.io/v1/rke.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package v1
import (
rkev1 "github.com/rancher/rancher/pkg/apis/rke.cattle.io/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

Expand All @@ -21,6 +22,10 @@ type RKEMachinePool struct {
RollingUpdate *RKEMachinePoolRollingUpdate `json:"rollingUpdate,omitempty"`
MachineDeploymentLabels map[string]string `json:"machineDeploymentLabels,omitempty"`
MachineDeploymentAnnotations map[string]string `json:"machineDeploymentAnnotations,omitempty"`
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
UnhealthyNodeTimeout *metav1.Duration `json:"UnhealthyNodeTimeout,omitempty"`
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
UnhealthyRange *string `json:"unhealthyRange,omitempty"`
}

type RKEMachinePoolRollingUpdate struct {
Expand Down
21 changes: 21 additions & 0 deletions pkg/apis/provisioning.cattle.io/v1/zz_generated_deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/apis/rke.cattle.io/v1/controlplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ type RKEControlPlaneStatus struct {
ETCDSnapshotCreate *ETCDSnapshotCreate `json:"etcdSnapshotCreate,omitempty"`
ETCDSnapshotCreatePhase ETCDSnapshotPhase `json:"etcdSnapshotCreatePhase,omitempty"`
ConfigGeneration int64 `json:"configGeneration,omitempty"`
Initialized bool `json:"initialized,omitempty"`
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (

const (
byNodeInfra = "by-node-infra"
Provisioned = condition.Cond("Provisioned")
Ready = condition.Cond("Ready")
defaultMachineConfigAPIVersion = "rke-machine-config.cattle.io/v1"
)

Expand Down Expand Up @@ -202,13 +202,13 @@ func (h *handler) updateClusterProvisioningStatus(cluster *rancherv1.Cluster, st
return status, err
}

message := Provisioned.GetMessage(cp)
if (message == "" && Provisioned.GetMessage(mgmtCluster) != "") || strings.Contains(message, planner.ETCDRestoreMessage) {
message := Ready.GetMessage(cp)
if (message == "" && Ready.GetMessage(mgmtCluster) != "") || strings.Contains(message, planner.ETCDRestoreMessage) {
mgmtCluster = mgmtCluster.DeepCopy()

Provisioned.SetStatus(mgmtCluster, Provisioned.GetStatus(cp))
Provisioned.Reason(mgmtCluster, Provisioned.GetReason(cp))
Provisioned.Message(mgmtCluster, message)
Ready.SetStatus(mgmtCluster, Ready.GetStatus(cp))
Ready.Reason(mgmtCluster, Ready.GetReason(cp))
Ready.Message(mgmtCluster, message)

_, err = h.mgmtClusterClient.Update(mgmtCluster)
if err != nil {
Expand All @@ -217,9 +217,9 @@ func (h *handler) updateClusterProvisioningStatus(cluster *rancherv1.Cluster, st
}
}

Provisioned.SetStatus(&status, Provisioned.GetStatus(cp))
Provisioned.Reason(&status, Provisioned.GetReason(cp))
Provisioned.Message(&status, Provisioned.GetMessage(cp))
Ready.SetStatus(&status, Ready.GetStatus(cp))
Ready.Reason(&status, Ready.GetReason(cp))
Ready.Message(&status, Ready.GetMessage(cp))

return status, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,50 @@ func machineDeployments(cluster *rancherv1.Cluster, capiCluster *capi.Cluster, d
}

result = append(result, machineDeployment)

// If an unhealthy-node timeout was specified, create a MachineHealthCheck for this machine pool's deployment.
if machinePool.UnhealthyNodeTimeout != nil {
hc := deploymentHealthChecks(machineDeployment, machinePool)
result = append(result, hc)
}
}

return result, nil
}

// deploymentHealthChecks builds the MachineHealthCheck that accompanies a
// machine pool's MachineDeployment.
//
// The health check reuses the deployment's name, namespace and cluster name,
// and selects only machines labeled as belonging to that deployment. A node
// whose Ready condition stays Unknown or False for the pool's
// UnhealthyNodeTimeout is considered unhealthy. MaxUnhealthy, UnhealthyRange
// and NodeStartupTimeout are passed through from the pool unchanged.
//
// machinePool.UnhealthyNodeTimeout is dereferenced unconditionally, so callers
// must only invoke this function when it is non-nil.
//
// See https://cluster-api.sigs.k8s.io/tasks/healthcheck.html#what-is-a-machinehealthcheck
func deploymentHealthChecks(machineDeployment *capi.MachineDeployment, machinePool rancherv1.RKEMachinePool) *capi.MachineHealthCheck {
	meta := metav1.ObjectMeta{
		Name:      machineDeployment.Name,
		Namespace: machineDeployment.Namespace,
	}

	// Restrict the health check to machines owned by this deployment.
	ownedMachines := metav1.LabelSelector{
		MatchLabels: map[string]string{
			capi.MachineDeploymentLabelName: machineDeployment.Name,
		},
	}

	// Both non-ready states of the NodeReady condition share the same timeout.
	timeout := *machinePool.UnhealthyNodeTimeout
	notReadyStates := []corev1.ConditionStatus{corev1.ConditionUnknown, corev1.ConditionFalse}
	unhealthy := make([]capi.UnhealthyCondition, 0, len(notReadyStates))
	for _, state := range notReadyStates {
		unhealthy = append(unhealthy, capi.UnhealthyCondition{
			Type:    corev1.NodeReady,
			Status:  state,
			Timeout: timeout,
		})
	}

	return &capi.MachineHealthCheck{
		ObjectMeta: meta,
		Spec: capi.MachineHealthCheckSpec{
			ClusterName:         machineDeployment.Spec.ClusterName,
			Selector:            ownedMachines,
			UnhealthyConditions: unhealthy,
			MaxUnhealthy:        machinePool.MaxUnhealthy,
			UnhealthyRange:      machinePool.UnhealthyRange,
			NodeStartupTimeout:  machinePool.NodeStartupTimeout,
		},
	}
}

func assign(labels map[string]string, key string, value interface{}) error {
data, err := json.Marshal(value)
if err != nil {
Expand Down
12 changes: 6 additions & 6 deletions pkg/controllers/provisioningv2/rke2/rkecluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
)

var (
Provisioned condition.Cond = "Provisioned"
Ready condition.Cond = "Ready"
)

type handler struct {
Expand Down Expand Up @@ -41,7 +41,7 @@ func Register(ctx context.Context, clients *wrangler.Context) {
}, clients.RKE.RKECluster(), clients.RKE.RKEControlPlane())
}

func (h *handler) UpdateSpec(key string, cluster *v1.RKECluster) (*v1.RKECluster, error) {
func (h *handler) UpdateSpec(_ string, cluster *v1.RKECluster) (*v1.RKECluster, error) {
if cluster == nil {
return nil, nil
}
Expand All @@ -61,14 +61,14 @@ func (h *handler) UpdateSpec(key string, cluster *v1.RKECluster) (*v1.RKECluster
func (h *handler) OnChange(obj *v1.RKECluster, status v1.RKEClusterStatus) (v1.RKEClusterStatus, error) {
cp, err := h.rkeControlPlanes.Get(obj.Namespace, obj.Name)
if err == nil {
Provisioned.SetStatus(&status, Provisioned.GetStatus(cp))
Provisioned.Reason(&status, Provisioned.GetReason(cp))
Provisioned.Message(&status, Provisioned.GetMessage(cp))
Ready.SetStatus(&status, Ready.GetStatus(cp))
Ready.Reason(&status, Ready.GetReason(cp))
Ready.Message(&status, Ready.GetMessage(cp))
} else if !apierrors.IsNotFound(err) {
return status, err
}

status.Ready = condition.Cond("Provisioned").IsTrue(&status)
status.Ready = Ready.IsTrue(&status)
status.ObservedGeneration = obj.Generation
return status, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"k8s.io/apimachinery/pkg/runtime"
)

// Ready is the condition named "Ready"; this package reads it from the
// cluster object and writes it to the RKEControlPlane status.
var Ready condition.Cond = "Ready"

func Register(ctx context.Context, clients *wrangler.Context) {
h := &handler{
clusterCache: clients.Mgmt.Cluster().Cache(),
Expand Down Expand Up @@ -42,6 +44,11 @@ func (h *handler) OnChange(obj *rkev1.RKEControlPlane, status rkev1.RKEControlPl
return status, nil
}

status.Ready = condition.Cond("Ready").IsTrue(cluster)
Ready.SetStatus(&status, Ready.GetStatus(cluster))
Ready.Reason(&status, Ready.GetReason(cluster))
Ready.Message(&status, Ready.GetMessage(cluster))

status.Ready = Ready.IsTrue(cluster)
status.Initialized = Ready.IsTrue(cluster)
return status, nil
}
7 changes: 7 additions & 0 deletions pkg/provisioningv2/capi/capi.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
clusterv1alpha3 "sigs.k8s.io/cluster-api/api/v1alpha3"
clusterv1alpha4 "sigs.k8s.io/cluster-api/api/v1alpha4"
clusterv1beta1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/api/v1beta1/index"
"sigs.k8s.io/cluster-api/controllers"
"sigs.k8s.io/cluster-api/controllers/remote"
addonsv1alpha3 "sigs.k8s.io/cluster-api/exp/addons/api/v1alpha3"
Expand Down Expand Up @@ -59,6 +60,12 @@ func Register(ctx context.Context, clients *wrangler.Context) (func(ctx context.
return nil, err
}

// add the node ref indexer for health checks
err = index.AddDefaultIndexes(ctx, mgr)
if err != nil {
return nil, err
}

reconcilers, err := reconcilers(mgr)
if err != nil {
return nil, err
Expand Down

0 comments on commit cf64177

Please sign in to comment.