Do not stop Kubernetes services on node removal if annotation is set. (
bschimke95 committed Sep 20, 2024
1 parent e4dadd1 commit 27c91c8
Showing 10 changed files with 87 additions and 15 deletions.
1 change: 1 addition & 0 deletions docs/src/snap/reference/annotations.md
@@ -7,6 +7,7 @@ the bootstrap configuration.
| Name | Description | Values |
|---------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|
| `k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove` | If set, only microcluster and file cleanup are performed. This is helpful when an external controller (e.g., CAPI) manages the Kubernetes node lifecycle. By default, k8sd will remove the Kubernetes node when it is removed from the cluster. | "true"\|"false" |
| `k8sd/v1alpha/lifecycle/skip-stop-services-on-remove` | If set, the k8s services will not be stopped on the leaving node when removing the node. This is helpful when an external controller (e.g., CAPI) manages the Kubernetes node lifecycle. By default, all services are stopped on leaving nodes. | "true"\|"false" |

<!-- Links -->

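For reference, the same two annotations can also be set programmatically when building a bootstrap configuration. The sketch below uses only the constant names visible in this diff; the `apiv1` import path is assumed from common k8s-snap usage and is not confirmed by this commit.

```go
package main

import (
	"fmt"

	apiv1 "github.com/canonical/k8s-snap-api/api/v1"
)

func main() {
	// Sketch only: the annotation keys are the k8s-snap-api constants used in
	// k8s_bootstrap_test.go below; the import path is an assumption.
	annotations := map[string]string{
		apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true",
		apiv1.AnnotationSkipStopServicesOnRemove:          "true",
	}
	fmt.Println(annotations)
}
```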
5 changes: 4 additions & 1 deletion src/k8s/cmd/k8s/k8s_bootstrap_test.go
@@ -62,7 +62,10 @@ var testCases = []testCase{
Enabled: utils.Pointer(true),
},
CloudProvider: utils.Pointer("external"),
Annotations: map[string]string{apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true"},
Annotations: map[string]string{
apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true",
apiv1.AnnotationSkipStopServicesOnRemove: "true",
},
},
ControlPlaneTaints: []string{"node-role.kubernetes.io/control-plane:NoSchedule"},
PodCIDR: utils.Pointer("10.100.0.0/16"),
1 change: 1 addition & 0 deletions src/k8s/cmd/k8s/testdata/bootstrap-config-full.yaml
@@ -23,6 +23,7 @@ cluster-config:
cloud-provider: external
annotations:
k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove: true
k8sd/v1alpha/lifecycle/skip-stop-services-on-remove: true
control-plane-taints:
- node-role.kubernetes.io/control-plane:NoSchedule
pod-cidr: 10.100.0.0/16
2 changes: 1 addition & 1 deletion src/k8s/go.mod
@@ -5,7 +5,7 @@ go 1.22.6
require (
dario.cat/mergo v1.0.0
github.com/canonical/go-dqlite v1.22.0
github.com/canonical/k8s-snap-api v1.0.5
github.com/canonical/k8s-snap-api v1.0.6
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230
github.com/canonical/microcluster/v3 v3.0.0-20240827143335-f7a4d3984970
github.com/go-logr/logr v1.4.2
4 changes: 2 additions & 2 deletions src/k8s/go.sum
@@ -99,8 +99,8 @@ github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0 h1:nvj0OLI3YqYXe
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/canonical/go-dqlite v1.22.0 h1:DuJmfcREl4gkQJyvZzjl2GHFZROhbPyfdjDRQXpkOyw=
github.com/canonical/go-dqlite v1.22.0/go.mod h1:Uvy943N8R4CFUAs59A1NVaziWY9nJ686lScY7ywurfg=
github.com/canonical/k8s-snap-api v1.0.5 h1:49bgi6CGtFjCPweeTz55Sv/waKgCl6ftx4BqXt3RI9k=
github.com/canonical/k8s-snap-api v1.0.5/go.mod h1:LDPoIYCeYnfgOFrwVPJ/4edGU264w7BB7g0GsVi36AY=
github.com/canonical/k8s-snap-api v1.0.6 h1:hUJ59ol9romwUz82bYIumitobcuBQwKjWMnge1AhGzM=
github.com/canonical/k8s-snap-api v1.0.6/go.mod h1:LDPoIYCeYnfgOFrwVPJ/4edGU264w7BB7g0GsVi36AY=
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230 h1:YOqZ+/14OPZ+/TOXpRHIX3KLT0C+wZVpewKIwlGUmW0=
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230/go.mod h1:YVGI7HStOKsV+cMyXWnJ7RaMPaeWtrkxyIPvGWbgACc=
github.com/canonical/microcluster/v3 v3.0.0-20240827143335-f7a4d3984970 h1:UrnpglbXELlxtufdk6DGDytu2JzyzuS3WTsOwPrkQLI=
1 change: 1 addition & 0 deletions src/k8s/pkg/k8sd/api/worker.go
@@ -86,5 +86,6 @@ func (e *Endpoints) postWorkerInfo(s state.State, r *http.Request) response.Resp
KubeProxyClientCert: workerCertificates.KubeProxyClientCert,
KubeProxyClientKey: workerCertificates.KubeProxyClientKey,
K8sdPublicKey: cfg.Certificates.GetK8sdPublicKey(),
Annotations: cfg.Annotations,
})
}
2 changes: 1 addition & 1 deletion src/k8s/pkg/k8sd/app/hooks_bootstrap.go
@@ -28,7 +28,6 @@ import (
// onBootstrap is called after we bootstrap the first cluster node.
// onBootstrap configures local services then writes the cluster config on the database.
func (a *App) onBootstrap(ctx context.Context, s state.State, initConfig map[string]string) error {

// NOTE(neoaggelos): context timeout is passed over configuration, so that hook failures are propagated to the client
ctx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -213,6 +212,7 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
CACert: utils.Pointer(response.CACert),
ClientCACert: utils.Pointer(response.ClientCACert),
},
Annotations: response.Annotations,
}

// Pre-init checks
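The two changes above (worker.go and hooks_bootstrap.go) are what make the new annotation visible on worker nodes: the control plane includes the cluster annotations in the worker join response, and the joining worker persists them into its local cluster config so the remove hook can read them later without a control-plane round trip. A minimal sketch of that flow, using illustrative stand-in types rather than the real k8sd or k8s-snap-api definitions:

```go
package main

import "fmt"

// Illustrative stand-ins for the real k8sd types: the field names follow the diff,
// but the actual definitions live in k8s-snap-api and k8sd's types package.
type workerNodeInfoResponse struct {
	K8sdPublicKey string
	Annotations   map[string]string
}

type clusterConfig struct {
	Annotations map[string]string
}

// persistJoinResponse mimics the flow added in this commit: the control plane returns
// its cluster annotations with the worker join info, and the worker stores them locally
// so later hooks (e.g. the pre-remove hook) can consult them on the worker itself.
func persistJoinResponse(resp workerNodeInfoResponse) clusterConfig {
	return clusterConfig{Annotations: resp.Annotations}
}

func main() {
	resp := workerNodeInfoResponse{
		K8sdPublicKey: "<public-key>",
		Annotations: map[string]string{
			"k8sd/v1alpha/lifecycle/skip-stop-services-on-remove": "true",
		},
	}
	fmt.Println(persistJoinResponse(resp).Annotations)
}
```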
23 changes: 13 additions & 10 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
@@ -59,8 +59,9 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
log.Error(err, "Failed to wait for node to finish microcluster join before removing. Continuing with the cleanup...")
}

if cfg, err := databaseutil.GetClusterConfig(ctx, s); err == nil {
if _, ok := cfg.Annotations[apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove]; !ok {
cfg, err := databaseutil.GetClusterConfig(ctx, s)
if err == nil {
if _, ok := cfg.Annotations.Get(apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove); !ok {
c, err := snap.KubernetesClient("")
if err != nil {
log.Error(err, "Failed to create Kubernetes client", err)
@@ -124,19 +125,21 @@
log.Error(err, "Failed to unmark node as worker")
}

log.Info("Stopping worker services")
if err := snaputil.StopWorkerServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop worker services")
}

log.Info("Cleaning up control plane certificates")
if _, err := setup.EnsureControlPlanePKI(snap, &pki.ControlPlanePKI{}); err != nil {
log.Error(err, "failed to cleanup control plane certificates")
}

log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
if _, ok := cfg.Annotations.Get(apiv1.AnnotationSkipStopServicesOnRemove); !ok {
log.Info("Stopping worker services")
if err := snaputil.StopWorkerServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop worker services")
}

log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
}
}

return nil
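The gist of hooks_remove.go after this change: Kubernetes node cleanup is still skipped when `skip-cleanup-kubernetes-node-on-remove` is set, and stopping worker and control-plane services is now additionally gated on `skip-stop-services-on-remove`. A condensed sketch of that gating pattern follows; the `annotations` type and helper names are illustrative stand-ins, and only the key string and the Get-then-skip shape come from the diff.

```go
package main

import (
	"context"
	"fmt"
)

const annotationSkipStopServicesOnRemove = "k8sd/v1alpha/lifecycle/skip-stop-services-on-remove"

type annotations map[string]string

// Get mirrors the (value, ok) accessor used in hooks_remove.go above.
func (a annotations) Get(key string) (string, bool) {
	v, ok := a[key]
	return v, ok
}

// stopServicesUnlessSkipped stops node services only when the skip annotation is absent.
func stopServicesUnlessSkipped(ctx context.Context, cfg annotations, stop func(context.Context) error) error {
	if _, ok := cfg.Get(annotationSkipStopServicesOnRemove); ok {
		fmt.Println("skip-stop-services-on-remove is set; leaving services running")
		return nil
	}
	return stop(ctx)
}

func main() {
	cfg := annotations{annotationSkipStopServicesOnRemove: "true"}
	_ = stopServicesUnlessSkipped(context.Background(), cfg, func(context.Context) error {
		fmt.Println("stopping k8s services")
		return nil
	})
}
```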
9 changes: 9 additions & 0 deletions tests/integration/templates/bootstrap-skip-service-stop.yaml
@@ -0,0 +1,9 @@
cluster-config:
network:
enabled: true
dns:
enabled: true
metrics-server:
enabled: true
annotations:
k8sd/v1alpha/lifecycle/skip-stop-services-on-remove: true
54 changes: 54 additions & 0 deletions tests/integration/tests/test_clustering.py
@@ -96,6 +96,60 @@ def test_no_remove(instances: List[harness.Instance]):
assert len(nodes) == 3, "worker node should not have been removed from cluster"


@pytest.mark.node_count(3)
@pytest.mark.bootstrap_config(
(config.MANIFESTS_DIR / "bootstrap-skip-service-stop.yaml").read_text()
)
def test_skip_services_stop_on_remove(instances: List[harness.Instance]):
cluster_node = instances[0]
joining_cp = instances[1]
worker = instances[2]

join_token = util.get_join_token(cluster_node, joining_cp)
util.join_cluster(joining_cp, join_token)

join_token_worker = util.get_join_token(cluster_node, worker, "--worker")
util.join_cluster(worker, join_token_worker)

# We don't care if the node is ready or the CNI is up.
util.stubbornly(retries=5, delay_s=3).until(util.get_nodes(cluster_node) == 3)

cluster_node.exec(["k8s", "remove-node", joining_cp.id])
nodes = util.ready_nodes(cluster_node)
assert len(nodes) == 2, "cp node should have been removed from the cluster"
services = joining_cp.exec(
["snap", "services", "k8s"], capture_output=True, text=True
).stdout.split("\n")[1:-1]
print(services)
for service in services:
if "k8s-apiserver-proxy" in service:
assert (
" inactive " in service
), "apiserver proxy should be inactive on control-plane"
else:
assert " active " in service, "service should be active"

cluster_node.exec(["k8s", "remove-node", worker.id])
nodes = util.ready_nodes(cluster_node)
assert len(nodes) == 1, "worker node should have been removed from the cluster"
services = worker.exec(
["snap", "services", "k8s"], capture_output=True, text=True
).stdout.split("\n")[1:-1]
print(services)
for service in services:
for expected_active_service in [
"containerd",
"k8sd",
"kubelet",
"kube-proxy",
"k8s-apiserver-proxy",
]:
if expected_active_service in service:
assert (
" active " in service
), f"{expected_active_service} should be active on worker"


@pytest.mark.node_count(3)
def test_join_with_custom_token_name(instances: List[harness.Instance]):
cluster_node = instances[0]
