From 8ee80009af861aace44c989498c7dc2d6c3ae7ef Mon Sep 17 00:00:00 2001 From: Utku Ozdemir Date: Wed, 3 Apr 2024 01:45:00 +0200 Subject: [PATCH] fix: prevent link and clustermachine deletion from getting stuck This PR contains fixes for the following issues: - When a `Link` resource was deleted, `MachineCleanupController` was unable to remove finalizers on the `MachineSetNode` resource, as it did not pass the correct owner `""` when calling `Teardown`. - `ClusterMachineStatusController` not handling the cases where the matching `Machine` might be missing. - `MachineSetController` not handling the cases where the `Machine` resource was missing for any of the `MachineSetNode`s that were tearing down. The latter two fixes prevent clusters from getting stuck if they are in a state where there is a missing machine in their machine sets. Closes #100, closes #97. Signed-off-by: Utku Ozdemir (cherry picked from commit 5dc2eaa1024f0ea09a1a5571289ba2cbebd6f633) --- .../omni/cluster_machine_status.go | 9 +- .../omni/internal/machineset/machineset.go | 5 + .../omni/controllers/omni/machine_cleanup.go | 2 +- .../controllers/omni/machine_cleanup_test.go | 93 +++++++++++++++++++ 4 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 internal/backend/runtime/omni/controllers/omni/machine_cleanup_test.go diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go index 5a958a47..faecf7cc 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go @@ -40,7 +40,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController { }, TransformFunc: func(ctx context.Context, r controller.Reader, _ *zap.Logger, clusterMachine *omni.ClusterMachine, clusterMachineStatus *omni.ClusterMachineStatus) error { machine, err := safe.ReaderGet[*omni.Machine](ctx, r, resource.NewMetadata(resources.DefaultNamespace, omni.MachineType, clusterMachine.Metadata().ID(), resource.VersionUndefined)) - if err != nil { + if err != nil && !state.IsNotFoundError(err) { return err } @@ -51,7 +51,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController { omni.LabelMachineSet, ) - if machine.TypedSpec().Value.Connected { + if machine != nil && machine.TypedSpec().Value.Connected { clusterMachineStatus.Metadata().Labels().Set(omni.MachineStatusLabelConnected, "") } else { clusterMachineStatus.Metadata().Labels().Delete(omni.MachineStatusLabelConnected) @@ -140,11 +140,12 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController { _, isControlPlaneNode := clusterMachineStatus.Metadata().Labels().Get(omni.LabelControlPlaneRole) if (status.GetStage() == machineapi.MachineStatusEvent_BOOTING || status.GetStage() == machineapi.MachineStatusEvent_RUNNING) && isControlPlaneNode { - cmsVal.ApidAvailable = machine.TypedSpec().Value.Connected + cmsVal.ApidAvailable = machine != nil && machine.TypedSpec().Value.Connected } // should we also mark it as not ready when the clustermachine is tearing down (?) - cmsVal.Ready = status.GetStatus().GetReady() && machine.TypedSpec().Value.Connected + cmsVal.Ready = status.GetStatus().GetReady() && + machine != nil && machine.TypedSpec().Value.Connected if clusterMachine.Metadata().Phase() == resource.PhaseTearingDown { cmsVal.Stage = specs.ClusterMachineStatusSpec_DESTROYING diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go index 58947417..d746fec2 100644 --- a/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go @@ -12,6 +12,7 @@ import ( "github.com/cosi-project/runtime/pkg/controller" "github.com/cosi-project/runtime/pkg/resource" "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" "go.uber.org/zap" "github.com/siderolabs/omni/client/pkg/omni/resources" @@ -92,6 +93,10 @@ func UpdateFinalizers(ctx context.Context, r controller.ReaderWriter, rc *Reconc for _, clusterMachine := range rc.GetClusterMachines() { if clusterMachine.Metadata().Phase() == resource.PhaseRunning && !clusterMachine.Metadata().Finalizers().Has(ControllerName) { if err := r.AddFinalizer(ctx, omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), ControllerName); err != nil { + if state.IsNotFoundError(err) { + continue + } + return err } } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go b/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go index cde6f51e..07b946f8 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go @@ -49,7 +49,7 @@ func (h *sameIDHandler[I, O]) FinalizerRemoval(ctx context.Context, r controller return nil } - ready, err := r.Teardown(ctx, md) + ready, err := r.Teardown(ctx, md, controller.WithOwner("")) if err != nil { return err } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_cleanup_test.go b/internal/backend/runtime/omni/controllers/omni/machine_cleanup_test.go new file mode 100644 index 00000000..ef33864a --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/machine_cleanup_test.go @@ -0,0 +1,93 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package omni_test + +import ( + "context" + "testing" + "time" + + "github.com/cosi-project/runtime/pkg/resource/rtestutils" + "github.com/cosi-project/runtime/pkg/state" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" +) + +type MachineCleanupSuite struct { + OmniSuite +} + +func (suite *MachineCleanupSuite) TestCleanup() { + require := suite.Require() + + suite.ctx, suite.ctxCancel = context.WithTimeout(suite.ctx, time.Second*10) + + suite.startRuntime() + + controller := omnictrl.NewMachineCleanupController() + + require.NoError(suite.runtime.RegisterController(controller)) + + machineID := "machine-cleanup-test-machine" + + machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machine-cleanup-test-machine-set") + machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, machineID, machineSet) + machine := omni.NewMachine(resources.DefaultNamespace, machineID) + + machine.Metadata().Finalizers().Add(controller.Name()) + + require.NoError(suite.state.Create(suite.ctx, machineSetNode)) + require.NoError(suite.state.Create(suite.ctx, machine)) + + _, err := suite.state.Teardown(suite.ctx, machine.Metadata()) + + require.NoError(err) + + assertNoResource[*omni.MachineSetNode](&suite.OmniSuite, machineSetNode) + + assertResource[*omni.Machine](&suite.OmniSuite, machine.Metadata(), func(r *omni.Machine, assertion *assert.Assertions) { + assertion.Empty(r.Metadata().Finalizers()) + }) + + require.NoError(suite.state.Destroy(suite.ctx, machine.Metadata())) +} + +func (suite *MachineCleanupSuite) TestSkipMachineSetNodeWithOwner() { + require := suite.Require() + + suite.ctx, suite.ctxCancel = context.WithTimeout(suite.ctx, time.Second*10) + + suite.startRuntime() + + controller := omnictrl.NewMachineCleanupController() + + require.NoError(suite.runtime.RegisterController(controller)) + + machineID := "machine-cleanup-skip-test-machine" + + machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machine-cleanup-skip-test-machine-set") + machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, machineID, machineSet) + machine := omni.NewMachine(resources.DefaultNamespace, machineID) + + machine.Metadata().Finalizers().Add(controller.Name()) + require.NoError(machineSetNode.Metadata().SetOwner("some-owner")) + + require.NoError(suite.state.Create(suite.ctx, machineSetNode, state.WithCreateOwner("some-owner"))) + require.NoError(suite.state.Create(suite.ctx, machine)) + + rtestutils.Destroy[*omni.Machine](suite.ctx, suite.T(), suite.state, []string{machine.Metadata().ID()}) + + // MachineSetNode should still be around, as it is owned by a controller - CleanupController should skip it + assertResource[*omni.MachineSetNode](&suite.OmniSuite, machine.Metadata(), func(*omni.MachineSetNode, *assert.Assertions) {}) +} + +func TestMachineCleanupSuite(t *testing.T) { + suite.Run(t, new(MachineCleanupSuite)) +}