Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Events for the Maintenance Process #113

Merged
merged 5 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions controllers/controllers_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (

"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/kubectl/pkg/drain"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -42,13 +43,14 @@ import (
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var (
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
r *NodeMaintenanceReconciler
drainer *drain.Helper
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
fakeRecorder *record.FakeRecorder
r *NodeMaintenanceReconciler
drainer *drain.Helper
)

func TestControllers(t *testing.T) {
Expand Down Expand Up @@ -86,12 +88,14 @@ var _ = BeforeSuite(func() {
Expect(k8sClient).NotTo(BeNil())

mockManager, _ := lease.NewManager(k8sClient, "")
fakeRecorder = record.NewFakeRecorder(20)
// Create a ReconcileNodeMaintenance object with the scheme and fake client
r = &NodeMaintenanceReconciler{
Client: k8sClient,
Scheme: scheme.Scheme,
MgrConfig: cfg,
LeaseManager: &mockLeaseManager{mockManager},
Recorder: fakeRecorder,
logger: ctrl.Log.WithName("unit test"),
}
ctx, cancel = context.WithCancel(ctrl.SetupSignalHandler())
Expand Down
44 changes: 36 additions & 8 deletions controllers/nodemaintenance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ import (
"github.com/medik8s/common/pkg/lease"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/klog"
"k8s.io/kubectl/pkg/cmd/util"
"k8s.io/kubectl/pkg/drain"
Expand All @@ -42,12 +43,15 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/medik8s/node-maintenance-operator/api/v1beta1"
"github.com/medik8s/node-maintenance-operator/pkg/utils"
)

const (
maxAllowedErrorToUpdateOwnedLease = 3
waitDurationOnDrainError = 5 * time.Second
FixedDurationReconcileLog = "Reconciling with fixed duration"
// An expected error from fetchNode function
expectedNodeNotFoundErrorMsg = "nodes \"%s\" not found"

//lease consts
LeaseHolderIdentity = "node-maintenance"
Expand All @@ -61,6 +65,7 @@ type NodeMaintenanceReconciler struct {
Scheme *runtime.Scheme
MgrConfig *rest.Config
LeaseManager lease.Manager
Recorder record.EventRecorder
logger logr.Logger
}

Expand Down Expand Up @@ -105,7 +110,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
nm := &v1beta1.NodeMaintenance{}
err := r.Client.Get(ctx, req.NamespacedName, nm)
if err != nil {
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
Expand All @@ -116,26 +121,27 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.logger.Info("Error reading the request object, requeuing.")
return emptyResult, err
}

// Add finalizer when object is created
drainer, err := createDrainer(ctx, r.MgrConfig)
if err != nil {
return emptyResult, err
}

if !controllerutil.ContainsFinalizer(nm, v1beta1.NodeMaintenanceFinalizer) && nm.ObjectMeta.DeletionTimestamp.IsZero() {
// Add finalizer when object is created
controllerutil.AddFinalizer(nm, v1beta1.NodeMaintenanceFinalizer)
if err := r.Client.Update(ctx, nm); err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
}
// begin maintenance on adding finalizer
utils.NormalEvent(r.Recorder, nm, utils.EventReasonBeginMaintenance, utils.EventMessageBeginMaintenance)
} else if controllerutil.ContainsFinalizer(nm, v1beta1.NodeMaintenanceFinalizer) && !nm.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is being deleted
r.logger.Info("Deletion timestamp not zero")

// Stop node maintenance - uncordon and remove live migration taint from the node.
if err := r.stopNodeMaintenanceOnDeletion(ctx, drainer, nm.Spec.NodeName); err != nil {
r.logger.Error(err, "error stopping node maintenance")
if !errors.IsNotFound(err) {
if !apiErrors.IsNotFound(err) {
return r.onReconcileError(ctx, nm, drainer, err)
}
}
Expand All @@ -145,6 +151,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err := r.Client.Update(ctx, nm); err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
}
// end maintenance on removing finalizer, taints, and node is already uncordoned
utils.NormalEvent(r.Recorder, nm, utils.EventReasonRemovedMaintenance, utils.EventMessageRemovedMaintenance)
return emptyResult, nil
}

Expand All @@ -159,7 +167,16 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.logger.Info("Applying maintenance mode", "node", nodeName, "reason", nm.Spec.Reason)
node, err := r.fetchNode(ctx, drainer, nodeName)
if err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
if apiErrors.IsNotFound(err) {
r.logger.Error(err, "Didn't find a node matching the NodeName field", "NodeName", nodeName)
// maintenance has failed - nodeName doesn't match an existing node
utils.WarningEvent(r.Recorder, nm, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
nm.Status.Phase = v1beta1.MaintenanceFailed
return r.onReconcileError(ctx, nm, drainer, err)
mshitrit marked this conversation as resolved.
Show resolved Hide resolved
} else {
r.logger.Error(err, "Unexpected error for the NodeName field", "NodeName", nodeName)
return r.onReconcileError(ctx, nm, drainer, err)
}
}

setOwnerRefToNode(nm, node, r.logger)
Expand All @@ -175,6 +192,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err != nil {
return r.onReconcileError(ctx, nm, drainer, fmt.Errorf("failed to uncordon upon failure to obtain owned lease : %v ", err))
}
// maintenance has failed - node was uncordon and under maintenance mode
utils.WarningEvent(r.Recorder, nm, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
nm.Status.Phase = v1beta1.MaintenanceFailed
}
return r.onReconcileError(ctx, nm, drainer, fmt.Errorf("failed to extend lease owned by us : %v errorOnLeaseCount %d", err, nm.Status.ErrorOnLeaseCount))
Expand All @@ -185,6 +204,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
} else {
if nm.Status.Phase != v1beta1.MaintenanceRunning || nm.Status.ErrorOnLeaseCount != 0 {
nm.Status.Phase = v1beta1.MaintenanceRunning
// Another chance to evict pods - clear ErrorOnLeaseCount and try again to put the node under maintenance
nm.Status.ErrorOnLeaseCount = 0

}
Expand All @@ -211,6 +231,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
waitOnReconcile := waitDurationOnDrainError
return r.onReconcileErrorWithRequeue(ctx, nm, drainer, err, &waitOnReconcile)
} else if nm.Status.Phase != v1beta1.MaintenanceSucceeded {
// maintenance has completed - node is under maintenance mode
utils.NormalEvent(r.Recorder, nm, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)
setLastUpdate(nm)
}

Expand Down Expand Up @@ -357,6 +379,8 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceImp(ctx context.Context,
return err
}

// stop maintenance - remove the added taints and uncordon the node
utils.NormalEvent(r.Recorder, node, utils.EventReasonUncordonNode, utils.EventMessageUncordonNode)
if err := r.LeaseManager.InvalidateLease(ctx, node); err != nil {
return err
}
Expand All @@ -367,7 +391,7 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceOnDeletion(ctx context.Co
node, err := r.fetchNode(ctx, drainer, nodeName)
if err != nil {
// if CR is gathered as result of garbage collection: the node may have been deleted, but the CR has not yet been deleted, still we must clean up the lease!
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
if err := r.LeaseManager.InvalidateLease(ctx, &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}); err != nil {
return err
}
Expand All @@ -380,7 +404,7 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceOnDeletion(ctx context.Co

func (r *NodeMaintenanceReconciler) fetchNode(ctx context.Context, drainer *drain.Helper, nodeName string) (*corev1.Node, error) {
node, err := drainer.Client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
if err != nil && errors.IsNotFound(err) {
if err != nil && apiErrors.IsNotFound(err) {
r.logger.Error(err, "Node cannot be found", "nodeName", nodeName)
return nil, err
} else if err != nil {
Expand Down Expand Up @@ -436,6 +460,10 @@ func (r *NodeMaintenanceReconciler) onReconcileErrorWithRequeue(ctx context.Cont
if updateErr != nil {
r.logger.Error(updateErr, "Failed to update NodeMaintenance with \"Failed\" status")
}
if nm.Spec.NodeName != "" && err.Error() == fmt.Sprintf(expectedNodeNotFoundErrorMsg, nm.Spec.NodeName) {
// don't return an error in case of a missing node, as it won't be found in the future.
return ctrl.Result{}, nil
}
if duration != nil {
r.logger.Info(FixedDurationReconcileLog)
return ctrl.Result{RequeueAfter: *duration}, nil
Expand Down
59 changes: 58 additions & 1 deletion controllers/nodemaintenance_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/node-maintenance-operator/api/v1beta1"
utils "github.com/medik8s/node-maintenance-operator/pkg/utils"
)

const (
Expand Down Expand Up @@ -178,6 +179,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(k8sClient.Create(ctx, podTwo)).To(Succeed())
DeferCleanup(cleanupPod, ctx, podOne)
DeferCleanup(cleanupPod, ctx, podTwo)
DeferCleanup(clearEvents)
})
JustBeforeEach(func() {
// Sleep for a second to ensure dummy reconciliation has begun running before the unit tests
Expand All @@ -202,6 +204,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(maintenance.Status.LastError).To(Equal(""))
Expect(maintenance.Status.LastUpdate).NotTo(BeZero())
Expect(maintenance.Status.ErrorOnLeaseCount).To(Equal(0))
verifyEvent(corev1.EventTypeNormal, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)

By("Check whether node was cordoned")
node := &corev1.Node{}
Expand All @@ -219,6 +222,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(k8sClient.Delete(ctx, nm)).To(Succeed())
// Sleep for a second to ensure dummy reconciliation has begun running before the unit tests
time.Sleep(1 * time.Second)
verifyEvent(corev1.EventTypeNormal, utils.EventReasonRemovedMaintenance, utils.EventMessageRemovedMaintenance)

Expect(k8sClient.Get(ctx, client.ObjectKey{Name: nm.Spec.NodeName}, node)).NotTo(HaveOccurred())
_, exist := node.Labels[commonLabels.ExcludeFromRemediation]
Expand All @@ -236,14 +240,17 @@ var _ = Describe("Node Maintenance", func() {
maintenance := &v1beta1.NodeMaintenance{}
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nm), maintenance)).To(Succeed())

Expect(maintenance.Status.Phase).To(Equal(v1beta1.MaintenanceRunning))
Expect(len(maintenance.Status.PendingPods)).To(Equal(0))
Expect(maintenance.Status.EvictionPods).To(Equal(0))
Expect(maintenance.Status.TotalPods).To(Equal(0))
Expect(maintenance.Status.DrainProgress).To(Equal(0))
Expect(maintenance.Status.LastError).To(Equal(fmt.Sprintf("nodes \"%s\" not found", invalidNodeName)))
Expect(maintenance.Status.LastUpdate).NotTo(BeZero())
Expect(maintenance.Status.ErrorOnLeaseCount).To(Equal(0))
Expect(maintenance.Status.Phase).To(Equal(v1beta1.MaintenanceFailed))
verifyEvent(corev1.EventTypeNormal, utils.EventReasonBeginMaintenance, utils.EventMessageBeginMaintenance)
verifyEvent(corev1.EventTypeWarning, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
verifyNoEvent(corev1.EventTypeNormal, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)
})
})
})
Expand Down Expand Up @@ -347,6 +354,56 @@ func getTestNM(crName, nodeName string) *v1beta1.NodeMaintenance {
}
}

func verifyEvent(eventType, eventReason, eventMessage string) {
By(fmt.Sprintf("Verifying that event %s was created", eventReason))
isEventMatch := isEventOccurred(eventType, eventReason, eventMessage)
ExpectWithOffset(1, isEventMatch).To(BeTrue())
}

func verifyNoEvent(eventType, eventReason, eventMessage string) {
By(fmt.Sprintf("Verifying that event %s was not created", eventReason))
isEventMatch := isEventOccurred(eventType, eventReason, eventMessage)
ExpectWithOffset(1, isEventMatch).To(BeFalse())
}

// isEventOccurred checks whether an event has occoured
func isEventOccurred(eventType, eventReason, eventMessage string) bool {
expected := fmt.Sprintf("%s %s %s", eventType, eventReason, eventMessage)
isEventMatch := false

unMatchedEvents := make(chan string, len(fakeRecorder.Events))
isDone := false
for {
select {
case event := <-fakeRecorder.Events:
if isEventMatch = event == expected; isEventMatch {
isDone = true
} else {
unMatchedEvents <- event
}
default:
isDone = true
}
if isDone {
break
}
}

close(unMatchedEvents)
for unMatchedEvent := range unMatchedEvents {
fakeRecorder.Events <- unMatchedEvent
}
return isEventMatch
}

// clearEvents loop over the events channel until it is empty from events
func clearEvents() {
for len(fakeRecorder.Events) > 0 {
<-fakeRecorder.Events
}
testLog.Info("Cleanup: events list is empty")
}

type mockLeaseManager struct {
lease.Manager
}
Expand Down
8 changes: 5 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
WebhookCertDir = "/apiserver.local.config/certificates"
WebhookCertName = "apiserver.crt"
WebhookKeyName = "apiserver.key"
operatorName = "NodeMaintenance"
)

var (
Expand Down Expand Up @@ -108,7 +109,7 @@ func main() {
cl := mgr.GetClient()
leaseManagerInitializer := &leaseManagerInitializer{cl: cl}
if err := mgr.Add(leaseManagerInitializer); err != nil {
setupLog.Error(err, "unable to set up lease Manager", "lease", "NodeMaintenance")
setupLog.Error(err, "unable to set up lease Manager", "lease", operatorName)
os.Exit(1)
}

Expand All @@ -128,12 +129,13 @@ func main() {
Scheme: mgr.GetScheme(),
MgrConfig: mgr.GetConfig(),
LeaseManager: leaseManagerInitializer,
Recorder: mgr.GetEventRecorderFor(operatorName),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeMaintenance")
setupLog.Error(err, "unable to create controller", "controller", operatorName)
os.Exit(1)
}
if err = (&nodemaintenancev1beta1.NodeMaintenance{}).SetupWebhookWithManager(isOpenShift, mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "NodeMaintenance")
setupLog.Error(err, "unable to create webhook", "webhook", operatorName)
os.Exit(1)
}
//+kubebuilder:scaffold:builder
Expand Down
33 changes: 33 additions & 0 deletions pkg/utils/events.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package utils

import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
)

const (
// events reasons
EventReasonBeginMaintenance = "BeginMaintenance"
EventReasonFailedMaintenance = "FailedMaintenance"
EventReasonSucceedMaintenance = "SucceedMaintenance"
EventReasonUncordonNode = "UncordonNode"
EventReasonRemovedMaintenance = "RemovedMaintenance"

// events messages
EventMessageBeginMaintenance = "Begin a node maintenance"
EventMessageFailedMaintenance = "Failed a node maintenance"
EventMessageSucceedMaintenance = "Node maintenance was succeeded"
EventMessageUncordonNode = "Uncordon a node"
EventMessageRemovedMaintenance = "Removed a node maintenance"
)

// NormalEvent will record an event with type Normal and fixed message.
func NormalEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) {
recorder.Event(object, corev1.EventTypeNormal, reason, message)
}

// WarningEvent will record an event with type Warning and fixed message.
func WarningEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) {
recorder.Event(object, corev1.EventTypeWarning, reason, message)
}
Loading
Loading