Skip to content

Commit

Permalink
Merge pull request #113 from razo7/events-maintenance
Browse files Browse the repository at this point in the history
Add Events for the Maintenance Process
  • Loading branch information
openshift-merge-bot[bot] authored Mar 24, 2024
2 parents 487905b + 4583f50 commit 70c6a5a
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 126 deletions.
18 changes: 11 additions & 7 deletions controllers/controllers_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (

"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/kubectl/pkg/drain"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -42,13 +43,14 @@ import (
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var (
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
r *NodeMaintenanceReconciler
drainer *drain.Helper
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
fakeRecorder *record.FakeRecorder
r *NodeMaintenanceReconciler
drainer *drain.Helper
)

func TestControllers(t *testing.T) {
Expand Down Expand Up @@ -86,12 +88,14 @@ var _ = BeforeSuite(func() {
Expect(k8sClient).NotTo(BeNil())

mockManager, _ := lease.NewManager(k8sClient, "")
fakeRecorder = record.NewFakeRecorder(20)
// Create a ReconcileNodeMaintenance object with the scheme and fake client
r = &NodeMaintenanceReconciler{
Client: k8sClient,
Scheme: scheme.Scheme,
MgrConfig: cfg,
LeaseManager: &mockLeaseManager{mockManager},
Recorder: fakeRecorder,
logger: ctrl.Log.WithName("unit test"),
}
ctx, cancel = context.WithCancel(ctrl.SetupSignalHandler())
Expand Down
44 changes: 36 additions & 8 deletions controllers/nodemaintenance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ import (
"github.com/medik8s/common/pkg/lease"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/klog"
"k8s.io/kubectl/pkg/cmd/util"
"k8s.io/kubectl/pkg/drain"
Expand All @@ -42,12 +43,15 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/medik8s/node-maintenance-operator/api/v1beta1"
"github.com/medik8s/node-maintenance-operator/pkg/utils"
)

const (
maxAllowedErrorToUpdateOwnedLease = 3
waitDurationOnDrainError = 5 * time.Second
FixedDurationReconcileLog = "Reconciling with fixed duration"
// An expected error from fetchNode function
expectedNodeNotFoundErrorMsg = "nodes \"%s\" not found"

//lease consts
LeaseHolderIdentity = "node-maintenance"
Expand All @@ -61,6 +65,7 @@ type NodeMaintenanceReconciler struct {
Scheme *runtime.Scheme
MgrConfig *rest.Config
LeaseManager lease.Manager
Recorder record.EventRecorder
logger logr.Logger
}

Expand Down Expand Up @@ -105,7 +110,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
nm := &v1beta1.NodeMaintenance{}
err := r.Client.Get(ctx, req.NamespacedName, nm)
if err != nil {
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
Expand All @@ -116,26 +121,27 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.logger.Info("Error reading the request object, requeuing.")
return emptyResult, err
}

// Add finalizer when object is created
drainer, err := createDrainer(ctx, r.MgrConfig)
if err != nil {
return emptyResult, err
}

if !controllerutil.ContainsFinalizer(nm, v1beta1.NodeMaintenanceFinalizer) && nm.ObjectMeta.DeletionTimestamp.IsZero() {
// Add finalizer when object is created
controllerutil.AddFinalizer(nm, v1beta1.NodeMaintenanceFinalizer)
if err := r.Client.Update(ctx, nm); err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
}
// begin maintenance on adding finalizer
utils.NormalEvent(r.Recorder, nm, utils.EventReasonBeginMaintenance, utils.EventMessageBeginMaintenance)
} else if controllerutil.ContainsFinalizer(nm, v1beta1.NodeMaintenanceFinalizer) && !nm.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is being deleted
r.logger.Info("Deletion timestamp not zero")

// Stop node maintenance - uncordon and remove live migration taint from the node.
if err := r.stopNodeMaintenanceOnDeletion(ctx, drainer, nm.Spec.NodeName); err != nil {
r.logger.Error(err, "error stopping node maintenance")
if !errors.IsNotFound(err) {
if !apiErrors.IsNotFound(err) {
return r.onReconcileError(ctx, nm, drainer, err)
}
}
Expand All @@ -145,6 +151,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err := r.Client.Update(ctx, nm); err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
}
// end maintenance on removing finalizer, taints, and node is already uncordoned
utils.NormalEvent(r.Recorder, nm, utils.EventReasonRemovedMaintenance, utils.EventMessageRemovedMaintenance)
return emptyResult, nil
}

Expand All @@ -165,7 +173,16 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.logger.Info("Applying maintenance mode", "node", nodeName, "reason", nm.Spec.Reason)
node, err := r.fetchNode(ctx, drainer, nodeName)
if err != nil {
return r.onReconcileError(ctx, nm, drainer, err)
if apiErrors.IsNotFound(err) {
r.logger.Error(err, "Didn't find a node matching the NodeName field", "NodeName", nodeName)
// maintenance has failed - nodeName doesn't match an existing node
utils.WarningEvent(r.Recorder, nm, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
nm.Status.Phase = v1beta1.MaintenanceFailed
return r.onReconcileError(ctx, nm, drainer, err)
} else {
r.logger.Error(err, "Unexpected error for the NodeName field", "NodeName", nodeName)
return r.onReconcileError(ctx, nm, drainer, err)
}
}

setOwnerRefToNode(nm, node, r.logger)
Expand All @@ -181,6 +198,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err != nil {
return r.onReconcileError(ctx, nm, drainer, fmt.Errorf("failed to uncordon upon failure to obtain owned lease : %v ", err))
}
// maintenance has failed - node was uncordon and under maintenance mode
utils.WarningEvent(r.Recorder, nm, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
nm.Status.Phase = v1beta1.MaintenanceFailed
}
return r.onReconcileError(ctx, nm, drainer, fmt.Errorf("failed to extend lease owned by us : %v errorOnLeaseCount %d", err, nm.Status.ErrorOnLeaseCount))
Expand All @@ -191,6 +210,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
} else {
if nm.Status.Phase != v1beta1.MaintenanceRunning || nm.Status.ErrorOnLeaseCount != 0 {
nm.Status.Phase = v1beta1.MaintenanceRunning
// Another chance to evict pods - clear ErrorOnLeaseCount and try again to put the node under maintenance
nm.Status.ErrorOnLeaseCount = 0

}
Expand All @@ -217,6 +237,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
waitOnReconcile := waitDurationOnDrainError
return r.onReconcileErrorWithRequeue(ctx, nm, drainer, err, &waitOnReconcile)
} else if nm.Status.Phase != v1beta1.MaintenanceSucceeded {
// maintenance has completed - node is under maintenance mode
utils.NormalEvent(r.Recorder, nm, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)
setLastUpdate(nm)
}

Expand Down Expand Up @@ -363,6 +385,8 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceImp(ctx context.Context,
return err
}

// stop maintenance - remove the added taints and uncordon the node
utils.NormalEvent(r.Recorder, node, utils.EventReasonUncordonNode, utils.EventMessageUncordonNode)
if err := r.LeaseManager.InvalidateLease(ctx, node); err != nil {
return err
}
Expand All @@ -373,7 +397,7 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceOnDeletion(ctx context.Co
node, err := r.fetchNode(ctx, drainer, nodeName)
if err != nil {
// if CR is gathered as result of garbage collection: the node may have been deleted, but the CR has not yet been deleted, still we must clean up the lease!
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
if err := r.LeaseManager.InvalidateLease(ctx, &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}); err != nil {
return err
}
Expand All @@ -386,7 +410,7 @@ func (r *NodeMaintenanceReconciler) stopNodeMaintenanceOnDeletion(ctx context.Co

func (r *NodeMaintenanceReconciler) fetchNode(ctx context.Context, drainer *drain.Helper, nodeName string) (*corev1.Node, error) {
node, err := drainer.Client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
if err != nil && errors.IsNotFound(err) {
if err != nil && apiErrors.IsNotFound(err) {
r.logger.Error(err, "Node cannot be found", "nodeName", nodeName)
return nil, err
} else if err != nil {
Expand Down Expand Up @@ -443,6 +467,10 @@ func (r *NodeMaintenanceReconciler) onReconcileErrorWithRequeue(ctx context.Cont
if updateErr != nil {
r.logger.Error(updateErr, "Failed to update NodeMaintenance with \"Failed\" status")
}
if nm.Spec.NodeName != "" && err.Error() == fmt.Sprintf(expectedNodeNotFoundErrorMsg, nm.Spec.NodeName) {
// don't return an error in case of a missing node, as it won't be found in the future.
return ctrl.Result{}, nil
}
if duration != nil {
r.logger.Info(FixedDurationReconcileLog)
return ctrl.Result{RequeueAfter: *duration}, nil
Expand Down
59 changes: 58 additions & 1 deletion controllers/nodemaintenance_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/node-maintenance-operator/api/v1beta1"
utils "github.com/medik8s/node-maintenance-operator/pkg/utils"
)

const (
Expand Down Expand Up @@ -181,6 +182,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(k8sClient.Create(ctx, podTwo)).To(Succeed())
DeferCleanup(cleanupPod, ctx, podOne)
DeferCleanup(cleanupPod, ctx, podTwo)
DeferCleanup(clearEvents)
})
JustBeforeEach(func() {
// Sleep for a second to ensure dummy reconciliation has begun running before the unit tests
Expand All @@ -205,6 +207,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(maintenance.Status.LastError).To(Equal(""))
Expect(maintenance.Status.LastUpdate).NotTo(BeZero())
Expect(maintenance.Status.ErrorOnLeaseCount).To(Equal(0))
verifyEvent(corev1.EventTypeNormal, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)

By("Check whether node was cordoned")
node := &corev1.Node{}
Expand All @@ -222,6 +225,7 @@ var _ = Describe("Node Maintenance", func() {
Expect(k8sClient.Delete(ctx, nm)).To(Succeed())
// Sleep for a second to ensure dummy reconciliation has begun running before the unit tests
time.Sleep(1 * time.Second)
verifyEvent(corev1.EventTypeNormal, utils.EventReasonRemovedMaintenance, utils.EventMessageRemovedMaintenance)

Expect(k8sClient.Get(ctx, client.ObjectKey{Name: nm.Spec.NodeName}, node)).NotTo(HaveOccurred())
_, exist := node.Labels[commonLabels.ExcludeFromRemediation]
Expand All @@ -239,14 +243,17 @@ var _ = Describe("Node Maintenance", func() {
maintenance := &v1beta1.NodeMaintenance{}
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nm), maintenance)).To(Succeed())

Expect(maintenance.Status.Phase).To(Equal(v1beta1.MaintenanceRunning))
Expect(len(maintenance.Status.PendingPods)).To(Equal(0))
Expect(maintenance.Status.EvictionPods).To(Equal(0))
Expect(maintenance.Status.TotalPods).To(Equal(0))
Expect(maintenance.Status.DrainProgress).To(Equal(0))
Expect(maintenance.Status.LastError).To(Equal(fmt.Sprintf("nodes \"%s\" not found", invalidNodeName)))
Expect(maintenance.Status.LastUpdate).NotTo(BeZero())
Expect(maintenance.Status.ErrorOnLeaseCount).To(Equal(0))
Expect(maintenance.Status.Phase).To(Equal(v1beta1.MaintenanceFailed))
verifyEvent(corev1.EventTypeNormal, utils.EventReasonBeginMaintenance, utils.EventMessageBeginMaintenance)
verifyEvent(corev1.EventTypeWarning, utils.EventReasonFailedMaintenance, utils.EventMessageFailedMaintenance)
verifyNoEvent(corev1.EventTypeNormal, utils.EventReasonSucceedMaintenance, utils.EventMessageSucceedMaintenance)
})
})
})
Expand Down Expand Up @@ -350,6 +357,56 @@ func getTestNM(crName, nodeName string) *v1beta1.NodeMaintenance {
}
}

func verifyEvent(eventType, eventReason, eventMessage string) {
By(fmt.Sprintf("Verifying that event %s was created", eventReason))
isEventMatch := isEventOccurred(eventType, eventReason, eventMessage)
ExpectWithOffset(1, isEventMatch).To(BeTrue())
}

func verifyNoEvent(eventType, eventReason, eventMessage string) {
By(fmt.Sprintf("Verifying that event %s was not created", eventReason))
isEventMatch := isEventOccurred(eventType, eventReason, eventMessage)
ExpectWithOffset(1, isEventMatch).To(BeFalse())
}

// isEventOccurred checks whether an event has occoured
func isEventOccurred(eventType, eventReason, eventMessage string) bool {
expected := fmt.Sprintf("%s %s %s", eventType, eventReason, eventMessage)
isEventMatch := false

unMatchedEvents := make(chan string, len(fakeRecorder.Events))
isDone := false
for {
select {
case event := <-fakeRecorder.Events:
if isEventMatch = event == expected; isEventMatch {
isDone = true
} else {
unMatchedEvents <- event
}
default:
isDone = true
}
if isDone {
break
}
}

close(unMatchedEvents)
for unMatchedEvent := range unMatchedEvents {
fakeRecorder.Events <- unMatchedEvent
}
return isEventMatch
}

// clearEvents loop over the events channel until it is empty from events
func clearEvents() {
for len(fakeRecorder.Events) > 0 {
<-fakeRecorder.Events
}
testLog.Info("Cleanup: events list is empty")
}

type mockLeaseManager struct {
lease.Manager
}
Expand Down
8 changes: 5 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
WebhookCertDir = "/apiserver.local.config/certificates"
WebhookCertName = "apiserver.crt"
WebhookKeyName = "apiserver.key"
operatorName = "NodeMaintenance"
)

var (
Expand Down Expand Up @@ -108,7 +109,7 @@ func main() {
cl := mgr.GetClient()
leaseManagerInitializer := &leaseManagerInitializer{cl: cl}
if err := mgr.Add(leaseManagerInitializer); err != nil {
setupLog.Error(err, "unable to set up lease Manager", "lease", "NodeMaintenance")
setupLog.Error(err, "unable to set up lease Manager", "lease", operatorName)
os.Exit(1)
}

Expand All @@ -128,12 +129,13 @@ func main() {
Scheme: mgr.GetScheme(),
MgrConfig: mgr.GetConfig(),
LeaseManager: leaseManagerInitializer,
Recorder: mgr.GetEventRecorderFor(operatorName),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeMaintenance")
setupLog.Error(err, "unable to create controller", "controller", operatorName)
os.Exit(1)
}
if err = (&nodemaintenancev1beta1.NodeMaintenance{}).SetupWebhookWithManager(isOpenShift, mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "NodeMaintenance")
setupLog.Error(err, "unable to create webhook", "webhook", operatorName)
os.Exit(1)
}
//+kubebuilder:scaffold:builder
Expand Down
33 changes: 33 additions & 0 deletions pkg/utils/events.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package utils

import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
)

const (
// events reasons
EventReasonBeginMaintenance = "BeginMaintenance"
EventReasonFailedMaintenance = "FailedMaintenance"
EventReasonSucceedMaintenance = "SucceedMaintenance"
EventReasonUncordonNode = "UncordonNode"
EventReasonRemovedMaintenance = "RemovedMaintenance"

// events messages
EventMessageBeginMaintenance = "Begin a node maintenance"
EventMessageFailedMaintenance = "Failed a node maintenance"
EventMessageSucceedMaintenance = "Node maintenance was succeeded"
EventMessageUncordonNode = "Uncordon a node"
EventMessageRemovedMaintenance = "Removed a node maintenance"
)

// NormalEvent will record an event with type Normal and fixed message.
func NormalEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) {
recorder.Event(object, corev1.EventTypeNormal, reason, message)
}

// WarningEvent will record an event with type Warning and fixed message.
func WarningEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) {
recorder.Event(object, corev1.EventTypeWarning, reason, message)
}
Loading

0 comments on commit 70c6a5a

Please sign in to comment.