Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some fixes and e2e test improvements #226

Merged
merged 2 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ KUSTOMIZE_VERSION = v5.3.0
ENVTEST_VERSION = v0.0.0-20240215124517-56159419231e

# versions at https://github.com/slintes/sort-imports/tags
SORT_IMPORTS_VERSION = v0.2.1
SORT_IMPORTS_VERSION = v0.3.0

# version at https://github.com/a8m/envsubst/releases
ENVSUBST_VERSION = v1.4.2
Expand Down Expand Up @@ -415,9 +415,9 @@ protoc-gen-go-grpc: ## Download protoc-gen-go-grpc locally if necessary.

.PHONY: e2e-test
e2e-test:
# KUBECONFIG must be set to the cluster, and PP needs to be deployed already
# KUBECONFIG must be set to the cluster, and SNR needs to be deployed already
# count arg makes the test ignoring cached test results
go test ./e2e -ginkgo.vv -test.v -timeout 60m -count=1 ${TEST_OPS}
go test ./e2e -ginkgo.vv -test.v -timeout 80m -count=1 ${TEST_OPS}

YQ = $(shell pwd)/bin/yq
.PHONY: yq
Expand Down
96 changes: 96 additions & 0 deletions controllers/owner_and_name.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package controllers

import (
"context"
"errors"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
)

// IsSNRMatching checks if the SNR CR is matching the node or machine name,
// and additionally returns the node name for the SNR in case machineName is empty
func IsSNRMatching(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, nodeName string, machineName string, log logr.Logger) (bool, string, error) {
if isOwnedByMachine, ref := isOwnedByMachine(snr); isOwnedByMachine && machineName == ref.Name {
return true, "", nil
}
snrNodeName, err := getNodeName(ctx, c, snr, log)
if err != nil {
log.Error(err, "failed to get node name from machine")
return false, "", err
}
return snrNodeName == nodeName, snrNodeName, nil
}

// getNodeName gets the node name:
// - if owned by NHC, or as fallback, from annotation or CR name
// - if owned by a Machine, from the Machine's node reference
func getNodeName(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, log logr.Logger) (string, error) {
// NHC has priority, so check it first: in case the SNR is owned by NHC, get the node name from annotation or CR name
if ownedByNHC, _ := isOwnedByNHC(snr); ownedByNHC {
return getNodeNameDirect(snr), nil
}
// in case the SNR is owned by a Machine, we need to check the Machine's nodeRef
if ownedByMachine, ref := isOwnedByMachine(snr); ownedByMachine {
return getNodeNameFromMachine(ctx, c, ref, snr.GetNamespace(), log)
}
// fallback: annotation or name
return getNodeNameDirect(snr), nil
}

func getNodeNameDirect(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}

// isOwnedByNHC checks if the SNR CR is owned by a NodeHealthCheck CR.
func isOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true, &ownerRef
}
}
return false, nil
}

// isOwnedByMachine checks if the SNR CR is owned by a Machine CR.
func isOwnedByMachine(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return true, &ownerRef
}
}
return false, nil
}

func getNodeNameFromMachine(ctx context.Context, c client.Client, ref *metav1.OwnerReference, ns string, log logr.Logger) (string, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := c.Get(ctx, machineKey, machine); err != nil {
log.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return "", err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
log.Error(err, "failed to retrieve node from the unhealthy machine")
return "", err
}

return machine.Status.NodeRef.Name, nil
}
90 changes: 14 additions & 76 deletions controllers/selfnoderemediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"time"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"
"github.com/medik8s/common/pkg/events"
"github.com/medik8s/common/pkg/resources"
"github.com/pkg/errors"
Expand All @@ -39,8 +38,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/utils"
Expand Down Expand Up @@ -172,8 +169,12 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
return ctrl.Result{}, err
}

targetNodeName := getNodeName(snr)
if targetNodeName != r.MyNodeName {
snrMatches, targetNodeName, err := IsSNRMatching(ctx, r.Client, snr, r.MyNodeName, "", r.logger)
if err != nil {
r.logger.Error(err, "failed to check if SNR matches our node")
return ctrl.Result{}, err
}
if !snrMatches {
r.logger.Info("agent pod skipping remediation because node belongs to a different agent", "Agent node name", r.MyNodeName, "Remediated node name", targetNodeName)
return ctrl.Result{}, nil
}
Expand All @@ -183,7 +184,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
switch phase {
case preRebootCompletedPhase:
r.logger.Info("node reboot not completed yet, start rebooting")
node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
r.logger.Info("didn't find node, eventing might be incomplete", "node name", targetNodeName)
}
Expand Down Expand Up @@ -254,7 +255,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileManager(ctx context.Context, re
result := ctrl.Result{}
var err error

node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
if apiErrors.IsNotFound(err) {
r.logger.Info("couldn't find node matching remediation", "remediation name", snr.Name)
Expand Down Expand Up @@ -688,65 +689,20 @@ func (r *SelfNodeRemediationReconciler) setTimeAssumedRebooted(ctx context.Conte
}

// getNodeFromSnr returns the unhealthy node reported in the given snr
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
// SNR could be created by either machine based controller (e.g. MHC) or
// by a node based controller (e.g. NHC).
// In case snr is created with machine owner reference if NHC isn't it's owner it means
// it was created by a machine based controller (e.g. MHC).
if !IsOwnedByNHC(snr) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return r.getNodeFromMachine(ownerRef, snr.Namespace)
}
}
}

// since we didn't find a Machine owner ref, we assume that SNR remediation contains the node's name either in the
// remediation name or in its annotation
node := &v1.Node{}
key := client.ObjectKey{
Name: getNodeName(snr),
Namespace: "",
}

if err := r.Get(context.TODO(), key, node); err != nil {
return nil, err
}

return node, nil
}

func (r *SelfNodeRemediationReconciler) getNodeFromMachine(ref metav1.OwnerReference, ns string) (*v1.Node, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := r.Client.Get(context.Background(), machineKey, machine); err != nil {
r.logger.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return nil, err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
r.logger.Error(err, "failed to retrieve node from the unhealthy machine")
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(ctx context.Context, snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
nodeName, err := getNodeName(ctx, r.Client, snr, r.logger)
if err != nil {
return nil, err
}

node := &v1.Node{}
key := client.ObjectKey{
Name: machine.Status.NodeRef.Name,
Namespace: machine.Status.NodeRef.Namespace,
Name: nodeName,
Namespace: "",
}

if err := r.Get(context.Background(), key, node); err != nil {
r.logger.Error(err, "failed to retrieve node from the unhealthy machine",
"node name", node.Name, "machine name", machine.Name)
if err := r.Get(ctx, key, node); err != nil {
return nil, err
}

return node, nil
}

Expand Down Expand Up @@ -950,21 +906,3 @@ func (r *SelfNodeRemediationReconciler) getRuntimeStrategy(snr *v1alpha1.SelfNod
return remediationStrategy

}

func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true
}
}
return false
}

// getNodeName checks for the node name in SNR CR's annotation. If it does not exist it assumes the node name equals to SNR CR's name and returns it.
func getNodeName(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ var _ = Describe("SNR Controller", func() {
verifyEvent("Warning", "RemediationCannotStart", "Could not get remediation target Node")
})
})
When("NHC isn set as owner in the remediation", func() {
When("NHC is set as owner in the remediation", func() {
BeforeEach(func() {
snr.OwnerReferences = append(snr.OwnerReferences, metav1.OwnerReference{Name: "nhc", Kind: "NodeHealthCheck", APIVersion: "remediation.medik8s.io/v1alpha1", UID: "12345"})
})
Expand Down Expand Up @@ -625,7 +625,7 @@ func removeUnschedulableTaint() {
func verifyNodeIsUnschedulable() *v1.Node {
By("Verify that node was marked as unschedulable")
node := &v1.Node{}
Eventually(func() (bool, error) {
EventuallyWithOffset(1, func() (bool, error) {
err := k8sClient.Client.Get(context.TODO(), unhealthyNodeNamespacedName, node)
return node.Spec.Unschedulable, err
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue(), "node should be marked as unschedulable")
Expand Down
Loading
Loading