Skip to content

Commit

Permalink
update logic for snr health
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Shitrit <[email protected]>
  • Loading branch information
mshitrit committed Jan 9, 2024
1 parent cca2628 commit 743801a
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 41 deletions.
19 changes: 2 additions & 17 deletions controllers/selfnoderemediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package controllers
import (
"context"
"fmt"
"sync"
"time"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -87,8 +86,6 @@ var (
Value: "nodeshutdown",
Effect: v1.TaintEffectNoExecute,
}

lastSeenSnrNamespace string
)

type processingChangeReason string
Expand Down Expand Up @@ -118,13 +115,6 @@ func (e *UnreconcilableError) Error() string {
return e.msg
}

// GetLastSeenSnrNamespace returns the namespace of the last reconciled SNR
func (r *SelfNodeRemediationReconciler) GetLastSeenSnrNamespace() string {
r.mutex.Lock()
defer r.mutex.Unlock()
return lastSeenSnrNamespace
}

// SelfNodeRemediationReconciler reconciles a SelfNodeRemediation object
type SelfNodeRemediationReconciler struct {
client.Client
Expand All @@ -135,7 +125,6 @@ type SelfNodeRemediationReconciler struct {
Recorder record.EventRecorder
Rebooter reboot.Rebooter
MyNodeName string
mutex sync.Mutex
//we need to restore the node only after the cluster realized it can reschedule the affected workloads
//as of writing this lines, kubernetes will check for pods with non-existent node once in 20s, and allows
//40s of grace period for the node to reappear before it deletes the pods.
Expand Down Expand Up @@ -218,10 +207,6 @@ func (r *SelfNodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.
}
}

r.mutex.Lock()
lastSeenSnrNamespace = req.Namespace
r.mutex.Unlock()

result := ctrl.Result{}
var err error

Expand Down Expand Up @@ -645,7 +630,7 @@ func (r *SelfNodeRemediationReconciler) getNodeFromSnr(snr *v1alpha1.SelfNodeRem
//by a node based controller (e.g. NHC).
//In case snr is created with machine owner reference if NHC isn't it's owner it means
//it was created by a machine based controller (e.g. MHC).
if !r.isOwnedByNHC(snr) {
if !IsOwnedByNHC(snr) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return r.getNodeFromMachine(ownerRef, snr.Namespace)
Expand Down Expand Up @@ -927,7 +912,7 @@ func (r *SelfNodeRemediationReconciler) getRuntimeStrategy(strategy v1alpha1.Rem

}

func (r *SelfNodeRemediationReconciler) isOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool {
func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true
Expand Down
7 changes: 3 additions & 4 deletions pkg/peerhealth/client_server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,14 @@ var _ = Describe("Checking health using grpc client and server", func() {
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Namespace: "default",
OwnerReferences: []metav1.OwnerReference{
{Name: "Dummy", Kind: "NodeHealthCheck", APIVersion: "Dummy", UID: "Dummy"},
},
},
}
err := k8sClient.Create(context.Background(), snr)
Expect(err).ToNot(HaveOccurred())

// wait until reconciled
Eventually(func() bool {
return snrReconciler.GetLastSeenSnrNamespace() != ""
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue(), "SNR not reconciled")
})

It("should return unhealthy", func() {
Expand Down
35 changes: 15 additions & 20 deletions pkg/peerhealth/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,29 +119,24 @@ func (s Server) IsHealthy(ctx context.Context, request *HealthRequest) (*HealthR
return nil, fmt.Errorf("empty node name in HealthRequest")
}

s.log.Info("checking health for", "node", nodeName)

namespace := s.snr.GetLastSeenSnrNamespace()
isMachine := len(request.GetMachineName()) > 0

// when namespace is empty, there wasn't a SNR yet, which also means that the node must be healthy
if namespace == "" {
// we didn't see a SNR yet, so the node is healthy
// but we need to check for API error, so let's get node
if _, err := s.getNode(ctx, nodeName); err != nil {
// TODO do we need to deal with isNotFound, and if so, how?
s.log.Info("no SNR seen yet, and API server issue, returning API error", "api error", err)
return toResponse(selfNodeRemediationApis.ApiError)
}
s.log.Info("no SNR seen yet, node is healthy")
return toResponse(selfNodeRemediationApis.Healthy)
//fetch all snrs from all ns
snrs := &v1alpha1.SelfNodeRemediationList{}
if err := s.snr.List(ctx, snrs); err != nil {
s.log.Error(err, "failed to fetch snrs")
return nil, err
}

if isMachine {
return toResponse(s.isHealthyBySnr(ctx, request.MachineName, namespace))
} else {
return toResponse(s.isHealthyBySnr(ctx, nodeName, namespace))
//return healthy only if all of snrs are considered healthy for that node
for _, snr := range snrs.Items {
if controllers.IsOwnedByNHC(&snr) {
if healthCode := s.isHealthyBySnr(ctx, request.NodeName, snr.Namespace); healthCode != selfNodeRemediationApis.Healthy {
return toResponse(healthCode)
}
} else if healthCode := s.isHealthyBySnr(ctx, request.MachineName, snr.Namespace); healthCode != selfNodeRemediationApis.Healthy {
return toResponse(healthCode)
}
}
return toResponse(selfNodeRemediationApis.Healthy)
}

func (s Server) isHealthyBySnr(ctx context.Context, snrName string, snrNamespace string) selfNodeRemediationApis.HealthCheckResponseCode {
Expand Down

0 comments on commit 743801a

Please sign in to comment.