Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop remediation when node not found #138

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions controllers/selfnoderemediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,10 @@ var (
type processingChangeReason string

const (
remediationStarted processingChangeReason = "RemediationStarted"
remediationTerminatedByNHC processingChangeReason = "RemediationStoppedByNHC"
remediationFinished processingChangeReason = "RemediationFinished"
remediationStarted processingChangeReason = "RemediationStarted"
remediationTerminatedByNHC processingChangeReason = "RemediationStoppedByNHC"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: I would follow the pattern of the const name match its value by updating the const name to be remediationStoppedByNHC

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and NHC is signalling a timeout actually, not stopping or terminating anything 😉

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too many verbs for one action :D

remediationFinishedSuccessfully processingChangeReason = "RemediationFinishedSuccessfully"
remediationFinishedNodeNotFound processingChangeReason = "RemediationFinishedNodeNotFound"
)

type remediationPhase string
Expand Down Expand Up @@ -219,12 +220,15 @@ func (r *SelfNodeRemediationReconciler) updateConditions(processingTypeReason pr
case remediationStarted:
processingConditionStatus = metav1.ConditionTrue
succeededConditionStatus = metav1.ConditionUnknown
case remediationFinished:
case remediationFinishedSuccessfully:
processingConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionTrue
case remediationTerminatedByNHC:
processingConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionFalse
case remediationFinishedNodeNotFound:
processingConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionFalse
default:
err := fmt.Errorf("unkown processingChangeReason:%s", processingTypeReason)
r.Log.Error(err, "couldn't update snr processing condition")
Expand All @@ -243,11 +247,10 @@ func (r *SelfNodeRemediationReconciler) updateConditions(processingTypeReason pr
})

//Reason is mandatory, and reason for processing matches succeeded
succeededTypeReason := string(processingTypeReason)
meta.SetStatusCondition(&snr.Status.Conditions, metav1.Condition{
Type: v1alpha1.SucceededConditionType,
Status: succeededConditionStatus,
Reason: succeededTypeReason,
Reason: string(processingTypeReason),
})

return nil
Expand Down Expand Up @@ -377,6 +380,13 @@ type removeNodeResources func(*v1.Node, *v1alpha1.SelfNodeRemediation) (time.Dur
func (r *SelfNodeRemediationReconciler) remediateWithResourceRemoval(snr *v1alpha1.SelfNodeRemediation, rmNodeResources removeNodeResources) (ctrl.Result, error) {
node, err := r.getNodeFromSnr(snr)
if err != nil {
if apiErrors.IsNotFound(err) {
r.logger.Info("couldn't find node matching remediation", "node name", snr.Name)
if err = r.updateConditions(remediationFinishedNodeNotFound, snr); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
r.logger.Error(err, "failed to get node", "node name", snr.Name)
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -469,7 +479,7 @@ func (r *SelfNodeRemediationReconciler) handleRebootCompletedPhase(node *v1.Node
fencingCompleted := string(fencingCompletedPhase)
snr.Status.Phase = &fencingCompleted

return ctrl.Result{}, r.updateConditions(remediationFinished, snr)
return ctrl.Result{}, r.updateConditions(remediationFinishedSuccessfully, snr)
}

func (r *SelfNodeRemediationReconciler) handleFencingCompletedPhase(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) {
Expand Down
71 changes: 42 additions & 29 deletions controllers/selfnoderemediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ const (
)

var _ = Describe("snr Controller", func() {
snr := &v1alpha1.SelfNodeRemediation{}
snr.Name = unhealthyNodeName
snr.Namespace = snrNamespace
var snr *v1alpha1.SelfNodeRemediation

BeforeEach(func() {
k8sClient.ShouldSimulateFailure = false
snr = &v1alpha1.SelfNodeRemediation{}
snr.Name = unhealthyNodeName
snr.Namespace = snrNamespace
})

AfterEach(func() {
Expand Down Expand Up @@ -66,7 +67,7 @@ var _ = Describe("snr Controller", func() {
//be in a safe state (i.e. rebooted)
var remediationStrategy v1alpha1.RemediationStrategyType
JustBeforeEach(func() {
createSNR(remediationStrategy)
createSNR(snr, remediationStrategy)
})

AfterEach(func() {
Expand Down Expand Up @@ -104,7 +105,7 @@ var _ = Describe("snr Controller", func() {

BeforeEach(func() {
createSelfNodeRemediationPod()
createSNR(v1alpha1.ResourceDeletionRemediationStrategy)
createSNR(snr, v1alpha1.ResourceDeletionRemediationStrategy)
})

AfterEach(func() {
Expand Down Expand Up @@ -139,13 +140,22 @@ var _ = Describe("snr Controller", func() {
Context("Unhealthy node with api-server access", func() {
var remediationStrategy v1alpha1.RemediationStrategyType
var isSNRNeedsDeletion = true
var vaName = "some-va"
var isSetResources = true
JustBeforeEach(func() {
createSelfNodeRemediationPod()
updateIsRebootCapable("true")
createSNR(remediationStrategy)
if isSetResources {
createVolumeAttachment(vaName)
createSelfNodeRemediationPod()
updateIsRebootCapable("true")
By("make sure self node remediation exists with correct label")
verifySelfNodeRemediationPodExist()
}

By("make sure self node remediation exists with correct label")
verifySelfNodeRemediationPodExist()
createSNR(snr, remediationStrategy)
})

BeforeEach(func() {
isSetResources = true
})

AfterEach(func() {
Expand All @@ -156,11 +166,9 @@ var _ = Describe("snr Controller", func() {
})

Context("ResourceDeletion strategy", func() {
var vaName = "some-va"

BeforeEach(func() {
remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy
createVolumeAttachment(vaName)
})

AfterEach(func() {
Expand All @@ -172,7 +180,7 @@ var _ = Describe("snr Controller", func() {

addUnschedulableTaint(node)

verifyTypeConditions(metav1.ConditionTrue, metav1.ConditionUnknown)
verifyTypeConditions(snr.Name, metav1.ConditionTrue, metav1.ConditionUnknown, "RemediationStarted")

verifyTimeHasBeenRebootedExists()

Expand All @@ -186,7 +194,7 @@ var _ = Describe("snr Controller", func() {

verifyNoExecuteTaintExist()

verifyTypeConditions(metav1.ConditionFalse, metav1.ConditionTrue)
verifyTypeConditions(snr.Name, metav1.ConditionFalse, metav1.ConditionTrue, "RemediationFinishedSuccessfully")

deleteSNR(snr)
isSNRNeedsDeletion = false
Expand All @@ -208,7 +216,7 @@ var _ = Describe("snr Controller", func() {

addUnschedulableTaint(node)

verifyTypeConditions(metav1.ConditionTrue, metav1.ConditionUnknown)
verifyTypeConditions(snr.Name, metav1.ConditionTrue, metav1.ConditionUnknown, "RemediationStarted")

verifyTimeHasBeenRebootedExists()

Expand Down Expand Up @@ -236,15 +244,21 @@ var _ = Describe("snr Controller", func() {
verifySNRDoesNotExists()

})
When("Node isn't found", func() {
BeforeEach(func() {
isSetResources = false
snr.Name = "non-existing-node"
})

It("remediation should stop and update conditions", func() {
verifyTypeConditions(snr.Name, metav1.ConditionFalse, metav1.ConditionFalse, "RemediationFinishedNodeNotFound")
})
})
})

Context("OutOfServiceTaint strategy", func() {
var vaName = "some-va"

BeforeEach(func() {
remediationStrategy = v1alpha1.OutOfServiceTaintRemediationStrategy
createVolumeAttachment(vaName)
})

AfterEach(func() {
Expand All @@ -256,7 +270,7 @@ var _ = Describe("snr Controller", func() {

addUnschedulableTaint(node)

verifyTypeConditions(metav1.ConditionTrue, metav1.ConditionUnknown)
verifyTypeConditions(snr.Name, metav1.ConditionTrue, metav1.ConditionUnknown, "RemediationStarted")

// The normal NoExecute taint tries to delete pods, however it can't delete pods
// with stateful workloads like volumes and they are stuck in terminating status.
Expand All @@ -278,7 +292,7 @@ var _ = Describe("snr Controller", func() {

verifyOutOfServiceTaintRemoved()

verifyTypeConditions(metav1.ConditionFalse, metav1.ConditionTrue)
verifyTypeConditions(snr.Name, metav1.ConditionFalse, metav1.ConditionTrue, "RemediationFinishedSuccessfully")

deleteSNR(snr)
isSNRNeedsDeletion = false
Expand All @@ -303,7 +317,7 @@ var _ = Describe("snr Controller", func() {
BeforeEach(func() {
By("Simulate api-server failure")
k8sClient.ShouldSimulateFailure = true
createSNR(v1alpha1.ResourceDeletionRemediationStrategy)
createSNR(snr, v1alpha1.ResourceDeletionRemediationStrategy)
})

AfterEach(func() {
Expand Down Expand Up @@ -346,18 +360,20 @@ func createVolumeAttachment(vaName string) {
ExpectWithOffset(1, k8sClient.Create(context.Background(), va)).To(Succeed())
}

func verifyTypeConditions(expectedProcessingConditionStatus, expectedSucceededConditionStatus metav1.ConditionStatus) {
func verifyTypeConditions(nodeName string, expectedProcessingConditionStatus, expectedSucceededConditionStatus metav1.ConditionStatus, expectedReason string) {
By("Verify that SNR Processing status condition is correct")
snr := &v1alpha1.SelfNodeRemediation{}
Eventually(func() bool {
snrNamespacedName := client.ObjectKey{Name: unhealthyNodeName, Namespace: snrNamespace}
snrNamespacedName := client.ObjectKey{Name: nodeName, Namespace: snrNamespace}
if err := k8sClient.Client.Get(context.Background(), snrNamespacedName, snr); err != nil {
return false
}
isActualProcessingMatchExpected := meta.IsStatusConditionPresentAndEqual(snr.Status.Conditions, v1alpha1.ProcessingConditionType, expectedProcessingConditionStatus)
actualProcessingCondition := meta.FindStatusCondition(snr.Status.Conditions, v1alpha1.ProcessingConditionType)
isActualProcessingMatchExpected := actualProcessingCondition != nil && actualProcessingCondition.Status == expectedProcessingConditionStatus
isActualSucceededMatchExpected := meta.IsStatusConditionPresentAndEqual(snr.Status.Conditions, v1alpha1.SucceededConditionType, expectedSucceededConditionStatus)

return isActualProcessingMatchExpected &&
isActualSucceededMatchExpected
isActualSucceededMatchExpected && actualProcessingCondition.Reason == expectedReason

}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
}
Expand Down Expand Up @@ -556,10 +572,7 @@ func deleteSNR(snr *v1alpha1.SelfNodeRemediation) {
ExpectWithOffset(1, k8sClient.Client.Delete(context.Background(), snr)).To(Succeed(), "failed to delete snr CR")
}

func createSNR(strategy v1alpha1.RemediationStrategyType) {
snr := &v1alpha1.SelfNodeRemediation{}
snr.Name = unhealthyNodeName
snr.Namespace = snrNamespace
func createSNR(snr *v1alpha1.SelfNodeRemediation, strategy v1alpha1.RemediationStrategyType) {
snr.Spec.RemediationStrategy = strategy
ExpectWithOffset(1, k8sClient.Client.Create(context.TODO(), snr)).To(Succeed(), "failed to create snr CR")
}
Expand Down