From 7b24b44084d10c028345bb0b2acbd6633af822c6 Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Thu, 3 Oct 2024 16:32:27 -0700 Subject: [PATCH] cherry-pick: don't include terminated nodes in budget (#1735) (#1736) --- pkg/controllers/disruption/helpers.go | 7 ++++++ pkg/controllers/disruption/suite_test.go | 32 ++++++++++++++++++++++++ pkg/controllers/state/statenode.go | 4 +-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/disruption/helpers.go b/pkg/controllers/disruption/helpers.go index b6aaa819d6..3664da937a 100644 --- a/pkg/controllers/disruption/helpers.go +++ b/pkg/controllers/disruption/helpers.go @@ -212,6 +212,13 @@ func BuildDisruptionBudgets(ctx context.Context, cluster *state.Cluster, clk clo continue } + // Additionally, don't consider nodeclaims that have the terminating condition. A nodeclaim should have + // the Terminating condition only when the node is drained and cloudprovider.Delete() was successful + // on the underlying cloud provider machine. + if node.NodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue() { + continue + } + nodePool := node.Labels()[v1.NodePoolLabelKey] numNodes[nodePool]++ diff --git a/pkg/controllers/disruption/suite_test.go b/pkg/controllers/disruption/suite_test.go index 5ab1e4ba23..0cb929ae5f 100644 --- a/pkg/controllers/disruption/suite_test.go +++ b/pkg/controllers/disruption/suite_test.go @@ -667,6 +667,38 @@ var _ = Describe("BuildDisruptionBudgetMapping", func() { Expect(budgets[nodePool.Name][reason]).To(Equal(10)) } }) + It("should not consider nodes that have the terminating status condition as part of disruption count", func() { + nodePool.Spec.Disruption.Budgets = []v1.Budget{{Nodes: "100%"}} + ExpectApplied(ctx, env.Client, nodePool) + nodeClaim, node := test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{"karpenter.sh/test-finalizer"}, + Labels: map[string]string{ + v1.NodePoolLabelKey: nodePool.Name, + corev1.LabelInstanceTypeStable: mostExpensiveInstance.Name, + v1.CapacityTypeLabelKey: mostExpensiveOffering.Requirements.Get(v1.CapacityTypeLabelKey).Any(), + corev1.LabelTopologyZone: mostExpensiveOffering.Requirements.Get(corev1.LabelTopologyZone).Any(), + }, + }, + Status: v1.NodeClaimStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("32"), + corev1.ResourcePods: resource.MustParse("100"), + }, + }, + }) + nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInstanceTerminating) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, nodeClaimStateController, client.ObjectKeyFromObject(nodeClaim)) + + budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder) + Expect(err).To(Succeed()) + // This should not bring in the terminating node. + for _, reason := range v1.WellKnownDisruptionReasons { + Expect(budgets[nodePool.Name][reason]).To(Equal(10)) + } + }) It("should not return a negative disruption value", func() { nodePool.Spec.Disruption.Budgets = []v1.Budget{{Nodes: "10%"}} ExpectApplied(ctx, env.Client, nodePool) diff --git a/pkg/controllers/state/statenode.go b/pkg/controllers/state/statenode.go index 5fcbf57a7c..9b5b6d2c07 100644 --- a/pkg/controllers/state/statenode.go +++ b/pkg/controllers/state/statenode.go @@ -408,10 +408,10 @@ func (in *StateNode) PodLimits() corev1.ResourceList { func (in *StateNode) MarkedForDeletion() bool { // The Node is marked for deletion if: // 1. The Node has MarkedForDeletion set - // 2. The Node has a NodeClaim counterpart and is actively deleting + // 2. The Node has a NodeClaim counterpart and is actively deleting (or the nodeclaim is marked as terminating) // 3. The Node has no NodeClaim counterpart and is actively deleting return in.markedForDeletion || - (in.NodeClaim != nil && !in.NodeClaim.DeletionTimestamp.IsZero()) || + (in.NodeClaim != nil && (!in.NodeClaim.DeletionTimestamp.IsZero() || in.NodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue())) || (in.Node != nil && in.NodeClaim == nil && !in.Node.DeletionTimestamp.IsZero()) }