add cp node deletion param configurable
gandhipr committed Feb 27, 2024
1 parent 0c62f54 commit 02d1c51
Showing 8 changed files with 23 additions and 15 deletions.
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -289,6 +289,8 @@ type AutoscalingOptions struct {
BypassedSchedulers map[string]bool
// ProvisioningRequestEnabled tells if CA processes ProvisioningRequest.
ProvisioningRequestEnabled bool
+ // MaxCloudProviderNodeDeletionTime is the maximum time needed by cloud provider to delete a node
+ MaxCloudProviderNodeDeletionTime time.Duration
}

// KubeClientOptions specify options for kube client
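Note: the later hunks read this new field directly off the context (ctx.MaxCloudProviderNodeDeletionTime); that works because AutoscalingContext embeds config.AutoscalingOptions, so option fields are promoted onto the context. A minimal, self-contained sketch of that embedding pattern, using placeholder types rather than the autoscaler's own:

package main

import (
	"fmt"
	"time"
)

// Placeholder stand-ins for config.AutoscalingOptions and context.AutoscalingContext.
type AutoscalingOptions struct {
	MaxCloudProviderNodeDeletionTime time.Duration
}

type AutoscalingContext struct {
	AutoscalingOptions // embedding promotes MaxCloudProviderNodeDeletionTime onto the context
}

func main() {
	ctx := &AutoscalingContext{
		AutoscalingOptions: AutoscalingOptions{MaxCloudProviderNodeDeletionTime: 5 * time.Minute},
	}
	// Callers holding the context can read the configured window directly.
	fmt.Println(ctx.MaxCloudProviderNodeDeletionTime) // 5m0s
}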
@@ -41,8 +41,6 @@ import (
const (
// MaxKubernetesEmptyNodeDeletionTime is the maximum time needed by Kubernetes to delete an empty node.
MaxKubernetesEmptyNodeDeletionTime = 3 * time.Minute
- // MaxCloudProviderNodeDeletionTime is the maximum time needed by cloud provider to delete a node.
- MaxCloudProviderNodeDeletionTime = 5 * time.Minute
)

// NodeDeletionBatcher batch scale down candidates for one node group and remove them.
@@ -180,9 +178,9 @@ func nodeScaleDownReason(node *apiv1.Node, drain bool) metrics.NodeScaleDownReason {
}

// IsNodeBeingDeleted returns true iff a given node is being deleted.
- func IsNodeBeingDeleted(node *apiv1.Node, timestamp time.Time) bool {
+ func IsNodeBeingDeleted(ctx *context.AutoscalingContext, node *apiv1.Node, timestamp time.Time) bool {
deleteTime, _ := taints.GetToBeDeletedTime(node)
- return deleteTime != nil && (timestamp.Sub(*deleteTime) < MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < MaxKubernetesEmptyNodeDeletionTime)
+ return deleteTime != nil && (timestamp.Sub(*deleteTime) < ctx.MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < MaxKubernetesEmptyNodeDeletionTime)
}

// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log an error.
@@ -203,7 +201,7 @@ func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node
func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker) {
ctx.Recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "nodes removed by cluster autoscaler")
currentTime := time.Now()
- expectedDeleteTime := time.Now().Add(MaxCloudProviderNodeDeletionTime)
+ expectedDeleteTime := time.Now().Add(ctx.MaxCloudProviderNodeDeletionTime)
scaleStateNotifier.RegisterScaleDown(nodeGroup, node.Name, currentTime, expectedDeleteTime)
gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup)
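Note: with the package constant removed, the deletion window that IsNodeBeingDeleted checks now comes from configuration. The sketch below is a self-contained illustration of the same timing check, not the autoscaler's code: a tainted node counts as "being deleted" until both the configured cloud-provider window and the Kubernetes empty-node window have elapsed since the ToBeDeleted taint was added.

package main

import (
	"fmt"
	"time"
)

// isBeingDeleted mirrors the check in IsNodeBeingDeleted: the node is still
// considered "being deleted" while either window since tainting is open.
func isBeingDeleted(taintedAt, now time.Time, maxCloudProviderDeletion, maxKubernetesEmptyDeletion time.Duration) bool {
	age := now.Sub(taintedAt)
	return age < maxCloudProviderDeletion || age < maxKubernetesEmptyDeletion
}

func main() {
	now := time.Now()
	taintedAt := now.Add(-4 * time.Minute)
	fmt.Println(isBeingDeleted(taintedAt, now, 5*time.Minute, 3*time.Minute)) // true: the 5m cloud-provider window is still open
	fmt.Println(isBeingDeleted(taintedAt, now, 2*time.Minute, 3*time.Minute)) // false: both windows have elapsed
}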
@@ -109,7 +109,7 @@ func (c *Checker) FilterOutUnremovable(context *context.AutoscalingContext, scal
func (c *Checker) unremovableReasonAndNodeUtilization(context *context.AutoscalingContext, timestamp time.Time, nodeInfo *schedulerframework.NodeInfo, utilLogsQuota *klogx.Quota) (simulator.UnremovableReason, *utilization.Info) {
node := nodeInfo.Node()

- if actuation.IsNodeBeingDeleted(node, timestamp) {
+ if actuation.IsNodeBeingDeleted(context, node, timestamp) {
klog.V(1).Infof("Skipping %s from delete consideration - the node is currently being deleted", node.Name)
return simulator.CurrentlyBeingDeleted, nil
}
@@ -164,6 +164,7 @@ func TestFilterOutUnremovable(t *testing.T) {
ScaleDownUnreadyTime: config.DefaultScaleDownUnreadyTime,
IgnoreDaemonSetsUtilization: tc.ignoreDaemonSetsUtilization,
},
+ MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
}
s := nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults)
c := NewChecker(s)
8 changes: 4 additions & 4 deletions cluster-autoscaler/core/scaledown/resource/limits.go
@@ -62,7 +62,7 @@ func NoLimits() Limits {
// LimitsLeft returns the amount of each resource that can be deleted from the
// cluster without violating any constraints.
func (lf *LimitsFinder) LimitsLeft(context *context.AutoscalingContext, nodes []*apiv1.Node, resourceLimiter *cloudprovider.ResourceLimiter, timestamp time.Time) Limits {
- totalCores, totalMem := coresMemoryTotal(nodes, timestamp)
+ totalCores, totalMem := coresMemoryTotal(context, nodes, timestamp)

var totalResources map[string]int64
var totalResourcesErr error
@@ -102,10 +102,10 @@ func computeAboveMin(total int64, min int64) int64 {
return 0
}

- func coresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
+ func coresMemoryTotal(ctx *context.AutoscalingContext, nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
var coresTotal, memoryTotal int64
for _, node := range nodes {
- if actuation.IsNodeBeingDeleted(node, timestamp) {
+ if actuation.IsNodeBeingDeleted(ctx, node, timestamp) {
// Nodes being deleted do not count towards total cluster resources
continue
}
@@ -122,7 +122,7 @@ func (lf *LimitsFinder) customResourcesTotal(context *context.AutoscalingContext
result := make(map[string]int64)
ngCache := make(map[string][]customresources.CustomResourceTarget)
for _, node := range nodes {
- if actuation.IsNodeBeingDeleted(node, timestamp) {
+ if actuation.IsNodeBeingDeleted(context, node, timestamp) {
// Nodes being deleted do not count towards total cluster resources
continue
}
8 changes: 7 additions & 1 deletion cluster-autoscaler/core/scaledown/resource/limits_test.go
@@ -21,6 +21,7 @@ import (
"testing"
"time"

"k8s.io/autoscaler/cluster-autoscaler/config"
. "k8s.io/autoscaler/cluster-autoscaler/core/test"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
@@ -55,7 +56,12 @@ func TestCalculateCoresAndMemoryTotal(t *testing.T) {
},
}

- coresTotal, memoryTotal := coresMemoryTotal(nodes, time.Now())
+ options := config.AutoscalingOptions{
+ MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
+ }
+ context, err := NewScaleTestAutoscalingContext(options, nil, nil, nil, nil, nil)
+ assert.NoError(t, err)
+ coresTotal, memoryTotal := coresMemoryTotal(&context, nodes, time.Now())

assert.Equal(t, int64(42), coresTotal)
assert.Equal(t, int64(44000*utils.MiB), memoryTotal)
6 changes: 3 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -322,7 +322,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerError {
}

// Update cluster resource usage metrics
- coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
+ coresTotal, memoryTotal := calculateCoresMemoryTotal(a.AutoscalingContext, allNodes, currentTime)
metrics.UpdateClusterCPUCurrentCores(coresTotal)
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)

@@ -1048,12 +1048,12 @@ func getUpcomingNodeInfos(upcomingCounts map[string]int, nodeInfos map[string]*s
return upcomingNodes
}

- func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
+ func calculateCoresMemoryTotal(context *context.AutoscalingContext, nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
// this function is essentially similar to the calculateScaleDownCoresMemoryTotal
// we want to check all nodes, aside from those deleting, to sum the cluster resource usage.
var coresTotal, memoryTotal int64
for _, node := range nodes {
- if actuation.IsNodeBeingDeleted(node, timestamp) {
+ if actuation.IsNodeBeingDeleted(context, node, timestamp) {
// Nodes being deleted do not count towards total cluster resources
continue
}
3 changes: 2 additions & 1 deletion cluster-autoscaler/main.go
@@ -257,7 +257,8 @@ var (
"--max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use unordered evictor by default."+
"Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+
"Eg. flag usage: '10000:20,1000:100,0:60'")
- provisioningRequestsEnabled = flag.Bool("enable-provisioning-requests", false, "Whether the clusterautoscaler will be handling the ProvisioningRequest CRs.")
+ provisioningRequestsEnabled = flag.Bool("enable-provisioning-requests", false, "Whether the clusterautoscaler will be handling the ProvisioningRequest CRs.")
+ maxCloudProviderNodeDeletionTime = flag.Duration("max-cloud-provider-node-deletion-time", 5*time.Minute, "Maximum time needed by cloud provider to delete a node")
)

func isFlagPassed(name string) bool {
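Note: the hunk above only defines the new flag; the rest of the commit presumably copies the parsed value into config.AutoscalingOptions when the options struct is built, but that wiring is not visible in this diff view. A hedged, self-contained sketch of what such wiring looks like (the types and setup here are illustrative, not the autoscaler's actual option plumbing):

package main

import (
	"flag"
	"fmt"
	"time"
)

// Stand-in for config.AutoscalingOptions.
type AutoscalingOptions struct {
	MaxCloudProviderNodeDeletionTime time.Duration
}

var maxCloudProviderNodeDeletionTime = flag.Duration(
	"max-cloud-provider-node-deletion-time", 5*time.Minute,
	"Maximum time needed by cloud provider to delete a node")

func main() {
	// e.g. ./cluster-autoscaler --max-cloud-provider-node-deletion-time=10m
	flag.Parse()
	opts := AutoscalingOptions{
		MaxCloudProviderNodeDeletionTime: *maxCloudProviderNodeDeletionTime,
	}
	fmt.Println(opts.MaxCloudProviderNodeDeletionTime)
}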
