Skip to content

Commit 312756a

Browse files
committed
make maxNodeStartupTime configurable
1 parent 7b9cb8c commit 312756a

File tree

8 files changed

+183
-151
lines changed

8 files changed

+183
-151
lines changed

cluster-autoscaler/FAQ.md

Lines changed: 149 additions & 148 deletions
Large diffs are not rendered by default.

cluster-autoscaler/cloudprovider/clusterapi/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,8 @@ metadata:
326326
cluster.x-k8s.io/autoscaling-options-scaledownunreadytime: "20m0s"
327327
# overrides --max-node-provision-time global value for that specific MachineDeployment
328328
cluster.x-k8s.io/autoscaling-options-maxnodeprovisiontime: "20m0s"
329+
# overrides --max-node-startup-time global value for that specific MachineDeployment
330+
cluster.x-k8s.io/autoscaling-options-maxnodestartuptime: "20m0s"
329331
```
330332

331333
#### CPU Architecture awareness for single-arch clusters

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,9 @@ func (ng *nodegroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*c
454454
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeProvisionTimeKey); ok {
455455
defaults.MaxNodeProvisionTime = opt
456456
}
457+
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeStartupTimeKey); ok {
458+
defaults.MaxNodeStartupTime = opt
459+
}
457460

458461
return &defaults, nil
459462
}

cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,14 +624,21 @@ type Readiness struct {
624624
func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
625625
perNodeGroup := make(map[string]Readiness)
626626
total := Readiness{Time: currentTime}
627-
627+
maxNodeStartupTime := MaxNodeStartupTime
628628
update := func(current Readiness, node *apiv1.Node, nr kube_util.NodeReadiness) Readiness {
629+
nodeGroup, errNg := csr.cloudProvider.NodeGroupForNode(node)
630+
if errNg == nil && nodeGroup != nil {
631+
if startupTime, err := csr.nodeGroupConfigProcessor.GetMaxNodeStartupTime(nodeGroup); err == nil {
632+
maxNodeStartupTime = startupTime
633+
}
634+
}
635+
klog.V(1).Infof("Node %s: using maxNodeStartupTime = %v", node.Name, maxNodeStartupTime)
629636
current.Registered = append(current.Registered, node.Name)
630637
if _, isDeleted := csr.deletedNodes[node.Name]; isDeleted {
631638
current.Deleted = append(current.Deleted, node.Name)
632639
} else if nr.Ready {
633640
current.Ready = append(current.Ready, node.Name)
634-
} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
641+
} else if node.CreationTimestamp.Time.Add(maxNodeStartupTime).After(currentTime) {
635642
current.NotStarted = append(current.NotStarted, node.Name)
636643
} else {
637644
current.Unready = append(current.Unready, node.Name)

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ type NodeGroupAutoscalingOptions struct {
5050
ScaleDownUnreadyTime time.Duration
5151
// Maximum time CA waits for node to be provisioned
5252
MaxNodeProvisionTime time.Duration
53+
// Maximum time CA waits for a registered node to become ready
54+
MaxNodeStartupTime time.Duration
5355
// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
5456
ZeroOrMaxNodeScaling bool
5557
// AllowNonAtomicScaleUpToMax indicates that partially failing scale-ups of ZeroOrMaxNodeScaling node groups should not be cancelled

cluster-autoscaler/config/const.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@ const (
3838
DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
3939
// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
4040
DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
41+
// DefaultMaxNodeStartupTimeKey identifies MaxNodeStartupTime autoscaling option
42+
DefaultMaxNodeStartupTimeKey = "maxnodestartuptime"
4143
// DefaultIgnoreDaemonSetsUtilizationKey identifies IgnoreDaemonSetsUtilization autoscaling option
4244
DefaultIgnoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"
43-
4445
// DefaultScaleDownUnneededTime is the default time duration for which CA waits before deleting an unneeded node
4546
DefaultScaleDownUnneededTime = 10 * time.Minute
4647
// DefaultScaleDownUnreadyTime identifies ScaleDownUnreadyTime autoscaling option

cluster-autoscaler/config/flags/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ var (
124124
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
125125
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
126126
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
127+
maxNodeStartupTime = flag.Duration("max-node-start-up-time", 15*time.Minute, "The maximum time from the moment the node is registered to the time the node is ready - the value can be overridden per node group")
127128
maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
128129
nodeGroupsFlag = multiStringFlag(
129130
"nodes",
@@ -290,6 +291,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
290291
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
291292
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
292293
MaxNodeProvisionTime: *maxNodeProvisionTime,
294+
MaxNodeStartupTime: *maxNodeStartupTime,
293295
},
294296
CloudConfig: *cloudConfig,
295297
CloudProviderName: *cloudProviderFlag,

cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ type NodeGroupConfigProcessor interface {
3535
GetScaleDownGpuUtilizationThreshold(nodeGroup cloudprovider.NodeGroup) (float64, error)
3636
// GetMaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for a given NodeGroup.
3737
GetMaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error)
38+
// GetMaxNodeStartupTime returns MaxNodeStartupTime value that should be used for a given NodeGroup.
39+
GetMaxNodeStartupTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error)
3840
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
3941
GetIgnoreDaemonSetsUtilization(nodeGroup cloudprovider.NodeGroup) (bool, error)
4042
// CleanUp cleans up processor's internal structures.
@@ -108,6 +110,18 @@ func (p *DelegatingNodeGroupConfigProcessor) GetMaxNodeProvisionTime(nodeGroup c
108110
return ngConfig.MaxNodeProvisionTime, nil
109111
}
110112

113+
// GetMaxNodeStartupTime returns MaxNodeStartupTime value that should be used for a given NodeGroup.
114+
func (p *DelegatingNodeGroupConfigProcessor) GetMaxNodeStartupTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
115+
ngConfig, err := nodeGroup.GetOptions(p.nodeGroupDefaults)
116+
if err != nil && err != cloudprovider.ErrNotImplemented {
117+
return time.Duration(0), err
118+
}
119+
if ngConfig == nil || err == cloudprovider.ErrNotImplemented {
120+
return p.nodeGroupDefaults.MaxNodeStartupTime, nil
121+
}
122+
return ngConfig.MaxNodeStartupTime, nil
123+
}
124+
111125
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
112126
func (p *DelegatingNodeGroupConfigProcessor) GetIgnoreDaemonSetsUtilization(nodeGroup cloudprovider.NodeGroup) (bool, error) {
113127
ngConfig, err := nodeGroup.GetOptions(p.nodeGroupDefaults)

0 commit comments

Comments
 (0)