Skip to content

Revert "Update Node Monitor Grace Period default duration to 50s" #2376

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: release-4.19
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&o.NodeMonitorGracePeriod.Duration, "node-monitor-grace-period", o.NodeMonitorGracePeriod.Duration,
"Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+
"Must be N times more than kubelet's nodeStatusUpdateFrequency, "+
"where N means number of retries allowed for kubelet to post node status. "+
"This value should also be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS")
"where N means number of retries allowed for kubelet to post node status.")
fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.")
fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.")
fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, fmt.Sprintf("Number of nodes from which %s treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller. Notice: If nodes reside in multiple zones, this threshold will be considered as zone node size threshold for each zone to determine node eviction rate independently.", names.NodeLifecycleController))
Expand Down
3 changes: 1 addition & 2 deletions pkg/controller/nodelifecycle/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ type NodeLifecycleControllerConfiguration struct {
// NodeMonitorGracePeriod is the amount of time which we allow a running node to be
// unresponsive before marking it unhealthy. Must be N times more than kubelet's
// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
// to post node status. This value should also be greater than the sum of
// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
// to post node status.
NodeMonitorGracePeriod metav1.Duration
// secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold
LargeClusterSizeThreshold int32
Expand Down
7 changes: 1 addition & 6 deletions pkg/controller/nodelifecycle/config/v1alpha1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,8 @@ func RecommendedDefaultNodeLifecycleControllerConfiguration(obj *kubectrlmgrconf
if obj.PodEvictionTimeout == zero {
obj.PodEvictionTimeout = metav1.Duration{Duration: 5 * time.Minute}
}
// NodeMonitorGracePeriod is set to a default value of 50 seconds.
// This value should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (30s)
// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (15s) from the http2 health check
// to ensure that the server has adequate time to handle slow or idle connections
// properly before marking a node as unhealthy.
if obj.NodeMonitorGracePeriod == zero {
obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 50 * time.Second}
obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 40 * time.Second}
}
if obj.NodeStartupGracePeriod == zero {
obj.NodeStartupGracePeriod = metav1.Duration{Duration: 60 * time.Second}
Expand Down
6 changes: 1 addition & 5 deletions pkg/controller/nodelifecycle/node_lifecycle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,11 +282,7 @@ type Controller struct {
// be less than the node health signal update frequency, since there will
// only be fresh values from Kubelet at an interval of node health signal
// update frequency.
// 2. nodeMonitorGracePeriod should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (30s)
// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (15s) from the http2 health check
// to ensure that the server has adequate time to handle slow or idle connections
// properly before marking a node as unhealthy.
// 3. nodeMonitorGracePeriod can't be too large for user experience - larger
// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
// value takes longer for user to see up-to-date node health.
nodeMonitorGracePeriod time.Duration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ import (
)

const (
testNodeMonitorGracePeriod = 50 * time.Second
testNodeMonitorGracePeriod = 40 * time.Second
testNodeStartupGracePeriod = 60 * time.Second
testNodeMonitorPeriod = 5 * time.Second
testRateLimiterQPS = float32(100000)
Expand Down
2 changes: 1 addition & 1 deletion pkg/generated/openapi/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,7 @@ type NodeLifecycleControllerConfiguration struct {
// nodeMontiorGracePeriod is the amount of time which we allow a running node to be
// unresponsive before marking it unhealthy. Must be N times more than kubelet's
// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
// to post node status. This value should also be greater than the sum of
// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
// to post node status.
NodeMonitorGracePeriod metav1.Duration
// podEvictionTimeout is the grace period for deleting pods on failed nodes.
PodEvictionTimeout metav1.Duration
Expand Down