Update Node Monitor Grace Period default duration to 50s

Update description

Improve flag comment

Update Test case value to be 50s by default

Update Description

Run make update

Minor description fix
This commit is contained in:
devppratik 2024-07-23 15:56:30 +05:30
parent 5420b2fe9a
commit f8bf6b97b8
7 changed files with 19 additions and 7 deletions

View File

@ -41,7 +41,8 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&o.NodeMonitorGracePeriod.Duration, "node-monitor-grace-period", o.NodeMonitorGracePeriod.Duration,
"Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+
"Must be N times more than kubelet's nodeStatusUpdateFrequency, "+
"where N means number of retries allowed for kubelet to post node status.")
"where N means number of retries allowed for kubelet to post node status. "+
"This value should also be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS")
fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.")
fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.")
fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, fmt.Sprintf("Number of nodes from which %s treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller. Notice: If nodes reside in multiple zones, this threshold will be considered as zone node size threshold for each zone to determine node eviction rate independently.", names.NodeLifecycleController))

View File

@ -32,7 +32,8 @@ type NodeLifecycleControllerConfiguration struct {
// NodeMonitorGracePeriod is the amount of time which we allow a running node to be
// unresponsive before marking it unhealthy. Must be N times more than kubelet's
// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
// to post node status.
// to post node status. This value should also be greater than the sum of
// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
NodeMonitorGracePeriod metav1.Duration
// secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold
LargeClusterSizeThreshold int32

View File

@ -37,8 +37,13 @@ func RecommendedDefaultNodeLifecycleControllerConfiguration(obj *kubectrlmgrconf
if obj.PodEvictionTimeout == zero {
obj.PodEvictionTimeout = metav1.Duration{Duration: 5 * time.Minute}
}
// NodeMonitorGracePeriod is set to a default value of 50 seconds.
// This value should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (default 15s)
// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (default 30s) from the http2 health check
// to ensure that the server has adequate time to handle slow or idle connections
// properly before marking a node as unhealthy.
if obj.NodeMonitorGracePeriod == zero {
obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 40 * time.Second}
obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 50 * time.Second}
}
if obj.NodeStartupGracePeriod == zero {
obj.NodeStartupGracePeriod = metav1.Duration{Duration: 60 * time.Second}

View File

@ -284,7 +284,11 @@ type Controller struct {
// be less than the node health signal update frequency, since there will
// only be fresh values from Kubelet at an interval of node health signal
// update frequency.
// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
// 2. nodeMonitorGracePeriod should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (default 15s)
// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (default 30s) from the http2 health check
// to ensure that the server has adequate time to handle slow or idle connections
// properly before marking a node as unhealthy.
// 3. nodeMonitorGracePeriod can't be too large for user experience - larger
// value takes longer for user to see up-to-date node health.
nodeMonitorGracePeriod time.Duration

View File

@ -52,7 +52,7 @@ import (
)
const (
testNodeMonitorGracePeriod = 40 * time.Second
testNodeMonitorGracePeriod = 50 * time.Second
testNodeStartupGracePeriod = 60 * time.Second
testNodeMonitorPeriod = 5 * time.Second
testRateLimiterQPS = float32(100000)

View File

@ -58828,7 +58828,7 @@ func schema_k8sio_kube_controller_manager_config_v1alpha1_NodeLifecycleControlle
},
"NodeMonitorGracePeriod": {
SchemaProps: spec.SchemaProps{
Description: "nodeMontiorGracePeriod is the amount of time which we allow a running node to be unresponsive before marking it unhealthy. Must be N times more than kubelet's nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet to post node status.",
Description: "nodeMontiorGracePeriod is the amount of time which we allow a running node to be unresponsive before marking it unhealthy. Must be N times more than kubelet's nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet to post node status. This value should also be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.",
Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Duration"),
},
},

View File

@ -403,7 +403,8 @@ type NodeLifecycleControllerConfiguration struct {
// nodeMonitorGracePeriod is the amount of time which we allow a running node to be
// unresponsive before marking it unhealthy. Must be N times more than kubelet's
// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
// to post node status.
// to post node status. This value should also be greater than the sum of
// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
NodeMonitorGracePeriod metav1.Duration
// podEvictionTimeout is the grace period for deleting pods on failed nodes.
PodEvictionTimeout metav1.Duration