Merge pull request #126287 from devppratik/121793-update-node-monitor-grace-period

node: Update Node Monitor Grace Period default duration to 50s
2026-01-21 13:49:13 +00:00 · 2024-08-13 21:03:16 -07:00
parent cc4ed1d3fa f8bf6b97b8
commit bb7411120a
7 changed files with 19 additions and 7 deletions
--- a/pkg/controller/nodelifecycle/config/types.go
+++ b/pkg/controller/nodelifecycle/config/types.go
@@ -32,7 +32,8 @@ type NodeLifecycleControllerConfiguration struct {
 	// NodeMonitorGracePeriod is the amount of time which we allow a running node to be
 	// unresponsive before marking it unhealthy. Must be N times more than kubelet's
 	// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
-	// to post node status.
+	// to post node status. This value should also be greater than the sum of
+	// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
 	NodeMonitorGracePeriod metav1.Duration
 	// secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold
 	LargeClusterSizeThreshold int32
--- a/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go
+++ b/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go
@@ -37,8 +37,13 @@ func RecommendedDefaultNodeLifecycleControllerConfiguration(obj *kubectrlmgrconf
 	if obj.PodEvictionTimeout == zero {
 		obj.PodEvictionTimeout = metav1.Duration{Duration: 5 * time.Minute}
 	}
+	// NodeMonitorGracePeriod defaults to 50 seconds.
+	// This value should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (default 15s)
+	// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (default 30s) from the http2 health check
+	// to ensure that the server has adequate time to handle slow or idle connections
+	// properly before marking a node as unhealthy.
 	if obj.NodeMonitorGracePeriod == zero {
-		obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 40 * time.Second}
+		obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 50 * time.Second}
 	}
 	if obj.NodeStartupGracePeriod == zero {
 		obj.NodeStartupGracePeriod = metav1.Duration{Duration: 60 * time.Second}
--- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
@@ -284,7 +284,11 @@ type Controller struct {
 	//    be less than the node health signal update frequency, since there will
 	//    only be fresh values from Kubelet at an interval of node health signal
 	//    update frequency.
-	// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
+	// 2. nodeMonitorGracePeriod should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (default 15s)
+	//    and HTTP2_READ_IDLE_TIMEOUT_SECONDS (default 30s) from the http2 health check
+	//    to ensure that the server has adequate time to handle slow or idle connections
+	//    properly before marking a node as unhealthy.
+	// 3. nodeMonitorGracePeriod can't be too large for user experience - larger
 	//    value takes longer for user to see up-to-date node health.
 	nodeMonitorGracePeriod time.Duration

--- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
@@ -52,7 +52,7 @@ import (
 )

 const (
-	testNodeMonitorGracePeriod = 40 * time.Second
+	testNodeMonitorGracePeriod = 50 * time.Second
 	testNodeStartupGracePeriod = 60 * time.Second
 	testNodeMonitorPeriod      = 5 * time.Second
 	testRateLimiterQPS         = float32(100000)