apiserver: make watch termination during shutdown configurable

2025-07-21 19:01:49 +00:00 · 2023-02-10 11:43:06 -05:00 · 2023-02-10 11:43:06 -05:00 · 791fcd6fb4
commit 791fcd6fb4
parent 6385b86a9b
5 changed files with 140 additions and 21 deletions
--- a/staging/src/k8s.io/apiserver/pkg/server/config.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/config.go
@ -276,6 +276,23 @@ type Config struct {

 	// AggregatedDiscoveryGroupManager serves /apis in an aggregated form.
 	AggregatedDiscoveryGroupManager discoveryendpoint.ResourceManager
+
+	// ShutdownWatchTerminationGracePeriod, if set to a positive value,
+	// is the maximum duration the apiserver will wait for all active
+	// watch request(s) to drain.
+	// Once this grace period elapses, the apiserver will no longer
+	// wait for any active watch request(s) in flight to drain, it will
+	// proceed to the next step in the graceful server shutdown process.
+	// If set to a positive value, the apiserver will keep track of the
+	// number of active watch request(s) in flight and during shutdown
+	// it will wait, at most, for the specified duration and allow these
+	// active watch requests to drain with some rate limiting in effect.
+	// The default is zero, which implies the apiserver will not keep
+	// track of active watch request(s) in flight and will not wait
+	// for them to drain, this maintains backward compatibility.
+	// This grace period is orthogonal to other grace periods, and
+	// it is not overridden by any other grace period.
+	ShutdownWatchTerminationGracePeriod time.Duration
 }

 type RecommendedConfig struct {
@ -416,6 +433,7 @@ func NewConfig(codecs serializer.CodecFactory) *Config {
 		LongRunningFunc:                     genericfilters.BasicLongRunningRequestCheck(sets.NewString("watch"), sets.NewString()),
 		lifecycleSignals:                    lifecycleSignals,
 		StorageObjectCountTracker:           flowcontrolrequest.NewStorageObjectCountTracker(),
+		ShutdownWatchTerminationGracePeriod: time.Duration(0),

 		APIServerID:           id,
 		StorageVersionManager: storageversion.NewDefaultManager(),
@ -684,6 +702,7 @@ func (c completedConfig) New(name string, delegationTarget DelegationTarget) (*G
 		minRequestTimeout:                   time.Duration(c.MinRequestTimeout) * time.Second,
 		ShutdownTimeout:                     c.RequestTimeout,
 		ShutdownDelayDuration:               c.ShutdownDelayDuration,
+		ShutdownWatchTerminationGracePeriod: c.ShutdownWatchTerminationGracePeriod,
 		SecureServingInfo:                   c.SecureServing,
 		ExternalAddress:                     c.ExternalAddress,

@ -913,7 +932,9 @@ func DefaultBuildHandlerChain(apiHandler http.Handler, c *Config) http.Handler {
 	handler = genericapifilters.WithRequestDeadline(handler, c.AuditBackend, c.AuditPolicyRuleEvaluator,
 		c.LongRunningFunc, c.Serializer, c.RequestTimeout)
 	handler = genericfilters.WithWaitGroup(handler, c.LongRunningFunc, c.NonLongRunningRequestWaitGroup)
+	if c.ShutdownWatchTerminationGracePeriod > 0 {
 		handler = genericfilters.WithWatchTerminationDuringShutdown(handler, c.lifecycleSignals, c.WatchRequestWaitGroup)
+	}
 	if c.SecureServing != nil && !c.SecureServing.DisableHTTP2 && c.GoawayChance > 0 {
 		handler = genericfilters.WithProbabilisticGoaway(handler, c.GoawayChance)
 	}
--- a/staging/src/k8s.io/apiserver/pkg/server/genericapiserver.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/genericapiserver.go
@ -266,6 +266,23 @@ type GenericAPIServer struct {
 	// If enabled, after ShutdownDelayDuration elapses, any incoming request is
 	// rejected with a 429 status code and a 'Retry-After' response.
 	ShutdownSendRetryAfter bool
+
+	// ShutdownWatchTerminationGracePeriod, if set to a positive value,
+	// is the maximum duration the apiserver will wait for all active
+	// watch request(s) to drain.
+	// Once this grace period elapses, the apiserver will no longer
+	// wait for any active watch request(s) in flight to drain, it will
+	// proceed to the next step in the graceful server shutdown process.
+	// If set to a positive value, the apiserver will keep track of the
+	// number of active watch request(s) in flight and during shutdown
+	// it will wait, at most, for the specified duration and allow these
+	// active watch requests to drain with some rate limiting in effect.
+	// The default is zero, which implies the apiserver will not keep
+	// track of active watch request(s) in flight and will not wait
+	// for them to drain, this maintains backward compatibility.
+	// This grace period is orthogonal to other grace periods, and
+	// it is not overridden by any other grace period.
+	ShutdownWatchTerminationGracePeriod time.Duration
 }

 // DelegationTarget is an interface which allows for composition of API servers with top level handling that works
@ -617,10 +634,13 @@ func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
 		defer close(activeWatchesDrainedCh)

 		<-notAcceptingNewRequestCh.Signaled()
+		if s.ShutdownWatchTerminationGracePeriod <= time.Duration(0) {
+			klog.V(1).InfoS("[graceful-termination] not going to wait for active watch request(s) to drain")
+			return
+		}

 		// Wait for all active watches to finish
-		// TODO(tkashem): make the grace period configurable
-		grace := 10 * time.Second
+		grace := s.ShutdownWatchTerminationGracePeriod
 		activeBefore, activeAfter, err := s.WatchRequestWaitGroup.Wait(func(count int) (utilwaitgroup.RateLimiter, context.Context, context.CancelFunc) {
 			qps := float64(count) / grace.Seconds()
 			// TODO: we don't want the QPS (max requests drained per second) to
--- a/staging/src/k8s.io/apiserver/pkg/server/genericapiserver_graceful_termination_test.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/genericapiserver_graceful_termination_test.go
@ -1011,6 +1011,8 @@ func newGenericAPIServer(t *testing.T, fAudit *fakeAudit, keepListening bool) *G
 	config, _ := setUp(t)
 	config.ShutdownDelayDuration = 100 * time.Millisecond
 	config.ShutdownSendRetryAfter = keepListening
+	// we enable watch draining, any positive value will do that
+	config.ShutdownWatchTerminationGracePeriod = 2 * time.Second
 	config.AuditPolicyRuleEvaluator = fAudit
 	config.AuditBackend = fAudit

--- a/staging/src/k8s.io/apiserver/pkg/server/options/server_run_options.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/options/server_run_options.go
@ -73,6 +73,23 @@ type ServerRunOptions struct {
 	// If enabled, after ShutdownDelayDuration elapses, any incoming request is
 	// rejected with a 429 status code and a 'Retry-After' response.
 	ShutdownSendRetryAfter bool
+
+	// ShutdownWatchTerminationGracePeriod, if set to a positive value,
+	// is the maximum duration the apiserver will wait for all active
+	// watch request(s) to drain.
+	// Once this grace period elapses, the apiserver will no longer
+	// wait for any active watch request(s) in flight to drain, it will
+	// proceed to the next step in the graceful server shutdown process.
+	// If set to a positive value, the apiserver will keep track of the
+	// number of active watch request(s) in flight and during shutdown
+	// it will wait, at most, for the specified duration and allow these
+	// active watch requests to drain with some rate limiting in effect.
+	// The default is zero, which implies the apiserver will not keep
+	// track of active watch request(s) in flight and will not wait
+	// for them to drain, this maintains backward compatibility.
+	// This grace period is orthogonal to other grace periods, and
+	// it is not overridden by any other grace period.
+	ShutdownWatchTerminationGracePeriod time.Duration
 }

 func NewServerRunOptions() *ServerRunOptions {
@ -84,6 +101,7 @@ func NewServerRunOptions() *ServerRunOptions {
 		LivezGracePeriod:                    defaults.LivezGracePeriod,
 		MinRequestTimeout:                   defaults.MinRequestTimeout,
 		ShutdownDelayDuration:               defaults.ShutdownDelayDuration,
+		ShutdownWatchTerminationGracePeriod: defaults.ShutdownWatchTerminationGracePeriod,
 		JSONPatchMaxCopyBytes:               defaults.JSONPatchMaxCopyBytes,
 		MaxRequestBodyBytes:                 defaults.MaxRequestBodyBytes,
 		EnablePriorityAndFairness:           true,
@ -107,6 +125,7 @@ func (s *ServerRunOptions) ApplyTo(c *server.Config) error {
 	c.MaxRequestBodyBytes = s.MaxRequestBodyBytes
 	c.PublicAddress = s.AdvertiseAddress
 	c.ShutdownSendRetryAfter = s.ShutdownSendRetryAfter
+	c.ShutdownWatchTerminationGracePeriod = s.ShutdownWatchTerminationGracePeriod

 	return nil
 }
@ -160,6 +179,10 @@ func (s *ServerRunOptions) Validate() []error {
 		errors = append(errors, fmt.Errorf("--shutdown-delay-duration can not be negative value"))
 	}

+	if s.ShutdownWatchTerminationGracePeriod < 0 {
+		errors = append(errors, fmt.Errorf("shutdown-watch-termination-grace-period, if provided, can not be a negative value"))
+	}
+
 	if s.JSONPatchMaxCopyBytes < 0 {
 		errors = append(errors, fmt.Errorf("ServerRunOptions.JSONPatchMaxCopyBytes can not be negative value"))
 	}
@ -315,5 +338,9 @@ func (s *ServerRunOptions) AddUniversalFlags(fs *pflag.FlagSet) {
 		"during this window all incoming requests will be rejected with a status code 429 and a 'Retry-After' response header, "+
 		"in addition 'Connection: close' response header is set in order to tear down the TCP connection when idle.")

+	fs.DurationVar(&s.ShutdownWatchTerminationGracePeriod, "shutdown-watch-termination-grace-period", s.ShutdownWatchTerminationGracePeriod, ""+
+		"This option, if set, represents the maximum amount of grace period the apiserver will wait "+
+		"for active watch request(s) to drain during the graceful server shutdown window.")
+
 	utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
 }
--- a/staging/src/k8s.io/apiserver/pkg/server/options/server_run_options_test.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/options/server_run_options_test.go
@ -261,3 +261,52 @@ func TestValidateCorsAllowedOriginList(t *testing.T) {
 		}
 	}
 }
+
+func TestServerRunOptionsWithShutdownWatchTerminationGracePeriod(t *testing.T) {
+	tests := []struct {
+		name             string
+		optionsFn        func() *ServerRunOptions
+		errShouldContain string
+	}{
+		{
+			name: "default should be valid",
+			optionsFn: func() *ServerRunOptions {
+				return NewServerRunOptions()
+			},
+		},
+		{
+			name: "negative not allowed",
+			optionsFn: func() *ServerRunOptions {
+				o := NewServerRunOptions()
+				o.ShutdownWatchTerminationGracePeriod = -time.Second
+				return o
+			},
+			errShouldContain: "shutdown-watch-termination-grace-period, if provided, can not be a negative value",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			options := test.optionsFn()
+			errsGot := options.Validate()
+			switch {
+			case len(test.errShouldContain) == 0:
+				if len(errsGot) != 0 {
+					t.Errorf("expected no error, but got: %v", errsGot)
+				}
+			default:
+				if len(errsGot) == 0 ||
+					!strings.Contains(utilerrors.NewAggregate(errsGot).Error(), test.errShouldContain) {
+					t.Errorf("expected error to contain: %s, but got: %v", test.errShouldContain, errsGot)
+				}
+			}
+		})
+	}
+
+	t.Run("default should be zero", func(t *testing.T) {
+		options := NewServerRunOptions()
+		if options.ShutdownWatchTerminationGracePeriod != time.Duration(0) {
+			t.Errorf("expected default of ShutdownWatchTerminationGracePeriod to be zero, but got: %s", options.ShutdownWatchTerminationGracePeriod)
+		}
+	})
+}