Mirror of https://github.com/k3s-io/kubernetes.git

Merge pull request #114925 from tkashem/watch-termination

apiserver: terminate watch with rate limiting during shutdown

This commit is contained in: a16fd5467e
@@ -34,6 +34,7 @@ import (
    "k8s.io/apimachinery/pkg/watch"
    "k8s.io/apiserver/pkg/endpoints/handlers/negotiation"
    "k8s.io/apiserver/pkg/endpoints/metrics"
    apirequest "k8s.io/apiserver/pkg/endpoints/request"
    "k8s.io/apiserver/pkg/util/wsstream"
)

@@ -105,6 +106,11 @@ func serveWatch(watcher watch.Interface, scope *RequestScope, mediaTypeOptions n
        embeddedEncoder = scope.Serializer.EncoderForVersion(serializer.Serializer, contentKind.GroupVersion())
    }

    var serverShuttingDownCh <-chan struct{}
    if signals := apirequest.ServerShutdownSignalFrom(req.Context()); signals != nil {
        serverShuttingDownCh = signals.ShuttingDown()
    }

    ctx := req.Context()

    server := &WatchServer{
@@ -133,6 +139,7 @@ func serveWatch(watcher watch.Interface, scope *RequestScope, mediaTypeOptions n
        },

        TimeoutFactory:       &realTimeoutFactory{timeout},
        ServerShuttingDownCh: serverShuttingDownCh,
    }

    server.ServeHTTP(w, req)
@@ -157,6 +164,7 @@ type WatchServer struct {
    Fixup func(runtime.Object) runtime.Object

    TimeoutFactory       TimeoutFactory
    ServerShuttingDownCh <-chan struct{}
}

// ServeHTTP serves a series of encoded events via HTTP with Transfer-Encoding: chunked
@@ -230,6 +238,15 @@ func (s *WatchServer) ServeHTTP(w http.ResponseWriter, req *http.Request) {

    for {
        select {
        case <-s.ServerShuttingDownCh:
            // the server has signaled that it is shutting down (not accepting
            // any new request), all active watch request(s) should return
            // immediately here. The WithWatchTerminationDuringShutdown server
            // filter will ensure that the response to the client is rate
            // limited in order to avoid any thundering herd issue when the
            // client(s) try to reestablish the WATCH on the other
            // available apiserver instance(s).
            return
        case <-done:
            return
        case <-timeoutCh:
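The select above degrades gracefully when watch draining is not configured: serverShuttingDownCh stays nil in that case, and in Go a receive from a nil channel blocks forever, so that case simply never fires. A minimal, self-contained sketch of this language property (hypothetical names, not from the PR):

package main

import (
    "fmt"
    "time"
)

func main() {
    var shuttingDown <-chan struct{} // nil: draining not configured
    done := make(chan struct{})

    go func() {
        time.Sleep(10 * time.Millisecond)
        close(done)
    }()

    select {
    case <-shuttingDown:
        // unreachable: receiving from a nil channel blocks forever,
        // so this case can never be selected.
        fmt.Println("shutting down")
    case <-done:
        fmt.Println("done") // always taken here
    }
}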
@@ -0,0 +1,55 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package request

import (
    "context"
)

// The serverShutdownSignalKeyType type is unexported to prevent collisions
type serverShutdownSignalKeyType int

// serverShutdownSignalKey is the context key for storing the
// watch termination interface instance for a WATCH request.
const serverShutdownSignalKey serverShutdownSignalKeyType = iota

// ServerShutdownSignal is associated with the request context so
// the request handler logic has access to signals related to
// the server shutdown events.
type ServerShutdownSignal interface {
    // Signaled when the apiserver is not receiving any new request
    ShuttingDown() <-chan struct{}
}

// ServerShutdownSignalFrom returns the ServerShutdownSignal instance
// associated with the request context.
// If there is no ServerShutdownSignal associated with the context,
// nil is returned.
func ServerShutdownSignalFrom(ctx context.Context) ServerShutdownSignal {
    ev, _ := ctx.Value(serverShutdownSignalKey).(ServerShutdownSignal)
    return ev
}

// WithServerShutdownSignal returns a new context that stores
// the ServerShutdownSignal interface instance.
func WithServerShutdownSignal(parent context.Context, window ServerShutdownSignal) context.Context {
    if ServerShutdownSignalFrom(parent) != nil {
        return parent // Avoid double registering.
    }

    return context.WithValue(parent, serverShutdownSignalKey, window)
}
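A small sketch (assuming the k8s.io/apiserver version containing this change) of how the two helpers round-trip a signal through a request context; fakeSignal is a hypothetical stand-in for the server's lifecycle signals:

package main

import (
    "context"
    "fmt"

    apirequest "k8s.io/apiserver/pkg/endpoints/request"
)

// fakeSignal is a hypothetical ServerShutdownSignal implementation; any
// type with a ShuttingDown() <-chan struct{} method satisfies the interface.
type fakeSignal struct{ ch chan struct{} }

func (f *fakeSignal) ShuttingDown() <-chan struct{} { return f.ch }

func main() {
    sig := &fakeSignal{ch: make(chan struct{})}

    // Attach the signal once; a second WithServerShutdownSignal call is a
    // no-op because the helper avoids double registration.
    ctx := apirequest.WithServerShutdownSignal(context.Background(), sig)

    // Retrieve it downstream, e.g. inside a watch handler.
    got := apirequest.ServerShutdownSignalFrom(ctx)
    fmt.Println("signal attached:", got == sig) // true
}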
@@ -161,6 +161,10 @@ type Config struct {
    // handlers associated with non long-running requests
    // to complete while the server is shutting down.
    NonLongRunningRequestWaitGroup *utilwaitgroup.SafeWaitGroup
    // WatchRequestWaitGroup allows us to wait for all chain
    // handlers associated with active watch requests to
    // complete while the server is shutting down.
    WatchRequestWaitGroup *utilwaitgroup.RateLimitedSafeWaitGroup
    // DiscoveryAddresses is used to build the IPs passed to discovery. If nil, the ExternalAddress is
    // always reported
    DiscoveryAddresses discovery.Addresses
@@ -272,6 +276,23 @@ type Config struct {

    // AggregatedDiscoveryGroupManager serves /apis in an aggregated form.
    AggregatedDiscoveryGroupManager discoveryendpoint.ResourceManager

    // ShutdownWatchTerminationGracePeriod, if set to a positive value,
    // is the maximum duration the apiserver will wait for all active
    // watch request(s) to drain.
    // Once this grace period elapses, the apiserver will no longer
    // wait for any active watch request(s) in flight to drain; it will
    // proceed to the next step in the graceful server shutdown process.
    // If set to a positive value, the apiserver will keep track of the
    // number of active watch request(s) in flight and, during shutdown,
    // it will wait, at most, for the specified duration and allow these
    // active watch requests to drain with some rate limiting in effect.
    // The default is zero, which implies the apiserver will not keep
    // track of active watch request(s) in flight and will not wait
    // for them to drain; this maintains backward compatibility.
    // This grace period is orthogonal to other grace periods, and
    // it is not overridden by any other grace period.
    ShutdownWatchTerminationGracePeriod time.Duration
}

type RecommendedConfig struct {
@@ -371,6 +392,7 @@ func NewConfig(codecs serializer.CodecFactory) *Config {
        Serializer:                     codecs,
        BuildHandlerChainFunc:          DefaultBuildHandlerChain,
        NonLongRunningRequestWaitGroup: new(utilwaitgroup.SafeWaitGroup),
        WatchRequestWaitGroup:          &utilwaitgroup.RateLimitedSafeWaitGroup{},
        LegacyAPIGroupPrefixes:         sets.NewString(DefaultLegacyAPIPrefix),
        DisabledPostStartHooks:         sets.NewString(),
        PostStartHooks:                 map[string]PostStartHookConfigEntry{},
@@ -411,6 +433,7 @@ func NewConfig(codecs serializer.CodecFactory) *Config {
        LongRunningFunc:                     genericfilters.BasicLongRunningRequestCheck(sets.NewString("watch"), sets.NewString()),
        lifecycleSignals:                    lifecycleSignals,
        StorageObjectCountTracker:           flowcontrolrequest.NewStorageObjectCountTracker(),
        ShutdownWatchTerminationGracePeriod: time.Duration(0),

        APIServerID:           id,
        StorageVersionManager: storageversion.NewDefaultManager(),
@@ -670,6 +693,7 @@ func (c completedConfig) New(name string, delegationTarget DelegationTarget) (*G
        delegationTarget:               delegationTarget,
        EquivalentResourceRegistry:     c.EquivalentResourceRegistry,
        NonLongRunningRequestWaitGroup: c.NonLongRunningRequestWaitGroup,
        WatchRequestWaitGroup:          c.WatchRequestWaitGroup,
        Handler:                        apiServerHandler,
        UnprotectedDebugSocket:         debugSocket,

@@ -678,6 +702,7 @@ func (c completedConfig) New(name string, delegationTarget DelegationTarget) (*G
        minRequestTimeout:                   time.Duration(c.MinRequestTimeout) * time.Second,
        ShutdownTimeout:                     c.RequestTimeout,
        ShutdownDelayDuration:               c.ShutdownDelayDuration,
        ShutdownWatchTerminationGracePeriod: c.ShutdownWatchTerminationGracePeriod,
        SecureServingInfo:                   c.SecureServing,
        ExternalAddress:                     c.ExternalAddress,

@@ -907,6 +932,9 @@ func DefaultBuildHandlerChain(apiHandler http.Handler, c *Config) http.Handler {
    handler = genericapifilters.WithRequestDeadline(handler, c.AuditBackend, c.AuditPolicyRuleEvaluator,
        c.LongRunningFunc, c.Serializer, c.RequestTimeout)
    handler = genericfilters.WithWaitGroup(handler, c.LongRunningFunc, c.NonLongRunningRequestWaitGroup)
    if c.ShutdownWatchTerminationGracePeriod > 0 {
        handler = genericfilters.WithWatchTerminationDuringShutdown(handler, c.lifecycleSignals, c.WatchRequestWaitGroup)
    }
    if c.SecureServing != nil && !c.SecureServing.DisableHTTP2 && c.GoawayChance > 0 {
        handler = genericfilters.WithProbabilisticGoaway(handler, c.GoawayChance)
    }
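A short sketch of opting into watch draining at the Config level, assuming the generic apiserver packages from this diff; per the DefaultBuildHandlerChain hunk above, any positive grace period installs the termination filter:

package example

import (
    "time"

    "k8s.io/apimachinery/pkg/runtime/serializer"
    genericapiserver "k8s.io/apiserver/pkg/server"
)

// newConfigWithWatchDraining returns a config that opts into watch draining;
// codecs is whatever CodecFactory the caller already uses.
func newConfigWithWatchDraining(codecs serializer.CodecFactory) *genericapiserver.Config {
    cfg := genericapiserver.NewConfig(codecs)
    // Any positive value enables tracking and rate-limited draining of
    // active watch requests during shutdown (zero keeps the old behavior).
    cfg.ShutdownWatchTerminationGracePeriod = 10 * time.Second
    return cfg
}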
@@ -0,0 +1,62 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package filters

import (
    "errors"
    "net/http"

    "k8s.io/apiserver/pkg/endpoints/handlers/responsewriters"
    apirequest "k8s.io/apiserver/pkg/endpoints/request"
    "k8s.io/klog/v2"
)

func WithWatchTerminationDuringShutdown(handler http.Handler, termination apirequest.ServerShutdownSignal, wg RequestWaitGroup) http.Handler {
    if termination == nil || wg == nil {
        klog.Warningf("watch termination during shutdown not attached to the handler chain")
        return handler
    }
    return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        ctx := req.Context()
        requestInfo, ok := apirequest.RequestInfoFrom(ctx)
        if !ok {
            // if this happens, the handler chain isn't set up correctly because there is no request info
            responsewriters.InternalError(w, req, errors.New("no RequestInfo found in the context"))
            return
        }
        if !watchVerbs.Has(requestInfo.Verb) {
            handler.ServeHTTP(w, req)
            return
        }

        if err := wg.Add(1); err != nil {
            // When the apiserver is shutting down, signal clients to retry.
            // There is a good chance the client hit a different server, so a tight retry is good for client responsiveness.
            waitGroupWriteRetryAfterToResponse(w)
            return
        }

        // attach ServerShutdownSignal to the watch request so that the
        // watch handler loop can return as soon as the server signals
        // that it is shutting down.
        ctx = apirequest.WithServerShutdownSignal(req.Context(), termination)
        req = req.WithContext(ctx)

        defer wg.Done()
        handler.ServeHTTP(w, req)
    })
}
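Because the filter hard-fails without RequestInfo in the context, it has to sit inside (after) the WithRequestInfo filter in the chain. A simplified, dependency-free sketch of the same pattern, with hypothetical names (the real filter is above):

package example

import (
    "net/http"
    "sync"
)

// watchGate is a toy stand-in for RateLimitedSafeWaitGroup: Add fails once
// draining has begun, which is exactly the condition the filter turns into
// a retryable 503.
type watchGate struct {
    mu       sync.Mutex
    draining bool
    active   int
}

func (g *watchGate) Add() error {
    g.mu.Lock()
    defer g.mu.Unlock()
    if g.draining {
        return http.ErrServerClosed // any sentinel error works here
    }
    g.active++
    return nil
}

func (g *watchGate) Done() {
    g.mu.Lock()
    defer g.mu.Unlock()
    g.active--
}

// withWatchGate mirrors the filter's shape: non-watch requests pass through
// untouched, watch requests are tracked, and watches arriving mid-drain get
// a 503 with a tight Retry-After.
func withWatchGate(next http.Handler, gate *watchGate, isWatch func(*http.Request) bool) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        if !isWatch(req) {
            next.ServeHTTP(w, req)
            return
        }
        if err := gate.Add(); err != nil {
            w.Header().Set("Retry-After", "1")
            http.Error(w, "apiserver is shutting down", http.StatusServiceUnavailable)
            return
        }
        defer gate.Done()
        next.ServeHTTP(w, req)
    })
}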
@@ -0,0 +1,166 @@
/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package filters

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "testing"

    "k8s.io/apimachinery/pkg/runtime"
    apirequest "k8s.io/apiserver/pkg/endpoints/request"
)

func TestWithWatchTerminationDuringShutdown(t *testing.T) {
    tests := []struct {
        name                    string
        requestInfo             *apirequest.RequestInfo
        signal                  *fakeServerShutdownSignal
        wg                      *fakeRequestWaitGroup
        handlerInvoked          int
        statusCodeExpected      int
        retryAfterExpected      bool
        wgInvokedExpected       int
        signalAttachedToContext bool
    }{
        {
            name:               "no RequestInfo attached to request context",
            handlerInvoked:     0,
            statusCodeExpected: http.StatusInternalServerError,
        },
        {
            name:               "request is not a WATCH, not added into wait group",
            requestInfo:        &apirequest.RequestInfo{Verb: "get"},
            handlerInvoked:     1,
            statusCodeExpected: http.StatusOK,
        },
        {
            name:                    "request is a WATCH, wait group is in waiting mode",
            requestInfo:             &apirequest.RequestInfo{Verb: "watch"},
            wg:                      &fakeRequestWaitGroup{waiting: true},
            handlerInvoked:          0,
            signalAttachedToContext: false,
            wgInvokedExpected:       1,
            retryAfterExpected:      true,
            statusCodeExpected:      http.StatusServiceUnavailable,
        },
        {
            name:                    "request is a WATCH, wait group is accepting",
            requestInfo:             &apirequest.RequestInfo{Verb: "watch"},
            wg:                      &fakeRequestWaitGroup{},
            signal:                  &fakeServerShutdownSignal{},
            wgInvokedExpected:       1,
            signalAttachedToContext: true,
            handlerInvoked:          1,
            statusCodeExpected:      http.StatusOK,
        },
    }

    for _, test := range tests {
        t.Run(test.name, func(t *testing.T) {
            var (
                handlerInvokedGot int
                signalGot         *fakeServerShutdownSignal
            )
            delegate := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
                handlerInvokedGot++
                if signal := apirequest.ServerShutdownSignalFrom(req.Context()); signal != nil {
                    signalGot, _ = signal.(*fakeServerShutdownSignal)
                }
                w.WriteHeader(http.StatusOK)
            })

            handler := WithWatchTerminationDuringShutdown(delegate, test.signal, test.wg)

            req, err := http.NewRequest(http.MethodGet, "/apis/groups.k8s.io/v1/namespaces", nil)
            if err != nil {
                t.Fatalf("failed to create new http request - %v", err)
            }
            if test.requestInfo != nil {
                req = req.WithContext(apirequest.WithRequestInfo(req.Context(), test.requestInfo))
            }

            w := httptest.NewRecorder()
            // NewRecorder defaults Code to 200; reset it so a handler that
            // never calls WriteHeader is detectable.
            w.Code = 0
            handler.ServeHTTP(w, req)
            responseGot := w.Result()

            if test.handlerInvoked != handlerInvokedGot {
                t.Errorf("expected the handler to be invoked: %d times, but got: %d", test.handlerInvoked, handlerInvokedGot)
            }
            if test.statusCodeExpected != responseGot.StatusCode {
                t.Errorf("expected status code: %d, but got: %d", test.statusCodeExpected, w.Result().StatusCode)
            }
            retryAfterGot := retryAfterSent(responseGot)
            if test.retryAfterExpected != retryAfterGot {
                t.Errorf("expected retry-after: %t, but got: %t, response: %#v", test.retryAfterExpected, retryAfterGot, responseGot)
            }

            switch {
            case test.signalAttachedToContext:
                if test.signal == nil || test.signal != signalGot {
                    t.Errorf("expected request context to have server shutdown signal: %p, but got: %p", test.signal, signalGot)
                }
            default:
                if signalGot != nil {
                    t.Errorf("expected request context to not have server shutdown signal: %p, but got: %p", test.signal, signalGot)
                }
            }
            if test.wg == nil {
                return
            }
            if test.wg.inflight != 0 {
                t.Errorf("expected wait group inflight to be zero, but got: %d", test.wg.inflight)
            }
            if test.wgInvokedExpected != test.wg.invoked {
                t.Errorf("expected wait group Add to be invoked: %d times, but got: %d", test.wgInvokedExpected, test.wg.invoked)
            }
        })
    }
}

type fakeServerShutdownSignal struct{}

func (fakeServerShutdownSignal) ShuttingDown() <-chan struct{} { return nil }

type fakeRequestWaitGroup struct {
    waiting           bool
    invoked, inflight int
}

func (f *fakeRequestWaitGroup) Add(delta int) error {
    f.invoked++
    if f.waiting {
        return fmt.Errorf("waitgroup is in waiting mode")
    }
    f.inflight += delta
    return nil
}
func (f *fakeRequestWaitGroup) Done() { f.inflight-- }

func retryAfterSent(resp *http.Response) bool {
    switch {
    case resp.StatusCode == http.StatusServiceUnavailable &&
        resp.Header.Get("Retry-After") == "1" &&
        resp.Header.Get("Content-Type") == runtime.ContentTypeJSON &&
        resp.Header.Get("X-Content-Type-Options") == "nosniff":
        return true
    default:
        return false
    }
}
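The test only asserts the shape of the retry response (503, Retry-After: 1, JSON content type, nosniff); the actual writer is the unexported waitGroupWriteRetryAfterToResponse helper, which is not part of this diff. A hedged sketch of a writer that would satisfy retryAfterSent:

package example

import "net/http"

// writeRetryAfter emits a response matching what retryAfterSent checks: 503
// with Retry-After: 1, a JSON content type ("application/json", the value of
// runtime.ContentTypeJSON), and nosniff. This is an illustration, not the
// real waitGroupWriteRetryAfterToResponse.
func writeRetryAfter(w http.ResponseWriter) {
    w.Header().Set("Retry-After", "1")
    w.Header().Set("Content-Type", "application/json")
    w.Header().Set("X-Content-Type-Options", "nosniff")
    w.WriteHeader(http.StatusServiceUnavailable)
}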
@@ -17,6 +17,7 @@ limitations under the License.
package server

import (
    "context"
    "fmt"
    "net/http"
    gpath "path"
@@ -26,6 +27,7 @@ import (

    systemd "github.com/coreos/go-systemd/v22/daemon"

    "golang.org/x/time/rate"
    apidiscoveryv2beta1 "k8s.io/api/apidiscovery/v2beta1"
    "k8s.io/apimachinery/pkg/api/meta"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -221,6 +223,10 @@ type GenericAPIServer struct {
    // handlers associated with non long-running requests
    // to complete while the server is shutting down.
    NonLongRunningRequestWaitGroup *utilwaitgroup.SafeWaitGroup
    // WatchRequestWaitGroup allows us to wait for all chain
    // handlers associated with active watch requests to
    // complete while the server is shutting down.
    WatchRequestWaitGroup *utilwaitgroup.RateLimitedSafeWaitGroup

    // ShutdownDelayDuration allows blocking shutdown for some time, e.g. until endpoints pointing to this API server
    // have converged on all nodes. During this time, the API server keeps serving, /healthz will return 200,
@@ -260,6 +266,23 @@ type GenericAPIServer struct {
    // If enabled, after ShutdownDelayDuration elapses, any incoming request is
    // rejected with a 429 status code and a 'Retry-After' response.
    ShutdownSendRetryAfter bool

    // ShutdownWatchTerminationGracePeriod, if set to a positive value,
    // is the maximum duration the apiserver will wait for all active
    // watch request(s) to drain.
    // Once this grace period elapses, the apiserver will no longer
    // wait for any active watch request(s) in flight to drain; it will
    // proceed to the next step in the graceful server shutdown process.
    // If set to a positive value, the apiserver will keep track of the
    // number of active watch request(s) in flight and, during shutdown,
    // it will wait, at most, for the specified duration and allow these
    // active watch requests to drain with some rate limiting in effect.
    // The default is zero, which implies the apiserver will not keep
    // track of active watch request(s) in flight and will not wait
    // for them to drain; this maintains backward compatibility.
    // This grace period is orthogonal to other grace periods, and
    // it is not overridden by any other grace period.
    ShutdownWatchTerminationGracePeriod time.Duration
}

// DelegationTarget is an interface which allows for composition of API servers with top level handling that works
@@ -447,18 +470,22 @@ func (s *GenericAPIServer) PrepareRun() preparedGenericAPIServer {
// | NotAcceptingNewRequest (notAcceptingNewRequestCh)
// |                  |
// |      |-----------|-----------|
// |      |                       |
// | [without                 [with
// |  ShutdownSendRetryAfter]  ShutdownSendRetryAfter]
// |      |                       |
// |      |-----------|-----------|
// |                  |
// |      |-----------|------------------------------|
// |      |                                          |
// | (NonLongRunningRequestWaitGroup::Wait)   (WatchRequestWaitGroup::Wait)
// |      |                                          |
// |      |-----------|------------------------------|
// |                  |
// |     InFlightRequestsDrained (drainedCh)
// |                  |
// |      |-----------|-----------|
// |      |                       |
// | stopHttpServerCh    (AuditBackend::Shutdown())
// |      |
@@ -576,9 +603,11 @@ func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
        <-preShutdownHooksHasStoppedCh.Signaled()
    }()

    // wait for all in-flight non-long running requests to finish
    nonLongRunningRequestDrainedCh := make(chan struct{})
    go func() {
        defer klog.V(1).InfoS("[graceful-termination] shutdown event", "name", drainedCh.Name())
        defer drainedCh.Signal()
        defer close(nonLongRunningRequestDrainedCh)
        defer klog.V(1).Info("[graceful-termination] in-flight non long-running request(s) have drained")

        // wait for the delayed stopCh before closing the handler chain (it rejects everything after Wait has been called).
        <-notAcceptingNewRequestCh.Signaled()
@@ -599,6 +628,47 @@ func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
        s.NonLongRunningRequestWaitGroup.Wait()
    }()

    // wait for all in-flight watches to finish
    activeWatchesDrainedCh := make(chan struct{})
    go func() {
        defer close(activeWatchesDrainedCh)

        <-notAcceptingNewRequestCh.Signaled()
        if s.ShutdownWatchTerminationGracePeriod <= time.Duration(0) {
            klog.V(1).InfoS("[graceful-termination] not going to wait for active watch request(s) to drain")
            return
        }

        // Wait for all active watches to finish
        grace := s.ShutdownWatchTerminationGracePeriod
        activeBefore, activeAfter, err := s.WatchRequestWaitGroup.Wait(func(count int) (utilwaitgroup.RateLimiter, context.Context, context.CancelFunc) {
            qps := float64(count) / grace.Seconds()
            // TODO: we don't want the QPS (max requests drained per second) to
            //  get below a certain floor value, since we want the server to
            //  drain the active watch requests as soon as possible.
            //  For now, it's hard coded to 200, and it is subject to change
            //  based on the result from the scale testing.
            if qps < 200 {
                qps = 200
            }

            ctx, cancel := context.WithTimeout(context.Background(), grace)
            // We don't expect more than one token to be consumed
            // in a single Wait call, so setting burst to 1.
            return rate.NewLimiter(rate.Limit(qps), 1), ctx, cancel
        })
        klog.V(1).InfoS("[graceful-termination] active watch request(s) have drained",
            "duration", grace, "activeWatchesBefore", activeBefore, "activeWatchesAfter", activeAfter, "error", err)
    }()

    go func() {
        defer klog.V(1).InfoS("[graceful-termination] shutdown event", "name", drainedCh.Name())
        defer drainedCh.Signal()

        <-nonLongRunningRequestDrainedCh
        <-activeWatchesDrainedCh
    }()

    klog.V(1).Info("[graceful-termination] waiting for shutdown to be initiated")
    <-stopCh
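A worked example of the limiter math in the Wait callback above: 5000 active watches over a 2s grace period yields a drain rate of 2500 watches/s, while 100 watches over 10s would compute to 10/s and get floored to 200/s. A small sketch (assumed helper name, not from the PR) that mirrors the computation:

package example

import "golang.org/x/time/rate"

// drainQPS spreads the active watch count across the grace period, but
// never drains slower than the hard-coded 200/s floor from the hunk above.
func drainQPS(count int, graceSeconds float64) rate.Limit {
    qps := float64(count) / graceSeconds
    if qps < 200 {
        qps = 200
    }
    return rate.Limit(qps)
}

// drainQPS(5000, 2) == 2500
// drainQPS(100, 10) == 200 (floored)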
@@ -38,6 +38,7 @@ import (
    auditinternal "k8s.io/apiserver/pkg/apis/audit"
    "k8s.io/apiserver/pkg/audit"
    "k8s.io/apiserver/pkg/authorization/authorizer"
    apirequest "k8s.io/apiserver/pkg/endpoints/request"
    "k8s.io/apiserver/pkg/server/dynamiccertificates"
    "k8s.io/klog/v2"

@@ -147,7 +148,7 @@ func newSignalInterceptingTestStep() *signalInterceptingTestStep {
// | close(stopHttpServerCh)        NonLongRunningRequestWaitGroup.Wait()
// |        |                                     |
// | server.Shutdown(timeout=60s)                 |
// |        |                                     |
// |        |                       WatchRequestWaitGroup.Wait()
// | stop listener (net/http)                     |
// |        |                                     |
// |        |-------------------------------------|
@@ -176,8 +177,10 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationDisabled(t
    connReusingClient := newClient(false)
    doer := setupDoer(t, s.SecureServingInfo)

    // handler for a request that we want to keep in flight through to the end
    inflightRequest := setupInFlightReuestHandler(s)
    // handler for a non long-running and a watch request that
    // we want to keep in flight through to the end.
    inflightNonLongRunning := setupInFlightNonLongRunningRequestHandler(s)
    inflightWatch := setupInFlightWatchRequestHandler(s)

    // API calls from the pre-shutdown hook(s) must succeed up to
    // the point where the HTTP server is shut down.
@@ -204,10 +207,13 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationDisabled(t
    }()
    waitForAPIServerStarted(t, doer)

    // fire a request now so it is in-flight on the server now, and
    // we will unblock it after ShutdownDelayDuration elapses
    inflightRequest.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightRequest.startedCh, "in-flight request did not reach the server")
    // fire the non long-running and the watch request so they are
    // in flight on the server now, and we will unblock them
    // after ShutdownDelayDuration elapses.
    inflightNonLongRunning.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightNonLongRunning.startedCh, "in-flight non long-running request did not reach the server")
    inflightWatch.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightWatch.startedCh, "in-flight watch request did not reach the server")

    // /readyz should return OK
    resultGot := doer.Do(newClient(true), func(httptrace.GotConnInfo) {}, "/readyz", time.Second)
@@ -300,13 +306,21 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationDisabled(t
        t.Errorf("Expected error %v, but got: %v %v", syscall.ECONNREFUSED, resultGot.err, resultGot.response)
    }

    // the server has stopped listening but we still have a request
    // in flight, let it unblock and we expect the request to succeed.
    inFlightResultGot := inflightRequest.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inFlightResultGot, http.StatusOK); err != nil {
    // the server has stopped listening but we still have a non long-running,
    // and a watch request in flight, unblock both of these, and we expect
    // the requests to return an appropriate response to the caller.
    inflightNonLongRunningResultGot := inflightNonLongRunning.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inflightNonLongRunningResultGot, http.StatusOK); err != nil {
        t.Errorf("%s", err.Error())
    }
    if err := assertRequestAudited(inFlightResultGot, fakeAudit); err != nil {
    if err := assertRequestAudited(inflightNonLongRunningResultGot, fakeAudit); err != nil {
        t.Errorf("%s", err.Error())
    }
    inflightWatchResultGot := inflightWatch.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inflightWatchResultGot, http.StatusOK); err != nil {
        t.Errorf("%s", err.Error())
    }
    if err := assertRequestAudited(inflightWatchResultGot, fakeAudit); err != nil {
        t.Errorf("%s", err.Error())
    }

@@ -359,6 +373,8 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationDisabled(t
// |    |
// |    NonLongRunningRequestWaitGroup.Wait()
// |    |
// |    WatchRequestWaitGroup.Wait()
// |    |
// |    (InFlightRequestsDrained)
// |    |
// |    |
@@ -384,8 +400,10 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationEnabled(t
    connReusingClient := newClient(false)
    doer := setupDoer(t, s.SecureServingInfo)

    // handler for a request that we want to keep in flight through to the end
    inflightRequest := setupInFlightReuestHandler(s)
    // handler for a non long-running and a watch request that
    // we want to keep in flight through to the end.
    inflightNonLongRunning := setupInFlightNonLongRunningRequestHandler(s)
    inflightWatch := setupInFlightWatchRequestHandler(s)

    // API calls from the pre-shutdown hook(s) must succeed up to
    // the point where the HTTP server is shut down.
@@ -412,10 +430,13 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationEnabled(t
    }()
    waitForAPIServerStarted(t, doer)

    // fire a request now so it is in-flight on the server now, and
    // we will unblock it after ShutdownDelayDuration elapses
    inflightRequest.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightRequest.startedCh, "in-flight request did not reach the server")
    // fire the non long-running and the watch request so they are
    // in flight on the server now, and we will unblock them
    // after ShutdownDelayDuration elapses.
    inflightNonLongRunning.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightNonLongRunning.startedCh, "in-flight request did not reach the server")
    inflightWatch.launch(doer, connReusingClient)
    waitForeverUntil(t, inflightWatch.startedCh, "in-flight watch request did not reach the server")

    // /readyz should return OK
    resultGot := doer.Do(newClient(true), func(httptrace.GotConnInfo) {}, "/readyz", time.Second)
@@ -487,12 +508,21 @@ func TestGracefulTerminationWithKeepListeningDuringGracefulTerminationEnabled(t
        t.Errorf("%s", err.Error())
    }

    // we still have a request in flight, let it unblock and we expect the request to succeed.
    inFlightResultGot := inflightRequest.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inFlightResultGot, http.StatusOK); err != nil {
    // we still have a non long-running, and a watch request in flight,
    // unblock both of these, and we expect the requests
    // to return an appropriate response to the caller.
    inflightNonLongRunningResultGot := inflightNonLongRunning.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inflightNonLongRunningResultGot, http.StatusOK); err != nil {
        t.Errorf("%s", err.Error())
    }
    if err := assertRequestAudited(inFlightResultGot, fakeAudit); err != nil {
    if err := assertRequestAudited(inflightNonLongRunningResultGot, fakeAudit); err != nil {
        t.Errorf("%s", err.Error())
    }
    inflightWatchResultGot := inflightWatch.unblockAndWaitForResult(t)
    if err := assertResponseStatusCode(inflightWatchResultGot, http.StatusOK); err != nil {
        t.Errorf("%s", err.Error())
    }
    if err := assertRequestAudited(inflightWatchResultGot, fakeAudit); err != nil {
        t.Errorf("%s", err.Error())
    }

@@ -663,12 +693,12 @@ type inFlightRequest struct {
    url       string
}

func setupInFlightReuestHandler(s *GenericAPIServer) *inFlightRequest {
func setupInFlightNonLongRunningRequestHandler(s *GenericAPIServer) *inFlightRequest {
    inflight := &inFlightRequest{
        blockedCh: make(chan struct{}),
        startedCh: make(chan struct{}),
        resultCh:  make(chan result),
        url:       "/in-flight-request-as-designed",
        url:       "/in-flight-non-long-running-request-as-designed",
    }
    handler := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        close(inflight.startedCh)
@@ -680,6 +710,37 @@ func setupInFlightReuestHandler(s *GenericAPIServer) *inFlightRequest {
    return inflight
}

func setupInFlightWatchRequestHandler(s *GenericAPIServer) *inFlightRequest {
    inflight := &inFlightRequest{
        blockedCh: make(chan struct{}),
        startedCh: make(chan struct{}),
        resultCh:  make(chan result),
        url:       "/apis/watches.group/v1/namespaces/foo/bar?watch=true",
    }

    handler := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        close(inflight.startedCh)
        // this request handler blocks until we deliberately unblock it.
        <-inflight.blockedCh

        // this simulates a watch well enough for our test
        signals := apirequest.ServerShutdownSignalFrom(req.Context())
        if signals == nil {
            w.WriteHeader(http.StatusInternalServerError)
            return
        }
        select {
        case <-signals.ShuttingDown():
            w.WriteHeader(http.StatusOK)
            return
        }

        w.WriteHeader(http.StatusInternalServerError)
    })
    s.Handler.NonGoRestfulMux.Handle("/apis/watches.group/v1/namespaces/foo/bar", handler)
    return inflight
}

func (ifr *inFlightRequest) launch(doer doer, client *http.Client) {
    go func() {
        result := doer.Do(client, func(httptrace.GotConnInfo) {}, ifr.url, 0)
@@ -950,6 +1011,8 @@ func newGenericAPIServer(t *testing.T, fAudit *fakeAudit, keepListening bool) *G
    config, _ := setUp(t)
    config.ShutdownDelayDuration = 100 * time.Millisecond
    config.ShutdownSendRetryAfter = keepListening
    // we enable watch draining, any positive value will do that
    config.ShutdownWatchTerminationGracePeriod = 2 * time.Second
    config.AuditPolicyRuleEvaluator = fAudit
    config.AuditBackend = fAudit
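During shutdown the simulated watch above completes with 200 once ShuttingDown fires, while a brand-new watch would get the 503 plus Retry-After from the termination filter. A hedged sketch (plain net/http, not client-go) of a client re-establishing a watch while honoring that Retry-After:

package example

import (
    "net/http"
    "strconv"
    "time"
)

// reestablishWatch retries a watch URL, honoring the Retry-After header the
// apiserver sends while it drains; url and client are caller-supplied, and
// a production client would also cap the number of attempts.
func reestablishWatch(client *http.Client, url string) (*http.Response, error) {
    for {
        resp, err := client.Get(url)
        if err != nil {
            return nil, err
        }
        if resp.StatusCode != http.StatusServiceUnavailable {
            return resp, nil
        }
        resp.Body.Close()
        delay := time.Second
        if s := resp.Header.Get("Retry-After"); s != "" {
            if secs, err := strconv.Atoi(s); err == nil {
                delay = time.Duration(secs) * time.Second
            }
        }
        time.Sleep(delay) // the retry likely lands on another apiserver instance
    }
}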
@@ -146,6 +146,14 @@ type lifecycleSignals struct {
    MuxAndDiscoveryComplete lifecycleSignal
}

// ShuttingDown returns the lifecycle signal that is signaled when
// the server is not accepting any new requests.
// This is the lifecycle event that is exported to the request handler
// logic to indicate that the server is shutting down.
func (s lifecycleSignals) ShuttingDown() <-chan struct{} {
    return s.NotAcceptingNewRequest.Signaled()
}

// newLifecycleSignals returns an instance of lifecycleSignals interface to be used
// to coordinate the lifecycle of the apiserver
func newLifecycleSignals() lifecycleSignals {
|
||||
// If enabled, after ShutdownDelayDuration elapses, any incoming request is
|
||||
// rejected with a 429 status code and a 'Retry-After' response.
|
||||
ShutdownSendRetryAfter bool
|
||||
|
||||
// ShutdownWatchTerminationGracePeriod, if set to a positive value,
|
||||
// is the maximum duration the apiserver will wait for all active
|
||||
// watch request(s) to drain.
|
||||
// Once this grace period elapses, the apiserver will no longer
|
||||
// wait for any active watch request(s) in flight to drain, it will
|
||||
// proceed to the next step in the graceful server shutdown process.
|
||||
// If set to a positive value, the apiserver will keep track of the
|
||||
// number of active watch request(s) in flight and during shutdown
|
||||
// it will wait, at most, for the specified duration and allow these
|
||||
// active watch requests to drain with some rate limiting in effect.
|
||||
// The default is zero, which implies the apiserver will not keep
|
||||
// track of active watch request(s) in flight and will not wait
|
||||
// for them to drain, this maintains backward compatibility.
|
||||
// This grace period is orthogonal to other grace periods, and
|
||||
// it is not overridden by any other grace period.
|
||||
ShutdownWatchTerminationGracePeriod time.Duration
|
||||
}
|
||||
|
||||
func NewServerRunOptions() *ServerRunOptions {
|
||||
@ -84,6 +101,7 @@ func NewServerRunOptions() *ServerRunOptions {
|
||||
LivezGracePeriod: defaults.LivezGracePeriod,
|
||||
MinRequestTimeout: defaults.MinRequestTimeout,
|
||||
ShutdownDelayDuration: defaults.ShutdownDelayDuration,
|
||||
ShutdownWatchTerminationGracePeriod: defaults.ShutdownWatchTerminationGracePeriod,
|
||||
JSONPatchMaxCopyBytes: defaults.JSONPatchMaxCopyBytes,
|
||||
MaxRequestBodyBytes: defaults.MaxRequestBodyBytes,
|
||||
EnablePriorityAndFairness: true,
|
||||
@ -107,6 +125,7 @@ func (s *ServerRunOptions) ApplyTo(c *server.Config) error {
|
||||
c.MaxRequestBodyBytes = s.MaxRequestBodyBytes
|
||||
c.PublicAddress = s.AdvertiseAddress
|
||||
c.ShutdownSendRetryAfter = s.ShutdownSendRetryAfter
|
||||
c.ShutdownWatchTerminationGracePeriod = s.ShutdownWatchTerminationGracePeriod
|
||||
|
||||
return nil
|
||||
}
|
||||
@ -160,6 +179,10 @@ func (s *ServerRunOptions) Validate() []error {
|
||||
errors = append(errors, fmt.Errorf("--shutdown-delay-duration can not be negative value"))
|
||||
}
|
||||
|
||||
if s.ShutdownWatchTerminationGracePeriod < 0 {
|
||||
errors = append(errors, fmt.Errorf("shutdown-watch-termination-grace-period, if provided, can not be a negative value"))
|
||||
}
|
||||
|
||||
if s.JSONPatchMaxCopyBytes < 0 {
|
||||
errors = append(errors, fmt.Errorf("ServerRunOptions.JSONPatchMaxCopyBytes can not be negative value"))
|
||||
}
|
||||
@ -315,5 +338,9 @@ func (s *ServerRunOptions) AddUniversalFlags(fs *pflag.FlagSet) {
|
||||
"during this window all incoming requests will be rejected with a status code 429 and a 'Retry-After' response header, "+
|
||||
"in addition 'Connection: close' response header is set in order to tear down the TCP connection when idle.")
|
||||
|
||||
fs.DurationVar(&s.ShutdownWatchTerminationGracePeriod, "shutdown-watch-termination-grace-period", s.ShutdownWatchTerminationGracePeriod, ""+
|
||||
"This option, if set, represents the maximum amount of grace period the apiserver will wait "+
|
||||
"for active watch request(s) to drain during the graceful server shutdown window.")
|
||||
|
||||
utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
|
||||
}
|
||||
|
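A sketch (assuming the options package from this hunk) of the full plumbing: the --shutdown-watch-termination-grace-period flag populates ServerRunOptions, Validate rejects negative values, and ApplyTo copies the duration onto the server Config:

package example

import (
    "time"

    genericoptions "k8s.io/apiserver/pkg/server/options"
)

// applyWatchDrainOption wires the grace period through the options path,
// mirroring what the flag does when parsed from the command line.
func applyWatchDrainOption(grace time.Duration) (*genericoptions.ServerRunOptions, error) {
    o := genericoptions.NewServerRunOptions()
    o.ShutdownWatchTerminationGracePeriod = grace
    if errs := o.Validate(); len(errs) > 0 {
        return nil, errs[0] // e.g. a negative grace period is rejected
    }
    return o, nil
}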
@@ -261,3 +261,52 @@ func TestValidateCorsAllowedOriginList(t *testing.T) {
        }
    }
}

func TestServerRunOptionsWithShutdownWatchTerminationGracePeriod(t *testing.T) {
    tests := []struct {
        name             string
        optionsFn        func() *ServerRunOptions
        errShouldContain string
    }{
        {
            name: "default should be valid",
            optionsFn: func() *ServerRunOptions {
                return NewServerRunOptions()
            },
        },
        {
            name: "negative not allowed",
            optionsFn: func() *ServerRunOptions {
                o := NewServerRunOptions()
                o.ShutdownWatchTerminationGracePeriod = -time.Second
                return o
            },
            errShouldContain: "shutdown-watch-termination-grace-period, if provided, can not be a negative value",
        },
    }

    for _, test := range tests {
        t.Run(test.name, func(t *testing.T) {
            options := test.optionsFn()
            errsGot := options.Validate()
            switch {
            case len(test.errShouldContain) == 0:
                if len(errsGot) != 0 {
                    t.Errorf("expected no error, but got: %v", errsGot)
                }
            default:
                if len(errsGot) == 0 ||
                    !strings.Contains(utilerrors.NewAggregate(errsGot).Error(), test.errShouldContain) {
                    t.Errorf("expected error to contain: %s, but got: %v", test.errShouldContain, errsGot)
                }
            }
        })
    }

    t.Run("default should be zero", func(t *testing.T) {
        options := NewServerRunOptions()
        if options.ShutdownWatchTerminationGracePeriod != time.Duration(0) {
            t.Errorf("expected default of ShutdownWatchTerminationGracePeriod to be zero, but got: %s", options.ShutdownWatchTerminationGracePeriod)
        }
    })
}