Optimize APF support for watch initialization to fix the pod startup time regression.

wojtekt 2021-07-13 10:22:30 +02:00
parent 43c9e8e7a1
commit ef435b85b4
3 changed files with 145 additions and 89 deletions

View File

@@ -17,6 +17,7 @@ limitations under the License.
 package filters
 
 import (
+    "context"
     "fmt"
     "net/http"
     "runtime"
@@ -113,30 +114,76 @@ func WithPriorityAndFairness(
                 waitingMark.recordReadOnly(int(atomic.AddInt32(&atomicReadOnlyWaiting, delta)))
             }
         }
-        var resultCh chan interface{}
-        var forgetWatch utilflowcontrol.ForgetWatchFunc
-        if isWatchRequest {
-            resultCh = make(chan interface{})
-        }
+        queueNote := func(inQueue bool) {
+            if inQueue {
+                noteWaitingDelta(1)
+            } else {
+                noteWaitingDelta(-1)
+            }
+        }
+
+        // find the estimated "width" of the request
+        // TODO: Maybe just make it costEstimator and let it return additionalLatency too for the watch?
+        // TODO: Estimate cost should also take fcIfc.GetWatchCount(requestInfo) as a parameter.
+        width := widthEstimator.EstimateWidth(r)
+        digest := utilflowcontrol.RequestDigest{RequestInfo: requestInfo, User: user, Width: width}
+
+        if isWatchRequest {
+            // This channel blocks calling handler.ServeHTTP() until closed, and is closed inside execute().
+            // If APF rejects the request, it is never closed.
+            shouldStartWatchCh := make(chan struct{})
+
+            watchInitializationSignal := newInitializationSignal()
+            // This wraps the request passed to handler.ServeHTTP(),
+            // setting a context that plumbs watchInitializationSignal to storage.
+            var watchReq *http.Request
+            // This is set inside execute(), prior to closing shouldStartWatchCh.
+            // If the request is rejected by APF it is left nil.
+            var forgetWatch utilflowcontrol.ForgetWatchFunc
+
+            defer func() {
+                // Protect from the situation when the request will not reach the storage layer
+                // and the initialization signal will not be sent.
+                if watchInitializationSignal != nil {
+                    watchInitializationSignal.Signal()
+                }
+                // Forget the watcher if it was registered.
+                //
+                // This is race-free because by this point, one of the following occurred:
+                // case <-shouldStartWatchCh: execute() completed the assignment to forgetWatch
+                // case <-resultCh: Handle() completed, and Handle() does not return
+                //   while execute() is running
+                if forgetWatch != nil {
+                    forgetWatch()
+                }
+            }()
+
             execute := func() {
                 noteExecutingDelta(1)
                 defer noteExecutingDelta(-1)
                 served = true
-            innerCtx := ctx
-            innerReq := r
-            var watchInitializationSignal utilflowcontrol.InitializationSignal
-            if isWatchRequest {
-                watchInitializationSignal = newInitializationSignal()
-                innerCtx = utilflowcontrol.WithInitializationSignal(ctx, watchInitializationSignal)
-                innerReq = r.Clone(innerCtx)
-            }
                 setResponseHeaders(classification, w)
                 forgetWatch = fcIfc.RegisterWatch(requestInfo)
-            if isWatchRequest {
+
+                // Notify the main thread that we're ready to start the watch.
+                close(shouldStartWatchCh)
+
+                // Wait until the request is finished from the APF point of view
+                // (which is when its initialization is done).
+                watchInitializationSignal.Wait()
+            }
+
+            // Ensure that an item can be put to resultCh asynchronously.
+            resultCh := make(chan interface{}, 1)
+
+            // Call Handle in a separate goroutine.
+            // The reason for it is that from the APF point of view, the request processing
+            // finishes as soon as the watch is initialized (which is generally orders of
+            // magnitude faster than the watch request itself). This means that the Handle()
+            // call finishes much faster, and for performance reasons we want to reduce
+            // the number of running goroutines - so we run the shorter thing in a
+            // dedicated goroutine and the actual watch handler in the main one.
             go func() {
                 defer func() {
                     err := recover()
@@ -149,35 +196,60 @@ func WithPriorityAndFairness(
                         buf = buf[:runtime.Stack(buf, false)]
                         err = fmt.Sprintf("%v\n%s", err, buf)
                     }
+
+                    // Ensure that the result is put into resultCh independently of the panic.
                     resultCh <- err
                 }()
-                // Protect from the situations when request will not reach storage layer
-                // and the initialization signal will not be send.
-                defer watchInitializationSignal.Signal()
-
-                handler.ServeHTTP(w, innerReq)
+
+                // We create handleCtx with an explicit cancellation function.
+                // The reason for it is that Handle() underneath may start an additional goroutine
+                // that is blocked on context cancellation. However, from the APF point of view,
+                // we don't want to wait until the whole watch request is processed (which is
+                // when its context is actually cancelled) - we want to unblock the goroutine as
+                // soon as the request is processed from the APF point of view.
+                //
+                // Note that we explicitly do NOT call the actual handler using that context
+                // to avoid cancelling the request too early.
+                handleCtx, handleCtxCancel := context.WithCancel(ctx)
+                defer handleCtxCancel()
+
+                // Note that Handle will return irrespective of whether the request
+                // executes or is rejected. In the latter case, the function will return
+                // without calling the passed `execute` function.
+                fcIfc.Handle(handleCtx, digest, note, queueNote, execute)
             }()
-                watchInitializationSignal.Wait()
-            } else {
-                handler.ServeHTTP(w, innerReq)
-            }
-        }
-        // find the estimated "width" of the request
-        // TODO: Maybe just make it costEstimator and let it return additionalLatency too for the watch?
-        // TODO: Estimate cost should also take fcIfc.GetWatchCount(requestInfo) as a parameter.
-        width := widthEstimator.EstimateWidth(r)
-        digest := utilflowcontrol.RequestDigest{RequestInfo: requestInfo, User: user, Width: width}
-        fcIfc.Handle(ctx, digest, note, func(inQueue bool) {
-            if inQueue {
-                noteWaitingDelta(1)
-            } else {
-                noteWaitingDelta(-1)
-            }
-        }, execute)
+
+            select {
+            case <-shouldStartWatchCh:
+                watchCtx := utilflowcontrol.WithInitializationSignal(ctx, watchInitializationSignal)
+                watchReq = r.WithContext(watchCtx)
+                handler.ServeHTTP(w, watchReq)
+                // Protect from the situation when the request will not reach the storage layer
+                // and the initialization signal will not be sent.
+                // It has to happen before waiting on the resultCh below.
+                watchInitializationSignal.Signal()
+                // TODO: Consider finishing the request as soon as the Handle call panics.
+                if err := <-resultCh; err != nil {
+                    panic(err)
+                }
+            case err := <-resultCh:
+                if err != nil {
+                    panic(err)
+                }
+            }
+        } else {
+            execute := func() {
+                noteExecutingDelta(1)
+                defer noteExecutingDelta(-1)
+                served = true
+                setResponseHeaders(classification, w)
+                handler.ServeHTTP(w, r)
+            }
+
+            fcIfc.Handle(ctx, digest, note, queueNote, execute)
+        }
+
         if !served {
             setResponseHeaders(classification, w)
@@ -187,26 +259,8 @@ func WithPriorityAndFairness(
                 epmetrics.DroppedRequests.WithContext(ctx).WithLabelValues(epmetrics.ReadOnlyKind).Inc()
             }
             epmetrics.RecordRequestTermination(r, requestInfo, epmetrics.APIServerComponent, http.StatusTooManyRequests)
-            if isWatchRequest {
-                close(resultCh)
-            }
             tooManyRequests(r, w)
         }
-
-        // For watch requests, from the APF point of view the request is already
-        // finished at this point. However, that doesn't mean it is already finished
-        // from the non-APF point of view. So we need to wait here until the request is:
-        // 1) finished being processed or
-        // 2) rejected
-        if isWatchRequest {
-            if forgetWatch != nil {
-                forgetWatch()
-            }
-            err := <-resultCh
-            if err != nil {
-                panic(err)
-            }
-        }
     })
 }
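
As an aside for readers outside the apiserver codebase, the coordination introduced above can be summarized in a minimal, self-contained Go sketch: APF admission runs in a side goroutine, the long-lived watch handler stays on the request goroutine, and the two meet on a "start watch" channel, an initialization signal, and a buffered result channel. The names initializationSignal, handleWatch, apfHandle, and serveWatch below are illustrative stand-ins for utilflowcontrol.InitializationSignal, fcIfc.Handle, and handler.ServeHTTP, not the real implementation.

package main

import (
	"fmt"
	"sync"
)

// initializationSignal is a stand-in for utilflowcontrol.InitializationSignal:
// Signal() may be called any number of times; Wait() returns once Signal()
// has been called at least once.
type initializationSignal struct {
	once sync.Once
	done chan struct{}
}

func newInitializationSignal() *initializationSignal {
	return &initializationSignal{done: make(chan struct{})}
}

func (s *initializationSignal) Signal() { s.once.Do(func() { close(s.done) }) }
func (s *initializationSignal) Wait()   { <-s.done }

// handleWatch mirrors the structure of the new watch path. apfHandle stands in
// for fcIfc.Handle (it either calls execute or rejects without calling it);
// serveWatch stands in for handler.ServeHTTP and signals once the watch is
// initialized.
func handleWatch(apfHandle func(execute func()), serveWatch func(*initializationSignal)) (served bool, panicVal interface{}) {
	shouldStartWatchCh := make(chan struct{})
	initSignal := newInitializationSignal()

	execute := func() {
		served = true
		// Let the request goroutine start the actual watch handler ...
		close(shouldStartWatchCh)
		// ... and stay "executing", from the APF point of view, only until the
		// watch is initialized, not until the long-running watch terminates.
		initSignal.Wait()
	}

	// Buffered so the admission goroutine can always deliver its result and exit.
	resultCh := make(chan interface{}, 1)
	go func() {
		defer func() { resultCh <- recover() }()
		apfHandle(execute) // returns once execute finished or the request was rejected
	}()

	select {
	case <-shouldStartWatchCh:
		// Admitted: run the watch handler in this goroutine.
		serveWatch(initSignal)
		// In case the handler never reached the storage layer.
		initSignal.Signal()
		panicVal = <-resultCh
	case panicVal = <-resultCh:
		// Rejected (or apfHandle panicked) before execute ran.
	}
	return served, panicVal
}

func main() {
	// Admitted request: execute is called, the watch runs, and the APF slot is
	// released as soon as the watch reports it is initialized.
	served, _ := handleWatch(
		func(execute func()) { execute() },
		func(sig *initializationSignal) {
			fmt.Println("watch initialized")
			sig.Signal()
		},
	)
	fmt.Println("admitted, served:", served)

	// Rejected request: execute is never called and the watch never starts.
	served, _ = handleWatch(
		func(execute func()) {},
		func(sig *initializationSignal) { fmt.Println("unreachable") },
	)
	fmt.Println("rejected, served:", served)
}

The sketch preserves the two details that matter in the change above: the result channel has capacity 1 so the short-lived admission goroutine never blocks on delivering its outcome, and the goroutine that is spawned is the short admission call rather than the watch itself, which keeps the number of long-lived goroutines per watch unchanged.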

View File

@@ -351,6 +351,23 @@ func TestApfExecuteMultipleRequests(t *testing.T) {
     })
 }
 
+func TestApfCancelWaitRequest(t *testing.T) {
+    epmetrics.Register()
+
+    server := newApfServerWithSingleRequest(t, decisionCancelWait)
+    defer server.Close()
+
+    if err := expectHTTPGet(fmt.Sprintf("%s/api/v1/namespaces/default", server.URL), http.StatusTooManyRequests); err != nil {
+        t.Error(err)
+    }
+
+    checkForExpectedMetrics(t, []string{
+        "apiserver_current_inflight_requests",
+        "apiserver_request_terminations_total",
+        "apiserver_dropped_requests_total",
+    })
+}
+
 type fakeWatchApfFilter struct {
     lock     sync.Mutex
     inflight int
@@ -556,23 +573,6 @@ func TestContextClosesOnRequestProcessed(t *testing.T) {
     wg.Wait()
 }
 
-func TestApfCancelWaitRequest(t *testing.T) {
-    epmetrics.Register()
-
-    server := newApfServerWithSingleRequest(t, decisionCancelWait)
-    defer server.Close()
-
-    if err := expectHTTPGet(fmt.Sprintf("%s/api/v1/namespaces/default", server.URL), http.StatusTooManyRequests); err != nil {
-        t.Error(err)
-    }
-
-    checkForExpectedMetrics(t, []string{
-        "apiserver_current_inflight_requests",
-        "apiserver_request_terminations_total",
-        "apiserver_dropped_requests_total",
-    })
-}
-
 type fakeFilterRequestDigest struct {
     *fakeApfFilter
     requestDigestGot *utilflowcontrol.RequestDigest

View File

@@ -49,6 +49,8 @@ type Interface interface {
     // that the request should be executed then `execute()` will be
     // invoked once to execute the request; otherwise `execute()` will
     // not be invoked.
+    // Handle() should never return while execute() is running, even if
+    // ctx is cancelled or times out.
     Handle(ctx context.Context,
         requestDigest RequestDigest,
         noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration),
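
The strengthened contract documented above is what the filter relies on when it reads forgetWatch in its deferred cleanup only after Handle() has returned. The toy sketch below shows a caller depending on that guarantee; the Handler interface and admitAll type are hypothetical stand-ins, not the real flowcontrol.Interface or any of its implementations.

package main

import (
	"context"
	"fmt"
)

// Handler is a trimmed-down mirror of the documented contract: Handle must
// not return while execute is still running, even if ctx is cancelled or
// times out.
type Handler interface {
	Handle(ctx context.Context, execute func())
}

// admitAll is a toy implementation that trivially honours the contract by
// calling execute synchronously, so execute has finished by the time Handle
// returns.
type admitAll struct{}

func (admitAll) Handle(_ context.Context, execute func()) { execute() }

func main() {
	var h Handler = admitAll{}

	// Mirrors how the filter uses forgetWatch: assigned inside execute(),
	// read only after Handle() has returned. The "Handle never returns while
	// execute is running" guarantee is what makes this read race-free without
	// extra synchronization.
	var cleanup func()
	h.Handle(context.Background(), func() {
		cleanup = func() { fmt.Println("watcher forgotten") }
	})
	if cleanup != nil {
		cleanup()
	}
}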