From 66587afb60b10569aa4ed35d53a4a91fcc40abd2 Mon Sep 17 00:00:00 2001 From: Abu Kashem Date: Fri, 9 Apr 2021 12:09:35 -0400 Subject: [PATCH] add more context to post timeout request activities --- .../endpoints/handlers/finisher/finisher.go | 73 ++++++++++- .../handlers/finisher/finisher_test.go | 120 ++++++++++++++++++ .../pkg/endpoints/metrics/metrics.go | 55 ++++++++ .../apiserver/pkg/server/filters/timeout.go | 17 ++- .../pkg/server/filters/timeout_test.go | 28 +++- 5 files changed, 278 insertions(+), 15 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher.go b/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher.go index 42b81c7f703..dd7651718b9 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher.go @@ -21,10 +21,14 @@ import ( "fmt" "net/http" goruntime "runtime" + "time" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apiserver/pkg/endpoints/metrics" + "k8s.io/klog/v2" ) // ResultFunc is a function that returns a rest result and can be run in a goroutine @@ -62,10 +66,27 @@ func (r *result) Return() (runtime.Object, error) { } } +// PostTimeoutLoggerFunc is a function that can be used to log the result returned +// by a ResultFunc after the request had timed out. +// timedOutAt is the time the request had been timed out. +// r is the result returned by the child goroutine. +type PostTimeoutLoggerFunc func(timedOutAt time.Time, r *result) + +const ( + // how much time the post-timeout receiver goroutine will wait for the sender + // (child goroutine executing ResultFunc) to send a result after the request. + // had timed out. + postTimeoutLoggerWait = 5 * time.Minute +) + // FinishRequest makes a given ResultFunc asynchronous and handles errors returned by the response. func FinishRequest(ctx context.Context, fn ResultFunc) (runtime.Object, error) { - // the channel needs to be buffered to prevent the goroutine below from hanging indefinitely - // when the select statement reads something other than the one the goroutine sends on. + return finishRequest(ctx, fn, postTimeoutLoggerWait, logPostTimeoutResult) +} + +func finishRequest(ctx context.Context, fn ResultFunc, postTimeoutWait time.Duration, postTimeoutLogger PostTimeoutLoggerFunc) (runtime.Object, error) { + // the channel needs to be buffered since the post-timeout receiver goroutine + // waits up to 5 minutes for the child goroutine to return. resultCh := make(chan *result, 1) go func() { @@ -104,6 +125,52 @@ func FinishRequest(ctx context.Context, fn ResultFunc) (runtime.Object, error) { case result := <-resultCh: return result.Return() case <-ctx.Done(): - return nil, errors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout %s", ctx.Err()), 0) + // we are going to send a timeout response to the caller, but the asynchronous goroutine + // (sender) is still executing the ResultFunc function. + // kick off a goroutine (receiver) here to wait for the sender (goroutine executing ResultFunc) + // to send the result and then log details of the result. + defer func() { + go func() { + timedOutAt := time.Now() + + var result *result + select { + case result = <-resultCh: + case <-time.After(postTimeoutWait): + // we will not wait forever, if we are here then we know that some sender + // goroutines are taking longer than postTimeoutWait. + } + postTimeoutLogger(timedOutAt, result) + }() + }() + return nil, errors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout - %s", ctx.Err()), 0) } } + +// logPostTimeoutResult logs a panic or an error from the result that the sender (goroutine that is +// executing the ResultFunc function) has sent to the receiver after the request had timed out. +// timedOutAt is the time the request had been timed out +func logPostTimeoutResult(timedOutAt time.Time, r *result) { + if r == nil { + // we are using r == nil to indicate that the child goroutine never returned a result. + metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceRestHandler, metrics.PostTimeoutHandlerPending) + klog.Errorf("FinishRequest: post-timeout activity, waited for %s, child goroutine has not returned yet", time.Since(timedOutAt)) + return + } + + var status string + switch { + case r.reason != nil: + // a non empty reason inside a result object indicates that there was a panic. + status = metrics.PostTimeoutHandlerPanic + case r.err != nil: + status = metrics.PostTimeoutHandlerError + default: + status = metrics.PostTimeoutHandlerOK + } + + metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceRestHandler, status) + err := fmt.Errorf("FinishRequest: post-timeout activity - time-elapsed: %s, panicked: %t, err: %v, panic-reason: %v", + time.Since(timedOutAt), r.reason != nil, r.err, r.reason) + utilruntime.HandleError(err) +} diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher_test.go b/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher_test.go index f4e667aaf6c..7da78348949 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher_test.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/handlers/finisher/finisher_test.go @@ -18,6 +18,7 @@ package finisher import ( "context" + "errors" "fmt" "net/http" "reflect" @@ -30,6 +31,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apiserver/pkg/apis/example" + + "github.com/google/go-cmp/cmp" ) func TestFinishRequest(t *testing.T) { @@ -156,3 +159,120 @@ func TestFinishRequest(t *testing.T) { }) } } + +func TestFinishRequestWithPostTimeoutTracker(t *testing.T) { + tests := []struct { + name string + object runtime.Object + postTimeoutWait time.Duration + childGoroutineNeverReturns bool + err error + reason string + }{ + { + name: "ResultFunc function returns a result after the request had timed out", + object: &example.Pod{}, + postTimeoutWait: 5 * time.Minute, + }, + { + name: "ResultFunc function returns an error after the request had timed out", + err: errors.New("my error"), + postTimeoutWait: 5 * time.Minute, + }, + { + name: "ResultFunc function panics after the request had timed out", + reason: "my panic", + postTimeoutWait: 5 * time.Minute, + }, + { + name: "ResultFunc function never returns, parent gives up after postTimeoutWait", + postTimeoutWait: 1 * time.Second, + childGoroutineNeverReturns: true, + }, + } + + expectedTimeoutErr := apierrors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout - %s", + context.DeadlineExceeded), 0) + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.TODO(), time.Millisecond) + defer cancel() + + timeoutAsDesignedCh, resultFuncDoneCh := make(chan struct{}), make(chan struct{}) + resultFn := func() (runtime.Object, error) { + defer func() { + if test.childGoroutineNeverReturns { + // sleep a bit more than test.postTimeoutWait so the + // post-timeout monitor gives up. + time.Sleep(test.postTimeoutWait + time.Second) + } + close(resultFuncDoneCh) + }() + + // it will block here + <-timeoutAsDesignedCh + + if len(test.reason) > 0 { + panic(test.reason) + } + if test.err != nil && test.object != nil { + t.Fatal("both result and err are set, wrong test setup") + } + + return test.object, test.err + } + + var resultGot *result + postTimeoutLoggerCompletedCh := make(chan struct{}) + decoratedPostTimeoutLogger := func(timedOutAt time.Time, r *result) { + defer func() { + resultGot = r + close(postTimeoutLoggerCompletedCh) + }() + + logPostTimeoutResult(timedOutAt, r) + } + + _, err := finishRequest(ctx, resultFn, test.postTimeoutWait, decoratedPostTimeoutLogger) + if err == nil || err.Error() != expectedTimeoutErr.Error() { + t.Errorf("expected timeout error: %v, but got: %v", expectedTimeoutErr, err) + } + + // the rest ResultFunc is still running, let's unblock it so it can complete + close(timeoutAsDesignedCh) + + t.Log("waiting for the ResultFunc rest function to finish") + <-resultFuncDoneCh + + t.Log("waiting for the post-timeout logger to return") + <-postTimeoutLoggerCompletedCh + + switch { + case test.childGoroutineNeverReturns && resultGot != nil: + t.Fatal("expected the result for the post-timeout logger to be nil") + case test.childGoroutineNeverReturns: + // resultGot is nil, nothing more to verify + return + case !test.childGoroutineNeverReturns && resultGot == nil: + t.Fatal("expected a result for the post-timeout logger, but got nil") + } + + if test.object != resultGot.object { + t.Errorf("expected object to match, diff: %s", cmp.Diff(test.object, resultGot.object)) + } + if test.err != resultGot.err { + t.Errorf("expected err: %v, but got: %v", test.err, resultGot.err) + } + + switch { + case len(test.reason) == 0: + if resultGot.reason != nil { + t.Errorf("unexpected panic: %v", resultGot.reason) + } + case !strings.Contains(fmt.Sprintf("%v", resultGot.reason), test.reason): + t.Errorf("expected panic to contain: %q, but got: %v", test.reason, resultGot.reason) + } + }) + } +} diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go index a278566438c..92acf6381d4 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go @@ -211,6 +211,26 @@ var ( []string{"verb", "group", "version", "resource", "subresource", "scope"}, ) + // requestPostTimeoutTotal tracks the activity of the executing request handler after the associated request + // has been timed out by the apiserver. + // source: the name of the handler that is recording this metric. Currently, we have two: + // - timeout-handler: the "executing" handler returns after the timeout filter times out the request. + // - rest-handler: the "executing" handler returns after the rest layer times out the request. + // status: whether the handler panicked or threw an error, possible values: + // - 'panic': the handler panicked + // - 'error': the handler return an error + // - 'ok': the handler returned a result (no error and no panic) + // - 'pending': the handler is still running in the background and it did not return + // within the wait threshold. + requestPostTimeoutTotal = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Name: "apiserver_request_post_timeout_total", + Help: "Tracks the activity of the request handlers after the associated requests have been timed out by the apiserver", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"source", "status"}, + ) + metrics = []resettableCollector{ deprecatedRequestGauge, requestCounter, @@ -228,6 +248,7 @@ var ( apiSelfRequestCounter, requestFilterDuration, requestAbortsTotal, + requestPostTimeoutTotal, } // these are the valid request methods which we report in our metrics. Any other request methods @@ -271,6 +292,36 @@ const ( removedReleaseAnnotationKey = "k8s.io/removed-release" ) +const ( + // The source that is recording the apiserver_request_post_timeout_total metric. + // The "executing" request handler returns after the timeout filter times out the request. + PostTimeoutSourceTimeoutHandler = "timeout-handler" + + // The source that is recording the apiserver_request_post_timeout_total metric. + // The "executing" request handler returns after the rest layer times out the request. + PostTimeoutSourceRestHandler = "rest-handler" +) + +const ( + // The executing request handler panicked after the request had + // been timed out by the apiserver. + PostTimeoutHandlerPanic = "panic" + + // The executing request handler has returned an error to the post-timeout + // receiver after the request had been timed out by the apiserver. + PostTimeoutHandlerError = "error" + + // The executing request handler has returned a result to the post-timeout + // receiver after the request had been timed out by the apiserver. + PostTimeoutHandlerOK = "ok" + + // The executing request handler has not panicked or returned any error/result to + // the post-timeout receiver yet after the request had been timed out by the apiserver. + // The post-timeout receiver gives up after waiting for certain threshold and if the + // executing request handler has not returned yet we use the following label. + PostTimeoutHandlerPending = "pending" +) + var registerMetrics sync.Once // Register all metrics. @@ -308,6 +359,10 @@ func RecordFilterLatency(ctx context.Context, name string, elapsed time.Duration requestFilterDuration.WithContext(ctx).WithLabelValues(name).Observe(elapsed.Seconds()) } +func RecordRequestPostTimeout(source string, status string) { + requestPostTimeoutTotal.WithLabelValues(source, status).Inc() +} + // RecordRequestAbort records that the request was aborted possibly due to a timeout. func RecordRequestAbort(req *http.Request, requestInfo *request.RequestInfo) { if requestInfo == nil { diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go index ccbed60dba5..69e4fd4f21c 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go @@ -24,6 +24,7 @@ import ( "net/http" "runtime" "sync" + "time" apierrors "k8s.io/apimachinery/pkg/api/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -119,15 +120,19 @@ func (t *timeoutHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // the work needs to send to it. This is defer'd to ensure it runs // ever if the post timeout work itself panics. go func() { + timedOutAt := time.Now() res := <-resultCh + + status := metrics.PostTimeoutHandlerOK if res != nil { - switch t := res.(type) { - case error: - utilruntime.HandleError(t) - default: - utilruntime.HandleError(fmt.Errorf("%v", res)) - } + // a non nil res indicates that there was a panic. + status = metrics.PostTimeoutHandlerPanic } + + metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceTimeoutHandler, status) + err := fmt.Errorf("post-timeout activity - time-elapsed: %s, %v %q result: %v", + time.Since(timedOutAt), r.Method, r.URL.Path, res) + utilruntime.HandleError(err) }() }() diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go index faf8c1ad81a..8de45d933bb 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go @@ -258,10 +258,11 @@ func TestErrConnKilled(t *testing.T) { t.Fatal("expected to receive an error") } - // we should only get one line for this, not the big stack from before capturedOutput := readStdErr() - if strings.Count(capturedOutput, "\n") != 1 { - t.Errorf("unexpected output captured actual = %v", capturedOutput) + + // We don't expect stack trace from the panic to be included in the log. + if isStackTraceLoggedByRuntime(capturedOutput) { + t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput) } if !strings.Contains(capturedOutput, `timeout or abort while handling: GET "/"`) { t.Errorf("unexpected output captured actual = %v", capturedOutput) @@ -348,10 +349,11 @@ func TestErrConnKilledHTTP2(t *testing.T) { t.Fatal("expected to receive an error") } - // we should only get one line for this, not the big stack from before capturedOutput := readStdErr() - if strings.Count(capturedOutput, "\n") != 1 { - t.Errorf("unexpected output captured actual = %v", capturedOutput) + + // We don't expect stack trace from the panic to be included in the log. + if isStackTraceLoggedByRuntime(capturedOutput) { + t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput) } if !strings.Contains(capturedOutput, `timeout or abort while handling: GET "/"`) { t.Errorf("unexpected output captured actual = %v", capturedOutput) @@ -366,6 +368,20 @@ func TestErrConnKilledHTTP2(t *testing.T) { } } +func isStackTraceLoggedByRuntime(message string) bool { + // Check the captured output for the following patterns to find out if the + // stack trace is included in the log: + // - 'Observed a panic' (apimachinery runtime.go logs panic with this message) + // - 'goroutine 44 [running]:' (stack trace always starts with this) + if strings.Contains(message, "Observed a panic") && + strings.Contains(message, "goroutine") && + strings.Contains(message, "[running]:") { + return true + } + + return false +} + var tsCrt = []byte(`-----BEGIN CERTIFICATE----- MIIDTjCCAjagAwIBAgIJAJdcQEBN2CjoMA0GCSqGSIb3DQEBCwUAMFAxCzAJBgNV BAYTAlBMMQ8wDQYDVQQIDAZQb2xhbmQxDzANBgNVBAcMBkdkYW5zazELMAkGA1UE