add more context to post timeout request activities

Abu Kashem 2021-04-09 12:09:35 -04:00
parent a55bd63172
commit 66587afb60
5 changed files with 278 additions and 15 deletions

View File

@@ -21,10 +21,14 @@ import (
"fmt"
"net/http"
goruntime "runtime"
"time"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apiserver/pkg/endpoints/metrics"
"k8s.io/klog/v2"
)
// ResultFunc is a function that returns a rest result and can be run in a goroutine
@@ -62,10 +66,27 @@ func (r *result) Return() (runtime.Object, error) {
}
}
// PostTimeoutLoggerFunc is a function that can be used to log the result returned
// by a ResultFunc after the request had timed out.
// timedOutAt is the time the request had been timed out.
// r is the result returned by the child goroutine.
type PostTimeoutLoggerFunc func(timedOutAt time.Time, r *result)
const (
// how much time the post-timeout receiver goroutine will wait for the sender
// (child goroutine executing ResultFunc) to send a result after the request
// had timed out.
postTimeoutLoggerWait = 5 * time.Minute
)
// FinishRequest makes a given ResultFunc asynchronous and handles errors returned by the response.
func FinishRequest(ctx context.Context, fn ResultFunc) (runtime.Object, error) {
// the channel needs to be buffered to prevent the goroutine below from hanging indefinitely
// when the select statement reads something other than the one the goroutine sends on.
return finishRequest(ctx, fn, postTimeoutLoggerWait, logPostTimeoutResult)
}
func finishRequest(ctx context.Context, fn ResultFunc, postTimeoutWait time.Duration, postTimeoutLogger PostTimeoutLoggerFunc) (runtime.Object, error) {
// the channel needs to be buffered since the post-timeout receiver goroutine
// waits up to 5 minutes for the child goroutine to return.
resultCh := make(chan *result, 1)
go func() {
@@ -104,6 +125,52 @@ func FinishRequest(ctx context.Context, fn ResultFunc) (runtime.Object, error) {
case result := <-resultCh:
return result.Return()
case <-ctx.Done():
return nil, errors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout %s", ctx.Err()), 0)
// we are going to send a timeout response to the caller, but the asynchronous goroutine
// (sender) is still executing the ResultFunc function.
// kick off a goroutine (receiver) here to wait for the sender (goroutine executing ResultFunc)
// to send the result and then log details of the result.
defer func() {
go func() {
timedOutAt := time.Now()
var result *result
select {
case result = <-resultCh:
case <-time.After(postTimeoutWait):
// we will not wait forever; if we are here, then we know the sender
// goroutine is taking longer than postTimeoutWait.
}
postTimeoutLogger(timedOutAt, result)
}()
}()
return nil, errors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout - %s", ctx.Err()), 0)
}
}
// logPostTimeoutResult logs a panic or an error from the result that the sender (goroutine that is
// executing the ResultFunc function) has sent to the receiver after the request had timed out.
// timedOutAt is the time the request had been timed out.
func logPostTimeoutResult(timedOutAt time.Time, r *result) {
if r == nil {
// we are using r == nil to indicate that the child goroutine never returned a result.
metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceRestHandler, metrics.PostTimeoutHandlerPending)
klog.Errorf("FinishRequest: post-timeout activity, waited for %s, child goroutine has not returned yet", time.Since(timedOutAt))
return
}
var status string
switch {
case r.reason != nil:
// a non-nil reason inside a result object indicates that there was a panic.
status = metrics.PostTimeoutHandlerPanic
case r.err != nil:
status = metrics.PostTimeoutHandlerError
default:
status = metrics.PostTimeoutHandlerOK
}
metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceRestHandler, status)
err := fmt.Errorf("FinishRequest: post-timeout activity - time-elapsed: %s, panicked: %t, err: %v, panic-reason: %v",
time.Since(timedOutAt), r.reason != nil, r.err, r.reason)
utilruntime.HandleError(err)
}
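
The post-timeout receiver logic is easier to see outside the diff. Below is a minimal, self-contained sketch of the same pattern; run, work and the durations are illustrative names only and are not part of this change. The real code additionally wraps the receiver in a defer and hands the late result to a pluggable PostTimeoutLoggerFunc.

package main

import (
	"context"
	"fmt"
	"time"
)

// run is a stripped-down version of the finishRequest pattern: the result channel is
// buffered so the sender can always complete its single send, and on timeout a
// post-timeout receiver keeps listening (bounded by postTimeoutWait) so the late
// result can still be observed and logged.
func run(ctx context.Context, work func() (string, error), postTimeoutWait time.Duration) (string, error) {
	resultCh := make(chan string, 1) // buffered: the sender never blocks

	go func() {
		out, err := work()
		if err != nil {
			resultCh <- "error: " + err.Error()
			return
		}
		resultCh <- out
	}()

	select {
	case r := <-resultCh:
		return r, nil
	case <-ctx.Done():
		// the caller gets a timeout error, but a post-timeout receiver waits
		// (up to postTimeoutWait) for the late sender and logs what it did.
		go func() {
			timedOutAt := time.Now()
			select {
			case r := <-resultCh:
				fmt.Printf("post-timeout result after %s: %q\n", time.Since(timedOutAt), r)
			case <-time.After(postTimeoutWait):
				fmt.Printf("sender still pending after %s\n", time.Since(timedOutAt))
			}
		}()
		return "", fmt.Errorf("request did not complete within requested timeout - %s", ctx.Err())
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()

	_, err := run(ctx, func() (string, error) {
		time.Sleep(50 * time.Millisecond) // outlives the request timeout
		return "late result", nil
	}, time.Second)
	fmt.Println("caller saw:", err)

	time.Sleep(100 * time.Millisecond) // give the post-timeout receiver time to print
}

The buffered channel is the key design choice: the sender can complete its single send even after the caller has already been answered with a timeout error, which is what makes the post-timeout observation possible at all.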

View File

@@ -18,6 +18,7 @@ package finisher
import (
"context"
"errors"
"fmt"
"net/http"
"reflect"
@@ -30,6 +31,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/apis/example"
"github.com/google/go-cmp/cmp"
)
func TestFinishRequest(t *testing.T) {
@@ -156,3 +159,120 @@ func TestFinishRequest(t *testing.T) {
})
}
}
func TestFinishRequestWithPostTimeoutTracker(t *testing.T) {
tests := []struct {
name string
object runtime.Object
postTimeoutWait time.Duration
childGoroutineNeverReturns bool
err error
reason string
}{
{
name: "ResultFunc function returns a result after the request had timed out",
object: &example.Pod{},
postTimeoutWait: 5 * time.Minute,
},
{
name: "ResultFunc function returns an error after the request had timed out",
err: errors.New("my error"),
postTimeoutWait: 5 * time.Minute,
},
{
name: "ResultFunc function panics after the request had timed out",
reason: "my panic",
postTimeoutWait: 5 * time.Minute,
},
{
name: "ResultFunc function never returns, parent gives up after postTimeoutWait",
postTimeoutWait: 1 * time.Second,
childGoroutineNeverReturns: true,
},
}
expectedTimeoutErr := apierrors.NewTimeoutError(fmt.Sprintf("request did not complete within requested timeout - %s",
context.DeadlineExceeded), 0)
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.TODO(), time.Millisecond)
defer cancel()
timeoutAsDesignedCh, resultFuncDoneCh := make(chan struct{}), make(chan struct{})
resultFn := func() (runtime.Object, error) {
defer func() {
if test.childGoroutineNeverReturns {
// sleep a bit more than test.postTimeoutWait so the
// post-timeout monitor gives up.
time.Sleep(test.postTimeoutWait + time.Second)
}
close(resultFuncDoneCh)
}()
// it will block here
<-timeoutAsDesignedCh
if len(test.reason) > 0 {
panic(test.reason)
}
if test.err != nil && test.object != nil {
t.Fatal("both result and err are set, wrong test setup")
}
return test.object, test.err
}
var resultGot *result
postTimeoutLoggerCompletedCh := make(chan struct{})
decoratedPostTimeoutLogger := func(timedOutAt time.Time, r *result) {
defer func() {
resultGot = r
close(postTimeoutLoggerCompletedCh)
}()
logPostTimeoutResult(timedOutAt, r)
}
_, err := finishRequest(ctx, resultFn, test.postTimeoutWait, decoratedPostTimeoutLogger)
if err == nil || err.Error() != expectedTimeoutErr.Error() {
t.Errorf("expected timeout error: %v, but got: %v", expectedTimeoutErr, err)
}
// the rest ResultFunc is still running, let's unblock it so it can complete
close(timeoutAsDesignedCh)
t.Log("waiting for the ResultFunc rest function to finish")
<-resultFuncDoneCh
t.Log("waiting for the post-timeout logger to return")
<-postTimeoutLoggerCompletedCh
switch {
case test.childGoroutineNeverReturns && resultGot != nil:
t.Fatal("expected the result for the post-timeout logger to be nil")
case test.childGoroutineNeverReturns:
// resultGot is nil, nothing more to verify
return
case !test.childGoroutineNeverReturns && resultGot == nil:
t.Fatal("expected a result for the post-timeout logger, but got nil")
}
if test.object != resultGot.object {
t.Errorf("expected object to match, diff: %s", cmp.Diff(test.object, resultGot.object))
}
if test.err != resultGot.err {
t.Errorf("expected err: %v, but got: %v", test.err, resultGot.err)
}
switch {
case len(test.reason) == 0:
if resultGot.reason != nil {
t.Errorf("unexpected panic: %v", resultGot.reason)
}
case !strings.Contains(fmt.Sprintf("%v", resultGot.reason), test.reason):
t.Errorf("expected panic to contain: %q, but got: %v", test.reason, resultGot.reason)
}
})
}
}

View File

@@ -211,6 +211,26 @@ var (
[]string{"verb", "group", "version", "resource", "subresource", "scope"},
)
// requestPostTimeoutTotal tracks the activity of the executing request handler after the associated request
// has been timed out by the apiserver.
// source: the name of the handler that is recording this metric. Currently, we have two:
// - timeout-handler: the "executing" handler returns after the timeout filter times out the request.
// - rest-handler: the "executing" handler returns after the rest layer times out the request.
// status: whether the handler panicked or threw an error, possible values:
// - 'panic': the handler panicked
// - 'error': the handler returned an error
// - 'ok': the handler returned a result (no error and no panic)
// - 'pending': the handler is still running in the background and it did not return
// within the wait threshold.
requestPostTimeoutTotal = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_request_post_timeout_total",
Help: "Tracks the activity of the request handlers after the associated requests have been timed out by the apiserver",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"source", "status"},
)
metrics = []resettableCollector{
deprecatedRequestGauge,
requestCounter,
@@ -228,6 +248,7 @@ var (
apiSelfRequestCounter,
requestFilterDuration,
requestAbortsTotal,
requestPostTimeoutTotal,
}
// these are the valid request methods which we report in our metrics. Any other request methods
@@ -271,6 +292,36 @@ const (
removedReleaseAnnotationKey = "k8s.io/removed-release"
)
const (
// The source that is recording the apiserver_request_post_timeout_total metric.
// The "executing" request handler returns after the timeout filter times out the request.
PostTimeoutSourceTimeoutHandler = "timeout-handler"
// The source that is recording the apiserver_request_post_timeout_total metric.
// The "executing" request handler returns after the rest layer times out the request.
PostTimeoutSourceRestHandler = "rest-handler"
)
const (
// The executing request handler panicked after the request had
// been timed out by the apiserver.
PostTimeoutHandlerPanic = "panic"
// The executing request handler has returned an error to the post-timeout
// receiver after the request had been timed out by the apiserver.
PostTimeoutHandlerError = "error"
// The executing request handler has returned a result to the post-timeout
// receiver after the request had been timed out by the apiserver.
PostTimeoutHandlerOK = "ok"
// The executing request handler has not panicked or returned any error/result to
// the post-timeout receiver yet after the request had been timed out by the apiserver.
// The post-timeout receiver gives up after waiting for a certain threshold; if the
// executing request handler still has not returned by then, we use this label.
PostTimeoutHandlerPending = "pending"
)
var registerMetrics sync.Once
// Register all metrics.
@@ -308,6 +359,10 @@ func RecordFilterLatency(ctx context.Context, name string, elapsed time.Duration
requestFilterDuration.WithContext(ctx).WithLabelValues(name).Observe(elapsed.Seconds())
}
// RecordRequestPostTimeout records a post-timeout request handler activity for the given source and status.
func RecordRequestPostTimeout(source string, status string) {
requestPostTimeoutTotal.WithLabelValues(source, status).Inc()
}
// RecordRequestAbort records that the request was aborted possibly due to a timeout.
func RecordRequestAbort(req *http.Request, requestInfo *request.RequestInfo) {
if requestInfo == nil {
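
For reference, the new counter defined above would surface in the /metrics exposition roughly as follows; the sample values are made up, and each series is one source/status combination recorded via RecordRequestPostTimeout:

# HELP apiserver_request_post_timeout_total Tracks the activity of the request handlers after the associated requests have been timed out by the apiserver
# TYPE apiserver_request_post_timeout_total counter
apiserver_request_post_timeout_total{source="rest-handler",status="pending"} 1
apiserver_request_post_timeout_total{source="timeout-handler",status="panic"} 2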

View File

@@ -24,6 +24,7 @@ import (
"net/http"
"runtime"
"sync"
"time"
apierrors "k8s.io/apimachinery/pkg/api/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -119,15 +120,19 @@ func (t *timeoutHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// the work needs to send to it. This is defer'd to ensure it runs
// even if the post timeout work itself panics.
go func() {
timedOutAt := time.Now()
res := <-resultCh
status := metrics.PostTimeoutHandlerOK
if res != nil {
switch t := res.(type) {
case error:
utilruntime.HandleError(t)
default:
utilruntime.HandleError(fmt.Errorf("%v", res))
}
// a non-nil res indicates that there was a panic.
status = metrics.PostTimeoutHandlerPanic
}
metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceTimeoutHandler, status)
err := fmt.Errorf("post-timeout activity - time-elapsed: %s, %v %q result: %v",
time.Since(timedOutAt), r.Method, r.URL.Path, res)
utilruntime.HandleError(err)
}()
}()

View File

@@ -258,10 +258,11 @@ func TestErrConnKilled(t *testing.T) {
t.Fatal("expected to receive an error")
}
// we should only get one line for this, not the big stack from before
capturedOutput := readStdErr()
if strings.Count(capturedOutput, "\n") != 1 {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
// We don't expect stack trace from the panic to be included in the log.
if isStackTraceLoggedByRuntime(capturedOutput) {
t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput)
}
if !strings.Contains(capturedOutput, `timeout or abort while handling: GET "/"`) {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
@@ -348,10 +349,11 @@ func TestErrConnKilledHTTP2(t *testing.T) {
t.Fatal("expected to receive an error")
}
// we should only get one line for this, not the big stack from before
capturedOutput := readStdErr()
if strings.Count(capturedOutput, "\n") != 1 {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
// We don't expect stack trace from the panic to be included in the log.
if isStackTraceLoggedByRuntime(capturedOutput) {
t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput)
}
if !strings.Contains(capturedOutput, `timeout or abort while handling: GET "/"`) {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
@@ -366,6 +368,20 @@ func TestErrConnKilledHTTP2(t *testing.T) {
}
}
func isStackTraceLoggedByRuntime(message string) bool {
// Check the captured output for the following patterns to find out if the
// stack trace is included in the log:
// - 'Observed a panic' (apimachinery runtime.go logs panic with this message)
// - 'goroutine 44 [running]:' (stack trace always starts with this)
if strings.Contains(message, "Observed a panic") &&
strings.Contains(message, "goroutine") &&
strings.Contains(message, "[running]:") {
return true
}
return false
}
var tsCrt = []byte(`-----BEGIN CERTIFICATE-----
MIIDTjCCAjagAwIBAgIJAJdcQEBN2CjoMA0GCSqGSIb3DQEBCwUAMFAxCzAJBgNV
BAYTAlBMMQ8wDQYDVQQIDAZQb2xhbmQxDzANBgNVBAcMBkdkYW5zazELMAkGA1UE