From 5a130d2b71e5d70cfff15087f4d521c6b68fb01e Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 20 Nov 2023 20:25:00 +0100 Subject: [PATCH] apimachinery runtime: support contextual logging In contrast to the original HandleError and HandleCrash, the new HandleErrorWithContext and HandleCrashWithContext functions properly do contextual logging, so if a problem occurs while e.g. dealing with a certain request and WithValues was used for that request, then the error log entry will also contain information about it. The output changes from unstructured to structured, which might be a breaking change for users who grep for panics. Care was taken to format panics as similar as possible to the original output. For errors, a message string gets added. There was none before, which made it impossible to find all error output coming from HandleError. Keeping HandleError and HandleCrash around without deprecating while changing the signature of callbacks is a compromise between not breaking existing code and not adding too many special cases that need to be supported. There is some code which uses PanicHandlers or ErrorHandlers, but less than code that uses the Handle* calls. In Kubernetes, we want to replace the calls. logcheck warns about them in code which is supposed to be contextual. The steps towards that are: - add TODO remarks as reminder (this commit) - locally remove " TODO(pohly): " to enable the check with `//logcheck:context`, merge fixes for linter warnings - once there are none, remove the TODO to enable the check permanently --- hack/golangci-hints.yaml | 1 + hack/golangci-strict.yaml | 1 + hack/golangci.yaml | 1 + hack/logcheck.conf | 1 + .../apimachinery/pkg/util/runtime/runtime.go | 135 ++++++++++++++---- .../pkg/util/runtime/runtime_stack_test.go | 71 +++++++++ .../pkg/util/runtime/runtime_test.go | 14 +- .../apimachinery/pkg/util/wait/wait_test.go | 4 +- .../apiserver/pkg/server/filters/timeout.go | 4 +- .../pkg/server/filters/timeout_test.go | 8 +- .../apiserver/pkg/server/filters/wrap.go | 3 +- .../apiserver/pkg/util/proxy/websocket.go | 2 +- 12 files changed, 203 insertions(+), 42 deletions(-) create mode 100644 staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_stack_test.go diff --git a/hack/golangci-hints.yaml b/hack/golangci-hints.yaml index cd98978d137..3b30ada8e02 100644 --- a/hack/golangci-hints.yaml +++ b/hack/golangci-hints.yaml @@ -127,6 +127,7 @@ linters-settings: # please keep this alphabetized # The following packages have been migrated to contextual logging. # Packages matched here do not have to be listed above because # "contextual" implies "structured". + contextual k8s.io/apimachinery/pkg/util/runtime/.* contextual k8s.io/client-go/metadata/.* contextual k8s.io/client-go/tools/events/.* contextual k8s.io/client-go/tools/record/.* diff --git a/hack/golangci-strict.yaml b/hack/golangci-strict.yaml index ca7f524889a..7d9615a1473 100644 --- a/hack/golangci-strict.yaml +++ b/hack/golangci-strict.yaml @@ -173,6 +173,7 @@ linters-settings: # please keep this alphabetized # The following packages have been migrated to contextual logging. # Packages matched here do not have to be listed above because # "contextual" implies "structured". + contextual k8s.io/apimachinery/pkg/util/runtime/.* contextual k8s.io/client-go/metadata/.* contextual k8s.io/client-go/tools/events/.* contextual k8s.io/client-go/tools/record/.* diff --git a/hack/golangci.yaml b/hack/golangci.yaml index 94de6bcf3f8..c099e20a6c8 100644 --- a/hack/golangci.yaml +++ b/hack/golangci.yaml @@ -176,6 +176,7 @@ linters-settings: # please keep this alphabetized # The following packages have been migrated to contextual logging. # Packages matched here do not have to be listed above because # "contextual" implies "structured". + contextual k8s.io/apimachinery/pkg/util/runtime/.* contextual k8s.io/client-go/metadata/.* contextual k8s.io/client-go/tools/events/.* contextual k8s.io/client-go/tools/record/.* diff --git a/hack/logcheck.conf b/hack/logcheck.conf index ffb490368af..bee1fdc1026 100644 --- a/hack/logcheck.conf +++ b/hack/logcheck.conf @@ -24,6 +24,7 @@ structured k8s.io/apiserver/pkg/server/options/encryptionconfig/.* # The following packages have been migrated to contextual logging. # Packages matched here do not have to be listed above because # "contextual" implies "structured". +contextual k8s.io/apimachinery/pkg/util/runtime/.* contextual k8s.io/client-go/metadata/.* contextual k8s.io/client-go/tools/events/.* contextual k8s.io/client-go/tools/record/.* diff --git a/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime.go b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime.go index 3674914f701..4fe0c5eb250 100644 --- a/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime.go +++ b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime.go @@ -17,6 +17,7 @@ limitations under the License. package runtime import ( + "context" "fmt" "net/http" "runtime" @@ -35,7 +36,7 @@ var ( ) // PanicHandlers is a list of functions which will be invoked when a panic happens. -var PanicHandlers = []func(interface{}){logPanic} +var PanicHandlers = []func(context.Context, interface{}){logPanic} // HandleCrash simply catches a crash and logs an error. Meant to be called via // defer. Additional context-specific handlers can be provided, and will be @@ -43,23 +44,54 @@ var PanicHandlers = []func(interface{}){logPanic} // handlers and logging the panic message. // // E.g., you can provide one or more additional handlers for something like shutting down go routines gracefully. +// +// TODO(pohly): logcheck:context // HandleCrashWithContext should be used instead of HandleCrash in code which supports contextual logging. func HandleCrash(additionalHandlers ...func(interface{})) { if r := recover(); r != nil { - for _, fn := range PanicHandlers { - fn(r) - } - for _, fn := range additionalHandlers { - fn(r) - } - if ReallyCrash { - // Actually proceed to panic. - panic(r) + additionalHandlersWithContext := make([]func(context.Context, interface{}), len(additionalHandlers)) + for i, handler := range additionalHandlers { + handler := handler // capture loop variable + additionalHandlersWithContext[i] = func(_ context.Context, r interface{}) { + handler(r) + } } + + handleCrash(context.Background(), r, additionalHandlersWithContext...) + } +} + +// HandleCrashWithContext simply catches a crash and logs an error. Meant to be called via +// defer. Additional context-specific handlers can be provided, and will be +// called in case of panic. HandleCrash actually crashes, after calling the +// handlers and logging the panic message. +// +// E.g., you can provide one or more additional handlers for something like shutting down go routines gracefully. +// +// The context is used to determine how to log. +func HandleCrashWithContext(ctx context.Context, additionalHandlers ...func(context.Context, interface{})) { + if r := recover(); r != nil { + handleCrash(ctx, r, additionalHandlers...) + } +} + +// handleCrash is the common implementation of HandleCrash and HandleCrash. +// Having those call a common implementation ensures that the stack depth +// is the same regardless through which path the handlers get invoked. +func handleCrash(ctx context.Context, r any, additionalHandlers ...func(context.Context, interface{})) { + for _, fn := range PanicHandlers { + fn(ctx, r) + } + for _, fn := range additionalHandlers { + fn(ctx, r) + } + if ReallyCrash { + // Actually proceed to panic. + panic(r) } } // logPanic logs the caller tree when a panic occurs (except in the special case of http.ErrAbortHandler). -func logPanic(r interface{}) { +func logPanic(ctx context.Context, r interface{}) { if r == http.ErrAbortHandler { // honor the http.ErrAbortHandler sentinel panic value: // ErrAbortHandler is a sentinel panic value to abort a handler. @@ -73,10 +105,20 @@ func logPanic(r interface{}) { const size = 64 << 10 stacktrace := make([]byte, size) stacktrace = stacktrace[:runtime.Stack(stacktrace, false)] + + // We don't really know how many call frames to skip because the Go + // panic handler is between us and the code where the panic occurred. + // If it's one function (as in Go 1.21), then skipping four levels + // gets us to the function which called the `defer HandleCrashWithontext(...)`. + logger := klog.FromContext(ctx).WithCallDepth(4) + + // For backwards compatibility, conversion to string + // is handled here instead of defering to the logging + // backend. if _, ok := r.(string); ok { - klog.Errorf("Observed a panic: %s\n%s", r, stacktrace) + logger.Error(nil, "Observed a panic", "panic", r, "stacktrace", string(stacktrace)) } else { - klog.Errorf("Observed a panic: %#v (%v)\n%s", r, r, stacktrace) + logger.Error(nil, "Observed a panic", "panic", fmt.Sprintf("%v", r), "panicGoValue", fmt.Sprintf("%#v", r), "stacktrace", string(stacktrace)) } } @@ -84,35 +126,76 @@ func logPanic(r interface{}) { // error occurs. // TODO(lavalamp): for testability, this and the below HandleError function // should be packaged up into a testable and reusable object. -var ErrorHandlers = []func(error){ +var ErrorHandlers = []ErrorHandler{ logError, - (&rudimentaryErrorBackoff{ - lastErrorTime: time.Now(), - // 1ms was the number folks were able to stomach as a global rate limit. - // If you need to log errors more than 1000 times a second you - // should probably consider fixing your code instead. :) - minPeriod: time.Millisecond, - }).OnError, + func(_ context.Context, _ error, _ string, _ ...interface{}) { + (&rudimentaryErrorBackoff{ + lastErrorTime: time.Now(), + // 1ms was the number folks were able to stomach as a global rate limit. + // If you need to log errors more than 1000 times a second you + // should probably consider fixing your code instead. :) + minPeriod: time.Millisecond, + }).OnError() + }, } +type ErrorHandler func(ctx context.Context, err error, msg string, keysAndValues ...interface{}) + // HandlerError is a method to invoke when a non-user facing piece of code cannot // return an error and needs to indicate it has been ignored. Invoking this method // is preferable to logging the error - the default behavior is to log but the // errors may be sent to a remote server for analysis. +// +// TODO(pohly): logcheck:context // HandleErrorWithContext should be used instead of HandleError in code which supports contextual logging. func HandleError(err error) { // this is sometimes called with a nil error. We probably shouldn't fail and should do nothing instead if err == nil { return } + handleError(context.Background(), err, "Unhandled Error") +} + +// HandlerErrorWithContext is a method to invoke when a non-user facing piece of code cannot +// return an error and needs to indicate it has been ignored. Invoking this method +// is preferable to logging the error - the default behavior is to log but the +// errors may be sent to a remote server for analysis. The context is used to +// determine how to log the error. +// +// If contextual logging is enabled, the default log output is equivalent to +// +// logr.FromContext(ctx).WithName("UnhandledError").Error(err, msg, keysAndValues...) +// +// Without contextual logging, it is equivalent to: +// +// klog.ErrorS(err, msg, keysAndValues...) +// +// In contrast to HandleError, passing nil for the error is still going to +// trigger a log entry. Don't construct a new error or wrap an error +// with fmt.Errorf. Instead, add additional information via the mssage +// and key/value pairs. +// +// This variant should be used instead of HandleError because it supports +// structured, contextual logging. +func HandleErrorWithContext(ctx context.Context, err error, msg string, keysAndValues ...interface{}) { + handleError(ctx, err, msg, keysAndValues...) +} + +// handleError is the common implementation of HandleError and HandleErrorWithContext. +// Using this common implementation ensures that the stack depth +// is the same regardless through which path the handlers get invoked. +func handleError(ctx context.Context, err error, msg string, keysAndValues ...interface{}) { for _, fn := range ErrorHandlers { - fn(err) + fn(ctx, err, msg, keysAndValues...) } } -// logError prints an error with the call stack of the location it was reported -func logError(err error) { - klog.ErrorDepth(2, err) +// logError prints an error with the call stack of the location it was reported. +// It expects to be called as -> HandleError[WithContext] -> handleError -> logError. +func logError(ctx context.Context, err error, msg string, keysAndValues ...interface{}) { + logger := klog.FromContext(ctx).WithCallDepth(3) + logger = klog.LoggerWithName(logger, "UnhandledError") + logger.Error(err, msg, keysAndValues...) //nolint:logcheck // logcheck complains about unknown key/value pairs. } type rudimentaryErrorBackoff struct { @@ -125,7 +208,7 @@ type rudimentaryErrorBackoff struct { // OnError will block if it is called more often than the embedded period time. // This will prevent overly tight hot error loops. -func (r *rudimentaryErrorBackoff) OnError(error) { +func (r *rudimentaryErrorBackoff) OnError() { now := time.Now() // start the timer before acquiring the lock r.lastErrorTimeLock.Lock() d := now.Sub(r.lastErrorTime) diff --git a/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_stack_test.go b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_stack_test.go new file mode 100644 index 00000000000..380ad5b1a53 --- /dev/null +++ b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_stack_test.go @@ -0,0 +1,71 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package runtime + +import ( + "bytes" + "context" + "errors" + "flag" + "fmt" + "regexp" + + "k8s.io/klog/v2" +) + +//nolint:logcheck // Several functions are normally not okay in a package. +func ExampleHandleErrorWithContext() { + state := klog.CaptureState() + defer state.Restore() + var fs flag.FlagSet + klog.InitFlags(&fs) + for flag, value := range map[string]string{ + "one_output": "true", + "logtostderr": "false", + } { + if err := fs.Set(flag, value); err != nil { + fmt.Printf("Unexpected error configuring klog: %v", err) + return + } + } + var buffer bytes.Buffer + klog.SetOutput(&buffer) + + logger := klog.Background() + logger = klog.LoggerWithValues(logger, "request", 42) + ctx := klog.NewContext(context.Background(), logger) + + // The line number of the next call must be at line 60. Here are some + // blank lines that can be removed to keep the line unchanged. + // + // + // + // + // + // + HandleErrorWithContext(ctx, errors.New("fake error"), "test") + + klog.Flush() + // Strip varying header. Code location should be constant and something + // that needs to be tested. + output := buffer.String() + output = regexp.MustCompile(`^.* ([^[:space:]]*.go:[[:digit:]]*)\] `).ReplaceAllString(output, `xxx $1] `) + fmt.Print(output) + + // Output: + // xxx runtime_stack_test.go:60] "test" err="fake error" logger="UnhandledError" request=42 +} diff --git a/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_test.go b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_test.go index c886b6826ff..77e6edbd4e1 100644 --- a/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_test.go +++ b/staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_test.go @@ -18,6 +18,7 @@ package runtime import ( "bytes" + "context" "fmt" "io" "net/http" @@ -43,8 +44,8 @@ func TestCustomHandleCrash(t *testing.T) { old := PanicHandlers defer func() { PanicHandlers = old }() var result interface{} - PanicHandlers = []func(interface{}){ - func(r interface{}) { + PanicHandlers = []func(context.Context, interface{}){ + func(_ context.Context, r interface{}) { result = r }, } @@ -66,8 +67,8 @@ func TestCustomHandleError(t *testing.T) { old := ErrorHandlers defer func() { ErrorHandlers = old }() var result error - ErrorHandlers = []func(error){ - func(err error) { + ErrorHandlers = []ErrorHandler{ + func(_ context.Context, err error, msg string, keysAndValues ...interface{}) { result = err }, } @@ -101,7 +102,8 @@ func TestHandleCrashLog(t *testing.T) { if len(lines) < 4 { t.Fatalf("panic log should have 1 line of message, 1 line per goroutine and 2 lines per function call") } - if match, _ := regexp.MatchString("Observed a panic: test panic", lines[0]); !match { + t.Logf("Got log output:\n%s", strings.Join(lines, "\n")) + if match, _ := regexp.MatchString(`"Observed a panic" panic="test panic"`, lines[0]); !match { t.Errorf("mismatch panic message: %s", lines[0]) } // The following regexp's verify that Kubernetes panic log matches Golang stdlib @@ -170,7 +172,7 @@ func Test_rudimentaryErrorBackoff_OnError_ParallelSleep(t *testing.T) { wg.Add(1) go func() { <-start - r.OnError(nil) // input error is ignored + r.OnError() wg.Done() }() } diff --git a/staging/src/k8s.io/apimachinery/pkg/util/wait/wait_test.go b/staging/src/k8s.io/apimachinery/pkg/util/wait/wait_test.go index c8dd0bf58b3..f05decdec41 100644 --- a/staging/src/k8s.io/apimachinery/pkg/util/wait/wait_test.go +++ b/staging/src/k8s.io/apimachinery/pkg/util/wait/wait_test.go @@ -194,8 +194,8 @@ func TestJitterUntilRecoversPanic(t *testing.T) { // Hook up a custom crash handler to ensure it is called when a jitter function panics runtime.ReallyCrash = false - runtime.PanicHandlers = []func(interface{}){ - func(p interface{}) { + runtime.PanicHandlers = []func(context.Context, interface{}){ + func(_ context.Context, p interface{}) { handled++ }, } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go index 432851a631c..e9438732213 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go @@ -137,9 +137,7 @@ func (t *timeoutHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceTimeoutHandler, status) - err := fmt.Errorf("post-timeout activity - time-elapsed: %s, %v %q result: %v", - time.Since(timedOutAt), r.Method, r.URL.Path, res) - utilruntime.HandleError(err) + utilruntime.HandleErrorWithContext(r.Context(), nil, "Post-timeout activity", "timeElapsed", time.Since(timedOutAt), "method", r.Method, "path", r.URL.Path, "result", res) }() }() httplog.SetStacktracePredicate(r.Context(), func(status int) bool { diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go index 78b4ba7a89b..ffb15926bd4 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout_test.go @@ -408,7 +408,9 @@ func TestErrConnKilled(t *testing.T) { if isStackTraceLoggedByRuntime(capturedOutput) { t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput) } - if !strings.Contains(capturedOutput, `timeout or abort while handling: method=GET URI="/" audit-ID=""`) { + // For the sake of simplicity and clarity this matches the full log line. + // This is not part of the Kubernetes API and could change. + if !strings.Contains(capturedOutput, `"Timeout or abort while handling" logger="UnhandledError" method="GET" URI="/" auditID=""`) { t.Errorf("unexpected output captured actual = %v", capturedOutput) } } @@ -501,7 +503,9 @@ func TestErrConnKilledHTTP2(t *testing.T) { if isStackTraceLoggedByRuntime(capturedOutput) { t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput) } - if !strings.Contains(capturedOutput, `timeout or abort while handling: method=GET URI="/" audit-ID=""`) { + // For the sake of simplicity and clarity this matches the full log line. + // This is not part of the Kubernetes API and could change. + if !strings.Contains(capturedOutput, `"Timeout or abort while handling" logger="UnhandledError" method="GET" URI="/" auditID=""`) { t.Errorf("unexpected output captured actual = %v", capturedOutput) } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/wrap.go b/staging/src/k8s.io/apiserver/pkg/server/filters/wrap.go index a981eae78a8..73ce2702604 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/wrap.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/wrap.go @@ -17,7 +17,6 @@ limitations under the License. package filters import ( - "fmt" "net/http" "k8s.io/apimachinery/pkg/util/runtime" @@ -51,7 +50,7 @@ func WithPanicRecovery(handler http.Handler, resolver request.RequestInfoResolve // This call can have different handlers, but the default chain rate limits. Call it after the metrics are updated // in case the rate limit delays it. If you outrun the rate for this one timed out requests, something has gone // seriously wrong with your server, but generally having a logging signal for timeouts is useful. - runtime.HandleError(fmt.Errorf("timeout or abort while handling: method=%v URI=%q audit-ID=%q", req.Method, req.RequestURI, audit.GetAuditIDTruncated(req.Context()))) + runtime.HandleErrorWithContext(req.Context(), nil, "Timeout or abort while handling", "method", req.Method, "URI", req.RequestURI, "auditID", audit.GetAuditIDTruncated(req.Context())) return } http.Error(w, "This request caused apiserver to panic. Look in the logs for details.", http.StatusInternalServerError) diff --git a/staging/src/k8s.io/apiserver/pkg/util/proxy/websocket.go b/staging/src/k8s.io/apiserver/pkg/util/proxy/websocket.go index 3b9746b3b2f..798ce176724 100644 --- a/staging/src/k8s.io/apiserver/pkg/util/proxy/websocket.go +++ b/staging/src/k8s.io/apiserver/pkg/util/proxy/websocket.go @@ -79,7 +79,7 @@ func webSocketServerStreams(req *http.Request, w http.ResponseWriter, opts Optio if p := recover(); p != nil { // Standard panic logging. for _, fn := range runtime.PanicHandlers { - fn(p) + fn(req.Context(), p) } } }()