From 4c6e7247878477a1f2efc26df7f141258010374f Mon Sep 17 00:00:00 2001 From: Han Kang Date: Wed, 2 Oct 2019 15:04:29 -0700 Subject: [PATCH 1/3] Fix double counting issue for request metrics on timeout. Currently we record request metrics during the normal request flow and we also manually invoke `Record` in the timeout handler to record timeouts. This means that we effectively double count whenever we timeout. This PR renames the `Record` function to `RecordRequestError` to more accurately reflect the intended side-effect of the function call. Change-Id: Ie37fd0c1e501bd525640a434433d364a5fd6dde2 --- .../pkg/endpoints/metrics/metrics.go | 23 +++++++++++++------ .../pkg/server/filters/maxinflight.go | 2 +- .../apiserver/pkg/server/filters/timeout.go | 2 +- .../pkg/apiserver/handler_proxy.go | 2 +- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go index 30ce24a4bd8..e092e3e55f3 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go @@ -178,7 +178,7 @@ var ( }, []string{"group", "version", "kind"}, ) - // Because of volatality of the base metric this is pre-aggregated one. Instead of reporing current usage all the time + // Because of volatility of the base metric this is pre-aggregated one. Instead of reporting current usage all the time // it reports maximal usage during the last second. currentInflightRequests = compbasemetrics.NewGaugeVec( &compbasemetrics.GaugeOpts{ @@ -188,6 +188,15 @@ var ( }, []string{"requestKind"}, ) + + requestErrorTotal = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Name: "apiserver_request_error_total", + Help: "Number of requests which have resulted in an apiserver response error", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"verb", "group", "version", "resource", "subresource", "scope", "component", "code"}, + ) kubectlExeRegexp = regexp.MustCompile(`^.*((?i:kubectl\.exe))`) metrics = []resettableCollector{ @@ -204,6 +213,7 @@ var ( WatchEvents, WatchEventsSizes, currentInflightRequests, + requestErrorTotal, } ) @@ -237,10 +247,9 @@ func UpdateInflightRequestMetrics(nonmutating, mutating int) { currentInflightRequests.WithLabelValues(MutatingKind).Set(float64(mutating)) } -// Record records a single request to the standard metrics endpoints. For use by handlers that perform their own -// processing. All API paths should use InstrumentRouteFunc implicitly. Use this instead of MonitorRequest if -// you already have a RequestInfo object. -func Record(req *http.Request, requestInfo *request.RequestInfo, component, contentType string, code int, responseSizeInBytes int, elapsed time.Duration) { +// RecordRequestError records the occurrence of a request error during apiserver handling (e.g. timeouts, +// maxinflight throttling, proxyHandler errors). +func RecordRequestError(req *http.Request, requestInfo *request.RequestInfo, component string, code int) { if requestInfo == nil { requestInfo = &request.RequestInfo{Verb: req.Method, Path: req.URL.Path} } @@ -252,9 +261,9 @@ func Record(req *http.Request, requestInfo *request.RequestInfo, component, cont // However, we need to tweak it e.g. to differentiate GET from LIST. verb := canonicalVerb(strings.ToUpper(req.Method), scope) if requestInfo.IsResourceRequest { - MonitorRequest(req, verb, requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, contentType, code, responseSizeInBytes, elapsed) + requestErrorTotal.WithLabelValues(cleanVerb(verb, req), requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, codeToString(code)).Inc() } else { - MonitorRequest(req, verb, "", "", "", requestInfo.Path, scope, component, contentType, code, responseSizeInBytes, elapsed) + requestErrorTotal.WithLabelValues(cleanVerb(verb, req), "", "", "", requestInfo.Path, scope, component, codeToString(code)).Inc() } } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go b/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go index f7bc691c762..d77676b0a02 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go @@ -178,7 +178,7 @@ func WithMaxInFlightLimit( } } } - metrics.Record(r, requestInfo, metrics.APIServerComponent, "", http.StatusTooManyRequests, 0, 0) + metrics.RecordRequestError(r, requestInfo, metrics.APIServerComponent, http.StatusTooManyRequests) tooManyRequests(r, w) } } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go index 23ac0034d8d..251597584c8 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go @@ -59,7 +59,7 @@ func WithTimeoutForNonLongRunningRequests(handler http.Handler, longRunning apir postTimeoutFn := func() { cancel() - metrics.Record(req, requestInfo, metrics.APIServerComponent, "", http.StatusGatewayTimeout, 0, 0) + metrics.RecordRequestError(req, requestInfo, metrics.APIServerComponent, http.StatusGatewayTimeout) } return req, time.After(timeout), postTimeoutFn, apierrors.NewTimeoutError(fmt.Sprintf("request did not complete within %s", timeout), 0) } diff --git a/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go b/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go index bfbe4c0955d..c49658cd0e2 100644 --- a/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go +++ b/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go @@ -98,7 +98,7 @@ func proxyError(w http.ResponseWriter, req *http.Request, error string, code int return } // TODO: record long-running request differently? The long-running check func does not necessarily match the one of the aggregated apiserver - endpointmetrics.Record(req, info, aggregatorComponent, "", code, 0, 0) + endpointmetrics.RecordRequestError(req, info, aggregatorComponent, code) } func (r *proxyHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { From 5e652fe12605027509870d5d7f821390f20b1cee Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 8 Oct 2019 15:49:12 -0700 Subject: [PATCH 2/3] pluralize error metric name Change-Id: I12eb94f41ded20ed5a16332ada13a7b34f75de18 --- .../apiserver/pkg/endpoints/metrics/metrics.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go index e092e3e55f3..192d991bd33 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go @@ -189,10 +189,10 @@ var ( []string{"requestKind"}, ) - requestErrorTotal = compbasemetrics.NewCounterVec( + requestErrorsTotal = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ - Name: "apiserver_request_error_total", - Help: "Number of requests which have resulted in an apiserver response error", + Name: "apiserver_request_errors_total", + Help: "Number of requests which have resulted in an apiserver response error.", StabilityLevel: compbasemetrics.ALPHA, }, []string{"verb", "group", "version", "resource", "subresource", "scope", "component", "code"}, @@ -213,7 +213,7 @@ var ( WatchEvents, WatchEventsSizes, currentInflightRequests, - requestErrorTotal, + requestErrorsTotal, } ) @@ -261,9 +261,9 @@ func RecordRequestError(req *http.Request, requestInfo *request.RequestInfo, com // However, we need to tweak it e.g. to differentiate GET from LIST. verb := canonicalVerb(strings.ToUpper(req.Method), scope) if requestInfo.IsResourceRequest { - requestErrorTotal.WithLabelValues(cleanVerb(verb, req), requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, codeToString(code)).Inc() + requestErrorsTotal.WithLabelValues(cleanVerb(verb, req), requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, codeToString(code)).Inc() } else { - requestErrorTotal.WithLabelValues(cleanVerb(verb, req), "", "", "", requestInfo.Path, scope, component, codeToString(code)).Inc() + requestErrorsTotal.WithLabelValues(cleanVerb(verb, req), "", "", "", requestInfo.Path, scope, component, codeToString(code)).Inc() } } From f8f1def5f1d92a588f48ebc01293e142f8dd63fd Mon Sep 17 00:00:00 2001 From: Han Kang Date: Mon, 21 Oct 2019 13:26:31 -0700 Subject: [PATCH 3/3] rename metric for apiserver request terminations and reword corresponding documentation Change-Id: I47a9c7b10614afe85bb652fa61984f91848d6d65 --- .../pkg/endpoints/metrics/metrics.go | 20 ++++++++++--------- .../pkg/server/filters/maxinflight.go | 2 +- .../apiserver/pkg/server/filters/timeout.go | 2 +- .../pkg/apiserver/handler_proxy.go | 2 +- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go index 192d991bd33..dabd56edd23 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go @@ -189,10 +189,10 @@ var ( []string{"requestKind"}, ) - requestErrorsTotal = compbasemetrics.NewCounterVec( + requestTerminationsTotal = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ - Name: "apiserver_request_errors_total", - Help: "Number of requests which have resulted in an apiserver response error.", + Name: "apiserver_request_terminations_total", + Help: "Number of requests which apiserver terminated in self-defense.", StabilityLevel: compbasemetrics.ALPHA, }, []string{"verb", "group", "version", "resource", "subresource", "scope", "component", "code"}, @@ -213,7 +213,7 @@ var ( WatchEvents, WatchEventsSizes, currentInflightRequests, - requestErrorsTotal, + requestTerminationsTotal, } ) @@ -247,9 +247,11 @@ func UpdateInflightRequestMetrics(nonmutating, mutating int) { currentInflightRequests.WithLabelValues(MutatingKind).Set(float64(mutating)) } -// RecordRequestError records the occurrence of a request error during apiserver handling (e.g. timeouts, -// maxinflight throttling, proxyHandler errors). -func RecordRequestError(req *http.Request, requestInfo *request.RequestInfo, component string, code int) { +// RecordRequestTermination records that the request was terminated early as part of a resource +// preservation or apiserver self-defense mechanism (e.g. timeouts, maxinflight throttling, +// proxyHandler errors). RecordRequestTermination should only be called zero or one times +// per request. +func RecordRequestTermination(req *http.Request, requestInfo *request.RequestInfo, component string, code int) { if requestInfo == nil { requestInfo = &request.RequestInfo{Verb: req.Method, Path: req.URL.Path} } @@ -261,9 +263,9 @@ func RecordRequestError(req *http.Request, requestInfo *request.RequestInfo, com // However, we need to tweak it e.g. to differentiate GET from LIST. verb := canonicalVerb(strings.ToUpper(req.Method), scope) if requestInfo.IsResourceRequest { - requestErrorsTotal.WithLabelValues(cleanVerb(verb, req), requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, codeToString(code)).Inc() + requestTerminationsTotal.WithLabelValues(cleanVerb(verb, req), requestInfo.APIGroup, requestInfo.APIVersion, requestInfo.Resource, requestInfo.Subresource, scope, component, codeToString(code)).Inc() } else { - requestErrorsTotal.WithLabelValues(cleanVerb(verb, req), "", "", "", requestInfo.Path, scope, component, codeToString(code)).Inc() + requestTerminationsTotal.WithLabelValues(cleanVerb(verb, req), "", "", "", requestInfo.Path, scope, component, codeToString(code)).Inc() } } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go b/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go index d77676b0a02..099e23e14fb 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go @@ -178,7 +178,7 @@ func WithMaxInFlightLimit( } } } - metrics.RecordRequestError(r, requestInfo, metrics.APIServerComponent, http.StatusTooManyRequests) + metrics.RecordRequestTermination(r, requestInfo, metrics.APIServerComponent, http.StatusTooManyRequests) tooManyRequests(r, w) } } diff --git a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go index 251597584c8..9b8d6d4b138 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go +++ b/staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go @@ -59,7 +59,7 @@ func WithTimeoutForNonLongRunningRequests(handler http.Handler, longRunning apir postTimeoutFn := func() { cancel() - metrics.RecordRequestError(req, requestInfo, metrics.APIServerComponent, http.StatusGatewayTimeout) + metrics.RecordRequestTermination(req, requestInfo, metrics.APIServerComponent, http.StatusGatewayTimeout) } return req, time.After(timeout), postTimeoutFn, apierrors.NewTimeoutError(fmt.Sprintf("request did not complete within %s", timeout), 0) } diff --git a/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go b/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go index c49658cd0e2..9f876b227f1 100644 --- a/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go +++ b/staging/src/k8s.io/kube-aggregator/pkg/apiserver/handler_proxy.go @@ -98,7 +98,7 @@ func proxyError(w http.ResponseWriter, req *http.Request, error string, code int return } // TODO: record long-running request differently? The long-running check func does not necessarily match the one of the aggregated apiserver - endpointmetrics.RecordRequestError(req, info, aggregatorComponent, code) + endpointmetrics.RecordRequestTermination(req, info, aggregatorComponent, code) } func (r *proxyHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {