mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-19 09:52:49 +00:00
Merge pull request #108396 from tkashem/retry-metric
client-go: add a metric to count request retries
This commit is contained in:
commit
2ab3151936
@ -726,7 +726,6 @@ func (r *Request) Watch(ctx context.Context) (watch.Interface, error) {
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
updateURLMetrics(ctx, r, resp, err)
|
||||
retry.After(ctx, r, resp, err)
|
||||
if err == nil && resp.StatusCode == http.StatusOK {
|
||||
return r.newStreamWatcher(resp)
|
||||
@ -786,22 +785,36 @@ func (r *Request) newStreamWatcher(resp *http.Response) (watch.Interface, error)
|
||||
), nil
|
||||
}
|
||||
|
||||
// updateURLMetrics is a convenience function for pushing metrics.
|
||||
// It also handles corner cases for incomplete/invalid request data.
|
||||
func updateURLMetrics(ctx context.Context, req *Request, resp *http.Response, err error) {
|
||||
url := "none"
|
||||
// updateRequestResultMetric increments the RequestResult metric counter,
|
||||
// it should be called with the (response, err) tuple from the final
|
||||
// reply from the server.
|
||||
func updateRequestResultMetric(ctx context.Context, req *Request, resp *http.Response, err error) {
|
||||
code, host := sanitize(req, resp, err)
|
||||
metrics.RequestResult.Increment(ctx, code, req.verb, host)
|
||||
}
|
||||
|
||||
// updateRequestRetryMetric increments the RequestRetry metric counter,
|
||||
// it should be called with the (response, err) tuple for each retry
|
||||
// except for the final attempt.
|
||||
func updateRequestRetryMetric(ctx context.Context, req *Request, resp *http.Response, err error) {
|
||||
code, host := sanitize(req, resp, err)
|
||||
metrics.RequestRetry.IncrementRetry(ctx, code, req.verb, host)
|
||||
}
|
||||
|
||||
func sanitize(req *Request, resp *http.Response, err error) (string, string) {
|
||||
host := "none"
|
||||
if req.c.base != nil {
|
||||
url = req.c.base.Host
|
||||
host = req.c.base.Host
|
||||
}
|
||||
|
||||
// Errors can be arbitrary strings. Unbound label cardinality is not suitable for a metric
|
||||
// system so we just report them as `<error>`.
|
||||
if err != nil {
|
||||
metrics.RequestResult.Increment(ctx, "<error>", req.verb, url)
|
||||
} else {
|
||||
// Metrics for failure codes
|
||||
metrics.RequestResult.Increment(ctx, strconv.Itoa(resp.StatusCode), req.verb, url)
|
||||
code := "<error>"
|
||||
if resp != nil {
|
||||
code = strconv.Itoa(resp.StatusCode)
|
||||
}
|
||||
|
||||
return code, host
|
||||
}
|
||||
|
||||
// Stream formats and executes the request, and offers streaming of the response.
|
||||
@ -834,7 +847,6 @@ func (r *Request) Stream(ctx context.Context) (io.ReadCloser, error) {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
updateURLMetrics(ctx, r, resp, err)
|
||||
retry.After(ctx, r, resp, err)
|
||||
if err != nil {
|
||||
// we only retry on an HTTP response with 'Retry-After' header
|
||||
@ -979,7 +991,6 @@ func (r *Request) request(ctx context.Context, fn func(*http.Request, *http.Resp
|
||||
return err
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
updateURLMetrics(ctx, r, resp, err)
|
||||
// The value -1 or a value of 0 with a non-nil Body indicates that the length is unknown.
|
||||
// https://pkg.go.dev/net/http#Request
|
||||
if req.ContentLength >= 0 && !(req.Body != nil && req.ContentLength == 0) {
|
||||
|
@ -2991,6 +2991,7 @@ type withRateLimiterBackoffManagerAndMetrics struct {
|
||||
metrics.ResultMetric
|
||||
calculateBackoffSeq int64
|
||||
calculateBackoffFn func(i int64) time.Duration
|
||||
metrics.RetryMetric
|
||||
|
||||
invokeOrderGot []string
|
||||
sleepsGot []string
|
||||
@ -3027,6 +3028,14 @@ func (lb *withRateLimiterBackoffManagerAndMetrics) Increment(ctx context.Context
|
||||
}
|
||||
}
|
||||
|
||||
func (lb *withRateLimiterBackoffManagerAndMetrics) IncrementRetry(ctx context.Context, code, _, _ string) {
|
||||
// we are interested in the request context that is marked by this test
|
||||
if marked, ok := ctx.Value(retryTestKey).(bool); ok && marked {
|
||||
lb.invokeOrderGot = append(lb.invokeOrderGot, "RequestRetry.IncrementRetry")
|
||||
lb.statusCodesGot = append(lb.statusCodesGot, code)
|
||||
}
|
||||
}
|
||||
|
||||
func (lb *withRateLimiterBackoffManagerAndMetrics) Do() {
|
||||
lb.invokeOrderGot = append(lb.invokeOrderGot, "Client.Do")
|
||||
}
|
||||
@ -3072,13 +3081,17 @@ func testRetryWithRateLimiterBackoffAndMetrics(t *testing.T, key string, doFunc
|
||||
"Client.Do",
|
||||
|
||||
// it's a success, so do the following:
|
||||
// - call metrics and update backoff parameters
|
||||
// count the result metric, and since it's a retry,
|
||||
// count the retry metric, and then update backoff parameters.
|
||||
"RequestResult.Increment",
|
||||
"RequestRetry.IncrementRetry",
|
||||
"BackoffManager.UpdateBackoff",
|
||||
}
|
||||
statusCodesWant := []string{
|
||||
// first attempt (A): we count the result metric only
|
||||
"500",
|
||||
"200",
|
||||
// final attempt (B): we count the result metric, and the retry metric
|
||||
"200", "200",
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
@ -3192,10 +3205,13 @@ func testRetryWithRateLimiterBackoffAndMetrics(t *testing.T, key string, doFunc
|
||||
// to override as well, and we want tests to be able to run in
|
||||
// parallel then we will need to provide a way for tests to
|
||||
// register/deregister their own metric inerfaces.
|
||||
old := metrics.RequestResult
|
||||
oldRequestResult := metrics.RequestResult
|
||||
oldRequestRetry := metrics.RequestRetry
|
||||
metrics.RequestResult = interceptor
|
||||
metrics.RequestRetry = interceptor
|
||||
defer func() {
|
||||
metrics.RequestResult = old
|
||||
metrics.RequestResult = oldRequestResult
|
||||
metrics.RequestRetry = oldRequestRetry
|
||||
}()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
@ -242,8 +242,20 @@ func (r *withRetry) After(ctx context.Context, request *Request, resp *http.Resp
|
||||
// parameters calculated from the (response, err) tuple from
|
||||
// attempt N-1, so r.retryAfter is outdated and should not be
|
||||
// referred to here.
|
||||
isRetry := r.retryAfter != nil
|
||||
r.retryAfter = nil
|
||||
|
||||
// the client finishes a single request after N attempts (1..N)
|
||||
// - all attempts (1..N) are counted to the rest_client_requests_total
|
||||
// metric (current behavior).
|
||||
// - every attempt after the first (2..N) are counted to the
|
||||
// rest_client_request_retries_total metric.
|
||||
updateRequestResultMetric(ctx, request, resp, err)
|
||||
if isRetry {
|
||||
// this is attempt 2 or later
|
||||
updateRequestRetryMetric(ctx, request, resp, err)
|
||||
}
|
||||
|
||||
if request.c.base != nil {
|
||||
if err != nil {
|
||||
request.backoff.UpdateBackoff(request.URL(), err, 0)
|
||||
|
@ -58,6 +58,12 @@ type CallsMetric interface {
|
||||
Increment(exitCode int, callStatus string)
|
||||
}
|
||||
|
||||
// RetryMetric counts the number of retries sent to the server
|
||||
// partitioned by code, method, and host.
|
||||
type RetryMetric interface {
|
||||
IncrementRetry(ctx context.Context, code string, method string, host string)
|
||||
}
|
||||
|
||||
var (
|
||||
// ClientCertExpiry is the expiry time of a client certificate
|
||||
ClientCertExpiry ExpiryMetric = noopExpiry{}
|
||||
@ -76,6 +82,9 @@ var (
|
||||
// ExecPluginCalls is the number of calls made to an exec plugin, partitioned by
|
||||
// exit code and call status.
|
||||
ExecPluginCalls CallsMetric = noopCalls{}
|
||||
// RequestRetry is the retry metric that tracks the number of
|
||||
// retries sent to the server.
|
||||
RequestRetry RetryMetric = noopRetry{}
|
||||
)
|
||||
|
||||
// RegisterOpts contains all the metrics to register. Metrics may be nil.
|
||||
@ -88,6 +97,7 @@ type RegisterOpts struct {
|
||||
RateLimiterLatency LatencyMetric
|
||||
RequestResult ResultMetric
|
||||
ExecPluginCalls CallsMetric
|
||||
RequestRetry RetryMetric
|
||||
}
|
||||
|
||||
// Register registers metrics for the rest client to use. This can
|
||||
@ -118,6 +128,9 @@ func Register(opts RegisterOpts) {
|
||||
if opts.ExecPluginCalls != nil {
|
||||
ExecPluginCalls = opts.ExecPluginCalls
|
||||
}
|
||||
if opts.RequestRetry != nil {
|
||||
RequestRetry = opts.RequestRetry
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@ -144,3 +157,7 @@ func (noopResult) Increment(context.Context, string, string, string) {}
|
||||
type noopCalls struct{}
|
||||
|
||||
func (noopCalls) Increment(int, string) {}
|
||||
|
||||
type noopRetry struct{}
|
||||
|
||||
func (noopRetry) IncrementRetry(context.Context, string, string, string) {}
|
||||
|
@ -82,6 +82,15 @@ var (
|
||||
[]string{"code", "verb", "host"},
|
||||
)
|
||||
|
||||
requestRetry = k8smetrics.NewCounterVec(
|
||||
&k8smetrics.CounterOpts{
|
||||
Name: "rest_client_request_retries_total",
|
||||
StabilityLevel: k8smetrics.ALPHA,
|
||||
Help: "Number of request retries, partitioned by status code, verb, and host.",
|
||||
},
|
||||
[]string{"code", "verb", "host"},
|
||||
)
|
||||
|
||||
execPluginCertTTLAdapter = &expiryToTTLAdapter{}
|
||||
|
||||
execPluginCertTTL = k8smetrics.NewGaugeFunc(
|
||||
@ -152,6 +161,7 @@ func init() {
|
||||
legacyregistry.MustRegister(responseSize)
|
||||
legacyregistry.MustRegister(rateLimiterLatency)
|
||||
legacyregistry.MustRegister(requestResult)
|
||||
legacyregistry.MustRegister(requestRetry)
|
||||
legacyregistry.RawMustRegister(execPluginCertTTL)
|
||||
legacyregistry.MustRegister(execPluginCertRotation)
|
||||
metrics.Register(metrics.RegisterOpts{
|
||||
@ -162,6 +172,7 @@ func init() {
|
||||
ResponseSize: &sizeAdapter{m: responseSize},
|
||||
RateLimiterLatency: &latencyAdapter{m: rateLimiterLatency},
|
||||
RequestResult: &resultAdapter{requestResult},
|
||||
RequestRetry: &retryAdapter{requestRetry},
|
||||
ExecPluginCalls: &callsAdapter{m: execPluginCalls},
|
||||
})
|
||||
}
|
||||
@ -213,3 +224,11 @@ type callsAdapter struct {
|
||||
func (r *callsAdapter) Increment(code int, callStatus string) {
|
||||
r.m.WithLabelValues(fmt.Sprintf("%d", code), callStatus).Inc()
|
||||
}
|
||||
|
||||
type retryAdapter struct {
|
||||
m *k8smetrics.CounterVec
|
||||
}
|
||||
|
||||
func (r *retryAdapter) IncrementRetry(ctx context.Context, code, method, host string) {
|
||||
r.m.WithContext(ctx).WithLabelValues(code, method, host).Inc()
|
||||
}
|
||||
|
@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright 2022 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package restclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"k8s.io/client-go/tools/metrics"
|
||||
"k8s.io/component-base/metrics/legacyregistry"
|
||||
"k8s.io/component-base/metrics/testutil"
|
||||
)
|
||||
|
||||
func TestClientGOMetrics(t *testing.T) {
|
||||
tests := []struct {
|
||||
description string
|
||||
name string
|
||||
metric interface{}
|
||||
update func()
|
||||
want string
|
||||
}{
|
||||
{
|
||||
description: "Number of HTTP requests, partitioned by status code, verb, and host.",
|
||||
name: "rest_client_requests_total",
|
||||
metric: requestResult,
|
||||
update: func() {
|
||||
metrics.RequestResult.Increment(context.TODO(), "200", "POST", "www.foo.com")
|
||||
},
|
||||
want: `
|
||||
# HELP rest_client_requests_total [ALPHA] Number of HTTP requests, partitioned by status code, verb, and host.
|
||||
# TYPE rest_client_requests_total counter
|
||||
rest_client_requests_total{code="200",host="www.foo.com",verb="POST"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
description: "Number of request retries, partitioned by status code, verb, and host.",
|
||||
name: "rest_client_request_retries_total",
|
||||
metric: requestRetry,
|
||||
update: func() {
|
||||
metrics.RequestRetry.IncrementRetry(context.TODO(), "500", "GET", "www.bar.com")
|
||||
},
|
||||
want: `
|
||||
# HELP rest_client_request_retries_total [ALPHA] Number of request retries, partitioned by status code, verb, and host.
|
||||
# TYPE rest_client_request_retries_total counter
|
||||
rest_client_request_retries_total{code="500",host="www.bar.com",verb="GET"} 1
|
||||
`,
|
||||
},
|
||||
}
|
||||
|
||||
// no need to register the metrics here, since the init function of
|
||||
// the package registers all the client-go metrics.
|
||||
for _, test := range tests {
|
||||
t.Run(test.description, func(t *testing.T) {
|
||||
resetter, resettable := test.metric.(interface {
|
||||
Reset()
|
||||
})
|
||||
if !resettable {
|
||||
t.Fatalf("the metric must be resettaable: %s", test.name)
|
||||
}
|
||||
|
||||
// Since prometheus' gatherer is global, other tests may have updated
|
||||
// metrics already, so we need to reset them prior to running this test.
|
||||
// This also implies that we can't run this test in parallel with other tests.
|
||||
resetter.Reset()
|
||||
test.update()
|
||||
|
||||
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.want), test.name); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user