diff --git a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics.go index 6f155c0adb2..78c9a598388 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics.go @@ -47,6 +47,22 @@ var ( }, []string{"operation", "type"}, ) + etcdRequestCounts = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Name: "etcd_requests_total", + Help: "Etcd request counts for each operation and object type.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"operation", "type"}, + ) + etcdRequestErrorCounts = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Name: "etcd_request_errors_total", + Help: "Etcd failed request counts for each operation and object type.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"operation", "type"}, + ) objectCounts = compbasemetrics.NewGaugeVec( &compbasemetrics.GaugeOpts{ Name: "apiserver_storage_objects", @@ -140,6 +156,8 @@ func Register() { // Register the metrics. registerMetrics.Do(func() { legacyregistry.MustRegister(etcdRequestLatency) + legacyregistry.MustRegister(etcdRequestCounts) + legacyregistry.MustRegister(etcdRequestErrorCounts) legacyregistry.MustRegister(objectCounts) legacyregistry.MustRegister(dbTotalSize) legacyregistry.MustRegister(etcdBookmarkCounts) @@ -157,9 +175,15 @@ func UpdateObjectCount(resourcePrefix string, count int64) { objectCounts.WithLabelValues(resourcePrefix).Set(float64(count)) } -// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics. -func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) { - etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime)) +// RecordEtcdRequest updates and sets the etcd_request_duration_seconds, +// etcd_request_total, etcd_request_errors_total metrics. +func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) { + v := []string{verb, resource} + etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime)) + etcdRequestCounts.WithLabelValues(v...).Inc() + if err != nil { + etcdRequestErrorCounts.WithLabelValues(v...).Inc() + } } // RecordEtcdEvent updated the etcd_events_received_total metric. @@ -183,7 +207,9 @@ func Reset() { } // sinceInSeconds gets the time since the specified start in seconds. -func sinceInSeconds(start time.Time) float64 { +// +// This is a variable to facilitate testing. +var sinceInSeconds = func(start time.Time) float64 { return time.Since(start).Seconds() } diff --git a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics_test.go b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics_test.go index 76fe533bae2..93d15e4af15 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics_test.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/metrics/metrics_test.go @@ -17,8 +17,10 @@ limitations under the License. package metrics import ( + "errors" "strings" "testing" + "time" "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/testutil" @@ -55,3 +57,125 @@ func TestRecordDecodeError(t *testing.T) { }) } } + +func TestRecordEtcdRequest(t *testing.T) { + registry := metrics.NewKubeRegistry() + + // modify default sinceInSeconds to constant NOW + sinceInSeconds = func(t time.Time) float64 { + return time.Unix(0, 300*int64(time.Millisecond)).Sub(t).Seconds() + } + + testedMetrics := []metrics.Registerable{ + etcdRequestCounts, + etcdRequestErrorCounts, + etcdRequestLatency, + } + + testedMetricsName := make([]string, 0, len(testedMetrics)) + for _, m := range testedMetrics { + registry.MustRegister(m) + testedMetricsName = append(testedMetricsName, m.FQName()) + } + + testCases := []struct { + desc string + operation string + typ string + err error + startTime time.Time + want string + }{ + { + desc: "success_request", + operation: "foo", + typ: "bar", + err: nil, + startTime: time.Unix(0, 0), // 0.3s + want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type. +# TYPE etcd_request_duration_seconds histogram +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1 +etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3 +etcd_request_duration_seconds_count{operation="foo",type="bar"} 1 +# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type. +# TYPE etcd_requests_total counter +etcd_requests_total{operation="foo",type="bar"} 1 +`, + }, + { + desc: "failed_request", + operation: "foo", + typ: "bar", + err: errors.New("some error"), + startTime: time.Unix(0, 0), // 0.3s + want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type. +# TYPE etcd_request_duration_seconds histogram +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1 +etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1 +etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3 +etcd_request_duration_seconds_count{operation="foo",type="bar"} 1 +# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type. +# TYPE etcd_requests_total counter +etcd_requests_total{operation="foo",type="bar"} 1 +# HELP etcd_request_errors_total [ALPHA] Etcd failed request counts for each operation and object type. +# TYPE etcd_request_errors_total counter +etcd_request_errors_total{operation="foo",type="bar"} 1 +`, + }, + } + + for _, test := range testCases { + t.Run(test.desc, func(t *testing.T) { + defer registry.Reset() + RecordEtcdRequest(test.operation, test.typ, test.err, test.startTime) + if err := testutil.GatherAndCompare(registry, strings.NewReader(test.want), testedMetricsName...); err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/store.go b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/store.go index 2fc237de331..0814513a64c 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/etcd3/store.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/etcd3/store.go @@ -136,7 +136,7 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou } startTime := time.Now() getResp, err := s.client.KV.Get(ctx, preparedKey) - metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) if err != nil { return err } @@ -210,7 +210,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ).Then( clientv3.OpPut(preparedKey, string(newData), opts...), ).Commit() - metrics.RecordEtcdRequestLatency("create", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime) if err != nil { span.AddEvent("Txn call failed", attribute.String("err", err.Error())) return err @@ -255,7 +255,7 @@ func (s *store) conditionalDelete( getCurrentState := func() (*objState, error) { startTime := time.Now() getResp, err := s.client.KV.Get(ctx, key) - metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) if err != nil { return nil, err } @@ -337,7 +337,7 @@ func (s *store) conditionalDelete( ).Else( clientv3.OpGet(key), ).Commit() - metrics.RecordEtcdRequestLatency("delete", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime) if err != nil { return err } @@ -391,7 +391,7 @@ func (s *store) GuaranteedUpdate( getCurrentState := func() (*objState, error) { startTime := time.Now() getResp, err := s.client.KV.Get(ctx, preparedKey) - metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) if err != nil { return nil, err } @@ -512,7 +512,7 @@ func (s *store) GuaranteedUpdate( ).Else( clientv3.OpGet(preparedKey), ).Commit() - metrics.RecordEtcdRequestLatency("update", s.groupResourceString, startTime) + metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime) if err != nil { span.AddEvent("Txn call failed", attribute.String("err", err.Error())) return err @@ -575,7 +575,7 @@ func (s *store) Count(key string) (int64, error) { startTime := time.Now() getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly()) - metrics.RecordEtcdRequestLatency("listWithCount", preparedKey, startTime) + metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime) if err != nil { return 0, err } @@ -720,14 +720,16 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption numReturn := v.Len() metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn) }() + + metricsOp := "get" + if recursive { + metricsOp = "list" + } + for { startTime := time.Now() getResp, err = s.client.KV.Get(ctx, preparedKey, options...) - if recursive { - metrics.RecordEtcdRequestLatency("list", s.groupResourceString, startTime) - } else { - metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) - } + metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime) if err != nil { return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix) }