apiserver/etcd: add request and request_error metrics (#117222)

* apiserver/etcd: add request and request_error metrics

Signed-off-by: iyear <ljyngup@gmail.com>

* apiserver/etcd: rename etcdRequestCounts metric

Signed-off-by: iyear <ljyngup@gmail.com>

---------

Signed-off-by: iyear <ljyngup@gmail.com>
This commit is contained in:
iyear 2023-04-12 22:02:45 +08:00 committed by GitHub
parent d09842e0ad
commit 4f6714ef42
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 168 additions and 16 deletions

View File

@ -47,6 +47,22 @@ var (
},
[]string{"operation", "type"},
)
etcdRequestCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_requests_total",
Help: "Etcd request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
etcdRequestErrorCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_request_errors_total",
Help: "Etcd failed request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "apiserver_storage_objects",
@ -140,6 +156,8 @@ func Register() {
// Register the metrics.
registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(etcdRequestCounts)
legacyregistry.MustRegister(etcdRequestErrorCounts)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.MustRegister(etcdBookmarkCounts)
@ -157,9 +175,15 @@ func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
}
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics.
func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
// RecordEtcdRequest updates and sets the etcd_request_duration_seconds,
// etcd_request_total, etcd_request_errors_total metrics.
func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) {
v := []string{verb, resource}
etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime))
etcdRequestCounts.WithLabelValues(v...).Inc()
if err != nil {
etcdRequestErrorCounts.WithLabelValues(v...).Inc()
}
}
// RecordEtcdEvent updated the etcd_events_received_total metric.
@ -183,7 +207,9 @@ func Reset() {
}
// sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 {
//
// This is a variable to facilitate testing.
var sinceInSeconds = func(start time.Time) float64 {
return time.Since(start).Seconds()
}

View File

@ -17,8 +17,10 @@ limitations under the License.
package metrics
import (
"errors"
"strings"
"testing"
"time"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/testutil"
@ -55,3 +57,125 @@ func TestRecordDecodeError(t *testing.T) {
})
}
}
func TestRecordEtcdRequest(t *testing.T) {
registry := metrics.NewKubeRegistry()
// modify default sinceInSeconds to constant NOW
sinceInSeconds = func(t time.Time) float64 {
return time.Unix(0, 300*int64(time.Millisecond)).Sub(t).Seconds()
}
testedMetrics := []metrics.Registerable{
etcdRequestCounts,
etcdRequestErrorCounts,
etcdRequestLatency,
}
testedMetricsName := make([]string, 0, len(testedMetrics))
for _, m := range testedMetrics {
registry.MustRegister(m)
testedMetricsName = append(testedMetricsName, m.FQName())
}
testCases := []struct {
desc string
operation string
typ string
err error
startTime time.Time
want string
}{
{
desc: "success_request",
operation: "foo",
typ: "bar",
err: nil,
startTime: time.Unix(0, 0), // 0.3s
want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type.
# TYPE etcd_request_duration_seconds histogram
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1
etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3
etcd_request_duration_seconds_count{operation="foo",type="bar"} 1
# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type.
# TYPE etcd_requests_total counter
etcd_requests_total{operation="foo",type="bar"} 1
`,
},
{
desc: "failed_request",
operation: "foo",
typ: "bar",
err: errors.New("some error"),
startTime: time.Unix(0, 0), // 0.3s
want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type.
# TYPE etcd_request_duration_seconds histogram
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1
etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3
etcd_request_duration_seconds_count{operation="foo",type="bar"} 1
# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type.
# TYPE etcd_requests_total counter
etcd_requests_total{operation="foo",type="bar"} 1
# HELP etcd_request_errors_total [ALPHA] Etcd failed request counts for each operation and object type.
# TYPE etcd_request_errors_total counter
etcd_request_errors_total{operation="foo",type="bar"} 1
`,
},
}
for _, test := range testCases {
t.Run(test.desc, func(t *testing.T) {
defer registry.Reset()
RecordEtcdRequest(test.operation, test.typ, test.err, test.startTime)
if err := testutil.GatherAndCompare(registry, strings.NewReader(test.want), testedMetricsName...); err != nil {
t.Fatal(err)
}
})
}
}

View File

@ -136,7 +136,7 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou
}
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@ -210,7 +210,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
).Then(
clientv3.OpPut(preparedKey, string(newData), opts...),
).Commit()
metrics.RecordEtcdRequestLatency("create", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@ -255,7 +255,7 @@ func (s *store) conditionalDelete(
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
@ -337,7 +337,7 @@ func (s *store) conditionalDelete(
).Else(
clientv3.OpGet(key),
).Commit()
metrics.RecordEtcdRequestLatency("delete", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@ -391,7 +391,7 @@ func (s *store) GuaranteedUpdate(
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
@ -512,7 +512,7 @@ func (s *store) GuaranteedUpdate(
).Else(
clientv3.OpGet(preparedKey),
).Commit()
metrics.RecordEtcdRequestLatency("update", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@ -575,7 +575,7 @@ func (s *store) Count(key string) (int64, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
metrics.RecordEtcdRequestLatency("listWithCount", preparedKey, startTime)
metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
if err != nil {
return 0, err
}
@ -720,14 +720,16 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
numReturn := v.Len()
metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
}()
metricsOp := "get"
if recursive {
metricsOp = "list"
}
for {
startTime := time.Now()
getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
if recursive {
metrics.RecordEtcdRequestLatency("list", s.groupResourceString, startTime)
} else {
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
}
metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
if err != nil {
return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix)
}