apiserver/etcd: add request and request_error metrics (#117222)

* apiserver/etcd: add request and request_error metrics

Signed-off-by: iyear <ljyngup@gmail.com>

* apiserver/etcd: rename etcdRequestCounts metric

Signed-off-by: iyear <ljyngup@gmail.com>

---------

Signed-off-by: iyear <ljyngup@gmail.com>
This commit is contained in:
iyear 2023-04-12 22:02:45 +08:00 committed by GitHub
parent d09842e0ad
commit 4f6714ef42
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 168 additions and 16 deletions

View File

@ -47,6 +47,22 @@ var (
}, },
[]string{"operation", "type"}, []string{"operation", "type"},
) )
etcdRequestCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_requests_total",
Help: "Etcd request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
etcdRequestErrorCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_request_errors_total",
Help: "Etcd failed request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
objectCounts = compbasemetrics.NewGaugeVec( objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{ &compbasemetrics.GaugeOpts{
Name: "apiserver_storage_objects", Name: "apiserver_storage_objects",
@ -140,6 +156,8 @@ func Register() {
// Register the metrics. // Register the metrics.
registerMetrics.Do(func() { registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency) legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(etcdRequestCounts)
legacyregistry.MustRegister(etcdRequestErrorCounts)
legacyregistry.MustRegister(objectCounts) legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(dbTotalSize) legacyregistry.MustRegister(dbTotalSize)
legacyregistry.MustRegister(etcdBookmarkCounts) legacyregistry.MustRegister(etcdBookmarkCounts)
@ -157,9 +175,15 @@ func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count)) objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
} }
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics. // RecordEtcdRequest updates and sets the etcd_request_duration_seconds,
func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) { // etcd_request_total, etcd_request_errors_total metrics.
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime)) func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) {
v := []string{verb, resource}
etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime))
etcdRequestCounts.WithLabelValues(v...).Inc()
if err != nil {
etcdRequestErrorCounts.WithLabelValues(v...).Inc()
}
} }
// RecordEtcdEvent updated the etcd_events_received_total metric. // RecordEtcdEvent updated the etcd_events_received_total metric.
@ -183,7 +207,9 @@ func Reset() {
} }
// sinceInSeconds gets the time since the specified start in seconds. // sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 { //
// This is a variable to facilitate testing.
var sinceInSeconds = func(start time.Time) float64 {
return time.Since(start).Seconds() return time.Since(start).Seconds()
} }

View File

@ -17,8 +17,10 @@ limitations under the License.
package metrics package metrics
import ( import (
"errors"
"strings" "strings"
"testing" "testing"
"time"
"k8s.io/component-base/metrics" "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/testutil" "k8s.io/component-base/metrics/testutil"
@ -55,3 +57,125 @@ func TestRecordDecodeError(t *testing.T) {
}) })
} }
} }
func TestRecordEtcdRequest(t *testing.T) {
registry := metrics.NewKubeRegistry()
// modify default sinceInSeconds to constant NOW
sinceInSeconds = func(t time.Time) float64 {
return time.Unix(0, 300*int64(time.Millisecond)).Sub(t).Seconds()
}
testedMetrics := []metrics.Registerable{
etcdRequestCounts,
etcdRequestErrorCounts,
etcdRequestLatency,
}
testedMetricsName := make([]string, 0, len(testedMetrics))
for _, m := range testedMetrics {
registry.MustRegister(m)
testedMetricsName = append(testedMetricsName, m.FQName())
}
testCases := []struct {
desc string
operation string
typ string
err error
startTime time.Time
want string
}{
{
desc: "success_request",
operation: "foo",
typ: "bar",
err: nil,
startTime: time.Unix(0, 0), // 0.3s
want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type.
# TYPE etcd_request_duration_seconds histogram
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1
etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3
etcd_request_duration_seconds_count{operation="foo",type="bar"} 1
# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type.
# TYPE etcd_requests_total counter
etcd_requests_total{operation="foo",type="bar"} 1
`,
},
{
desc: "failed_request",
operation: "foo",
typ: "bar",
err: errors.New("some error"),
startTime: time.Unix(0, 0), // 0.3s
want: `# HELP etcd_request_duration_seconds [ALPHA] Etcd request latency in seconds for each operation and object type.
# TYPE etcd_request_duration_seconds histogram
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.005"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.025"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.05"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.1"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.2"} 0
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="0.8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.25"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="1.5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="2"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="3"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="4"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="5"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="6"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="8"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="10"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="15"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="20"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="30"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="45"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="60"} 1
etcd_request_duration_seconds_bucket{operation="foo",type="bar",le="+Inf"} 1
etcd_request_duration_seconds_sum{operation="foo",type="bar"} 0.3
etcd_request_duration_seconds_count{operation="foo",type="bar"} 1
# HELP etcd_requests_total [ALPHA] Etcd request counts for each operation and object type.
# TYPE etcd_requests_total counter
etcd_requests_total{operation="foo",type="bar"} 1
# HELP etcd_request_errors_total [ALPHA] Etcd failed request counts for each operation and object type.
# TYPE etcd_request_errors_total counter
etcd_request_errors_total{operation="foo",type="bar"} 1
`,
},
}
for _, test := range testCases {
t.Run(test.desc, func(t *testing.T) {
defer registry.Reset()
RecordEtcdRequest(test.operation, test.typ, test.err, test.startTime)
if err := testutil.GatherAndCompare(registry, strings.NewReader(test.want), testedMetricsName...); err != nil {
t.Fatal(err)
}
})
}
}

View File

@ -136,7 +136,7 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou
} }
startTime := time.Now() startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey) getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
return err return err
} }
@ -210,7 +210,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
).Then( ).Then(
clientv3.OpPut(preparedKey, string(newData), opts...), clientv3.OpPut(preparedKey, string(newData), opts...),
).Commit() ).Commit()
metrics.RecordEtcdRequestLatency("create", s.groupResourceString, startTime) metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error())) span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err return err
@ -255,7 +255,7 @@ func (s *store) conditionalDelete(
getCurrentState := func() (*objState, error) { getCurrentState := func() (*objState, error) {
startTime := time.Now() startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key) getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -337,7 +337,7 @@ func (s *store) conditionalDelete(
).Else( ).Else(
clientv3.OpGet(key), clientv3.OpGet(key),
).Commit() ).Commit()
metrics.RecordEtcdRequestLatency("delete", s.groupResourceString, startTime) metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
return err return err
} }
@ -391,7 +391,7 @@ func (s *store) GuaranteedUpdate(
getCurrentState := func() (*objState, error) { getCurrentState := func() (*objState, error) {
startTime := time.Now() startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey) getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime) metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -512,7 +512,7 @@ func (s *store) GuaranteedUpdate(
).Else( ).Else(
clientv3.OpGet(preparedKey), clientv3.OpGet(preparedKey),
).Commit() ).Commit()
metrics.RecordEtcdRequestLatency("update", s.groupResourceString, startTime) metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
if err != nil { if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error())) span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err return err
@ -575,7 +575,7 @@ func (s *store) Count(key string) (int64, error) {
startTime := time.Now() startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly()) getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
metrics.RecordEtcdRequestLatency("listWithCount", preparedKey, startTime) metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
if err != nil { if err != nil {
return 0, err return 0, err
} }
@ -720,14 +720,16 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
numReturn := v.Len() numReturn := v.Len()
metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn) metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
}() }()
metricsOp := "get"
if recursive {
metricsOp = "list"
}
for { for {
startTime := time.Now() startTime := time.Now()
getResp, err = s.client.KV.Get(ctx, preparedKey, options...) getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
if recursive { metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
metrics.RecordEtcdRequestLatency("list", s.groupResourceString, startTime)
} else {
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
}
if err != nil { if err != nil {
return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix) return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix)
} }