From 894e31f41aed5e6db4275bf8cf89fdd62fe25b1e Mon Sep 17 00:00:00 2001 From: Reza Karimi Date: Tue, 2 Nov 2021 17:53:53 +0000 Subject: [PATCH] making some apiserver metrics stable --- .../pkg/admission/metrics/metrics.go | 103 ++++++++++-------- .../pkg/endpoints/metrics/metrics.go | 4 +- .../testdata/stable-metrics-list.yaml | 84 ++++++++++++++ 3 files changed, 144 insertions(+), 47 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/admission/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/admission/metrics/metrics.go index 4ec5c9b357c..757e26882d5 100644 --- a/staging/src/k8s.io/apiserver/pkg/admission/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/admission/metrics/metrics.go @@ -18,7 +18,6 @@ package metrics import ( "context" - "fmt" "strconv" "time" @@ -45,8 +44,6 @@ const ( ) var ( - // Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default). - latencyBuckets = []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5} latencySummaryMaxAge = 5 * time.Hour // Metrics provides access to all admission metrics. @@ -126,19 +123,68 @@ type AdmissionMetrics struct { func newAdmissionMetrics() *AdmissionMetrics { // Admission metrics for a step of the admission flow. The entire admission flow is broken down into a series of steps // Each step is identified by a distinct type label value. - step := newMetricSet("step", - []string{"type", "operation", "rejected"}, - "Admission sub-step %s, broken out for each operation and API resource and step type (validate or admit).", true) + // Use buckets ranging from 5 ms to 2.5 seconds. + step := &metricSet{ + latencies: metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "step_admission_duration_seconds", + Help: "Admission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).", + Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5}, + StabilityLevel: metrics.STABLE, + }, + []string{"type", "operation", "rejected"}, + ), + + latenciesSummary: metrics.NewSummaryVec( + &metrics.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "step_admission_duration_seconds_summary", + Help: "Admission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).", + MaxAge: latencySummaryMaxAge, + StabilityLevel: metrics.ALPHA, + }, + []string{"type", "operation", "rejected"}, + ), + } // Built-in admission controller metrics. Each admission controller is identified by name. - controller := newMetricSet("controller", - []string{"name", "type", "operation", "rejected"}, - "Admission controller %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false) + // Use buckets ranging from 5 ms to 2.5 seconds. + controller := &metricSet{ + latencies: metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "controller_admission_duration_seconds", + Help: "Admission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).", + Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5}, + StabilityLevel: metrics.STABLE, + }, + []string{"name", "type", "operation", "rejected"}, + ), + + latenciesSummary: nil, + } // Admission webhook metrics. Each webhook is identified by name. - webhook := newMetricSet("webhook", - []string{"name", "type", "operation", "rejected"}, - "Admission webhook %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false) + // Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default). + webhook := &metricSet{ + latencies: metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "webhook_admission_duration_seconds", + Help: "Admission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).", + Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5}, + StabilityLevel: metrics.STABLE, + }, + []string{"name", "type", "operation", "rejected"}, + ), + + latenciesSummary: nil, + } webhookRejection := metrics.NewCounterVec( &metrics.CounterOpts{ @@ -209,39 +255,6 @@ type metricSet struct { latenciesSummary *metrics.SummaryVec } -func newMetricSet(name string, labels []string, helpTemplate string, hasSummary bool) *metricSet { - var summary *metrics.SummaryVec - if hasSummary { - summary = metrics.NewSummaryVec( - &metrics.SummaryOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: fmt.Sprintf("%s_admission_duration_seconds_summary", name), - Help: fmt.Sprintf(helpTemplate, "latency summary in seconds"), - MaxAge: latencySummaryMaxAge, - StabilityLevel: metrics.ALPHA, - }, - labels, - ) - } - - return &metricSet{ - latencies: metrics.NewHistogramVec( - &metrics.HistogramOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: fmt.Sprintf("%s_admission_duration_seconds", name), - Help: fmt.Sprintf(helpTemplate, "latency histogram in seconds"), - Buckets: latencyBuckets, - StabilityLevel: metrics.ALPHA, - }, - labels, - ), - - latenciesSummary: summary, - } -} - // MustRegister registers all the prometheus metrics in the metricSet. func (m *metricSet) mustRegister() { legacyregistry.MustRegister(m.latencies) diff --git a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go index 43ff8b7428c..b72684a1f4b 100644 --- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go +++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go @@ -115,7 +115,7 @@ var ( Help: "Response size distribution in bytes for each group, version, verb, resource, subresource, scope and component.", // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). Buckets: compbasemetrics.ExponentialBuckets(1000, 10.0, 7), - StabilityLevel: compbasemetrics.ALPHA, + StabilityLevel: compbasemetrics.STABLE, }, []string{"verb", "group", "version", "resource", "subresource", "scope", "component"}, ) @@ -169,7 +169,7 @@ var ( &compbasemetrics.GaugeOpts{ Name: "apiserver_current_inflight_requests", Help: "Maximal number of currently used inflight request limit of this apiserver per request kind in last second.", - StabilityLevel: compbasemetrics.ALPHA, + StabilityLevel: compbasemetrics.STABLE, }, []string{"request_kind"}, ) diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml index 4f48d397cb3..459a0c6d1b1 100644 --- a/test/instrumentation/testdata/stable-metrics-list.yaml +++ b/test/instrumentation/testdata/stable-metrics-list.yaml @@ -61,6 +61,69 @@ - 4.096 - 8.192 - 16.384 +- name: controller_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission controller latency histogram in seconds, identified by name and + broken out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: step_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission sub-step latency histogram in seconds, broken out for each operation + and API resource and step type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: webhook_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission webhook latency histogram in seconds, identified by name and broken + out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: apiserver_current_inflight_requests + help: Maximal number of currently used inflight request limit of this apiserver + per request kind in last second. + type: Gauge + stabilityLevel: STABLE + labels: + - request_kind - name: apiserver_request_duration_seconds help: Response latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component. @@ -139,6 +202,27 @@ - resource - subresource - version +- name: apiserver_response_sizes + help: Response size distribution in bytes for each group, version, verb, resource, + subresource, scope and component. + type: Histogram + stabilityLevel: STABLE + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 1000 + - 10000 + - 100000 + - 1e+06 + - 1e+07 + - 1e+08 + - 1e+09 - name: apiserver_storage_objects help: Number of stored objects at the time of last check split by kind. type: Gauge