Merge pull request #106122 from rezakrimi/issue/105862

making some apiserver metrics stable
This commit is contained in:
Kubernetes Prow Robot 2021-11-08 10:55:19 -08:00 committed by GitHub
commit ae550b62da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 144 additions and 47 deletions

View File

@ -18,7 +18,6 @@ package metrics
import (
"context"
"fmt"
"strconv"
"time"
@ -45,8 +44,6 @@ const (
)
var (
// Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default).
latencyBuckets = []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5}
latencySummaryMaxAge = 5 * time.Hour
// Metrics provides access to all admission metrics.
@ -126,19 +123,68 @@ type AdmissionMetrics struct {
func newAdmissionMetrics() *AdmissionMetrics {
// Admission metrics for a step of the admission flow. The entire admission flow is broken down into a series of steps.
// Each step is identified by a distinct type label value.
step := newMetricSet("step",
[]string{"type", "operation", "rejected"},
"Admission sub-step %s, broken out for each operation and API resource and step type (validate or admit).", true)
// Use buckets ranging from 5 ms to 2.5 seconds.
step := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "step_admission_duration_seconds",
Help: "Admission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"type", "operation", "rejected"},
),
latenciesSummary: metrics.NewSummaryVec(
&metrics.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "step_admission_duration_seconds_summary",
Help: "Admission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).",
MaxAge: latencySummaryMaxAge,
StabilityLevel: metrics.ALPHA,
},
[]string{"type", "operation", "rejected"},
),
}
// Built-in admission controller metrics. Each admission controller is identified by name.
controller := newMetricSet("controller",
[]string{"name", "type", "operation", "rejected"},
"Admission controller %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false)
// Use buckets ranging from 5 ms to 2.5 seconds.
controller := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "controller_admission_duration_seconds",
Help: "Admission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"name", "type", "operation", "rejected"},
),
latenciesSummary: nil,
}
// Admission webhook metrics. Each webhook is identified by name.
webhook := newMetricSet("webhook",
[]string{"name", "type", "operation", "rejected"},
"Admission webhook %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false)
// Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default).
webhook := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "webhook_admission_duration_seconds",
Help: "Admission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"name", "type", "operation", "rejected"},
),
latenciesSummary: nil,
}
webhookRejection := metrics.NewCounterVec(
&metrics.CounterOpts{
@ -209,39 +255,6 @@ type metricSet struct {
latenciesSummary *metrics.SummaryVec
}
// newMetricSet builds the metricSet for one family of admission latency metrics.
//
// name is the metric-name prefix: the histogram is named
// "<name>_admission_duration_seconds" and the optional summary
// "<name>_admission_duration_seconds_summary". labels is used as the label set
// of both vectors. helpTemplate is passed to fmt.Sprintf with a single string
// argument ("latency histogram in seconds" or "latency summary in seconds") to
// produce each metric's help text. When hasSummary is false no summary is
// created and the returned latenciesSummary is nil. Both metrics are declared
// at the ALPHA stability level.
func newMetricSet(name string, labels []string, helpTemplate string, hasSummary bool) *metricSet {
// Stays nil unless the caller asked for a summary.
var summary *metrics.SummaryVec
if hasSummary {
summary = metrics.NewSummaryVec(
&metrics.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: fmt.Sprintf("%s_admission_duration_seconds_summary", name),
Help: fmt.Sprintf(helpTemplate, "latency summary in seconds"),
MaxAge: latencySummaryMaxAge,
StabilityLevel: metrics.ALPHA,
},
labels,
)
}
// The histogram is always created; it shares the package-level latencyBuckets.
return &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: fmt.Sprintf("%s_admission_duration_seconds", name),
Help: fmt.Sprintf(helpTemplate, "latency histogram in seconds"),
Buckets: latencyBuckets,
StabilityLevel: metrics.ALPHA,
},
labels,
),
latenciesSummary: summary,
}
}
// mustRegister registers all the prometheus metrics in the metricSet.
func (m *metricSet) mustRegister() {
legacyregistry.MustRegister(m.latencies)

View File

@ -115,7 +115,7 @@ var (
Help: "Response size distribution in bytes for each group, version, verb, resource, subresource, scope and component.",
// Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB).
Buckets: compbasemetrics.ExponentialBuckets(1000, 10.0, 7),
StabilityLevel: compbasemetrics.ALPHA,
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"verb", "group", "version", "resource", "subresource", "scope", "component"},
)
@ -169,7 +169,7 @@ var (
&compbasemetrics.GaugeOpts{
Name: "apiserver_current_inflight_requests",
Help: "Maximal number of currently used inflight request limit of this apiserver per request kind in last second.",
StabilityLevel: compbasemetrics.ALPHA,
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"request_kind"},
)

View File

@ -61,6 +61,69 @@
- 4.096
- 8.192
- 16.384
- name: controller_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission controller latency histogram in seconds, identified by name and
broken out for each operation and API resource and type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- name
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: step_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission sub-step latency histogram in seconds, broken out for each operation
and API resource and step type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: webhook_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission webhook latency histogram in seconds, identified by name and broken
out for each operation and API resource and type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- name
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: apiserver_current_inflight_requests
help: Maximal number of currently used inflight request limit of this apiserver
per request kind in last second.
type: Gauge
stabilityLevel: STABLE
labels:
- request_kind
- name: apiserver_request_duration_seconds
help: Response latency distribution in seconds for each verb, dry run value, group,
version, resource, subresource, scope and component.
@ -139,6 +202,27 @@
- resource
- subresource
- version
- name: apiserver_response_sizes
help: Response size distribution in bytes for each group, version, verb, resource,
subresource, scope and component.
type: Histogram
stabilityLevel: STABLE
labels:
- component
- group
- resource
- scope
- subresource
- verb
- version
buckets:
- 1000
- 10000
- 100000
- 1e+06
- 1e+07
- 1e+08
- 1e+09
- name: apiserver_storage_objects
help: Number of stored objects at the time of last check split by kind.
type: Gauge