From 325f09045f00ffc4d313f6ee584ee071110a279b Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 25 Oct 2022 14:44:09 -0400 Subject: [PATCH 1/5] add automated documentation generation from our static analysis pipeline Change-Id: I66a0b4fd836ad3b51873517f16eb5a73b05bf329 --- test/instrumentation/documentation/main.go | 129 + test/instrumentation/find_stable_metric.go | 19 +- test/instrumentation/main.go | 9 +- test/instrumentation/metric.go | 28 +- test/instrumentation/stability-utils.sh | 25 + .../testdata/documentation-list.yaml | 3455 +++++++++++++++++ .../instrumentation/testdata/documentation.md | 325 ++ .../update-documentation-metrics.sh | 28 + test/instrumentation/update-documentation.sh | 28 + 9 files changed, 4026 insertions(+), 20 deletions(-) create mode 100755 test/instrumentation/documentation/main.go create mode 100644 test/instrumentation/testdata/documentation-list.yaml create mode 100644 test/instrumentation/testdata/documentation.md create mode 100755 test/instrumentation/update-documentation-metrics.sh create mode 100755 test/instrumentation/update-documentation.sh diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go new file mode 100755 index 00000000000..c7e40709a7c --- /dev/null +++ b/test/instrumentation/documentation/main.go @@ -0,0 +1,129 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "bytes" + "fmt" + "os" + "strings" + "text/template" + "time" + + "gopkg.in/yaml.v2" + + "k8s.io/component-base/metrics" +) + +var ( + GOROOT string = os.Getenv("GOROOT") + GOOS string = os.Getenv("GOOS") + KUBE_ROOT string = os.Getenv("KUBE_ROOT") +) + +const ( + templ = `--- +title: Kubernetes Metrics Across Components +content_type: instrumentation +--- + + +## Metrics + +These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). + +(auto-generated {{.GeneratedDate.Format "2006 Jan 02"}}) + +### List of Kubernetes Metrics + + + + + + + + + + + + + +{{range $index, $metric := .Metrics}}{{if not $metric.Labels }}{{else }}{{end}}{{if not $metric.ConstLabels }}{{else }}{{end}} +{{end}} + +
NameStability LevelTypeHelpLabelsConst Labels
{{$metric.Name}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
+` +) + +type templateData struct { + Metrics []metric + GeneratedDate time.Time +} + +func main() { + dat, err := os.ReadFile("test/instrumentation/testdata/documentation-list.yaml") + if err == nil { + metrics := []metric{} + err = yaml.Unmarshal(dat, &metrics) + if err != nil { + println("err", err) + } + t := template.New("t") + t, err := t.Parse(templ) + if err != nil { + println("err", err) + } + var tpl bytes.Buffer + for i, m := range metrics { + m.Help = strings.Join(strings.Split(m.Help, "\n"), " ") + metrics[i] = m + } + data := templateData{ + Metrics: metrics, + GeneratedDate: time.Now(), + } + err = t.Execute(&tpl, data) + if err != nil { + println("err", err) + } + fmt.Print(tpl.String()) + } else { + fmt.Fprintf(os.Stderr, "%s\n", err) + } + +} + +type metric struct { + Name string `yaml:"name" json:"name"` + Subsystem string `yaml:"subsystem,omitempty" json:"subsystem,omitempty"` + Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"` + Help string `yaml:"help,omitempty" json:"help,omitempty"` + Type string `yaml:"type,omitempty" json:"type,omitempty"` + DeprecatedVersion string `yaml:"deprecatedVersion,omitempty" json:"deprecatedVersion,omitempty"` + StabilityLevel string `yaml:"stabilityLevel,omitempty" json:"stabilityLevel,omitempty"` + Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"` + Buckets []float64 `yaml:"buckets,omitempty" json:"buckets,omitempty"` + Objectives map[float64]float64 `yaml:"objectives,omitempty" json:"objectives,omitempty"` + AgeBuckets uint32 `yaml:"ageBuckets,omitempty" json:"ageBuckets,omitempty"` + BufCap uint32 `yaml:"bufCap,omitempty" json:"bufCap,omitempty"` + MaxAge int64 `yaml:"maxAge,omitempty" json:"maxAge,omitempty"` + ConstLabels map[string]string `yaml:"constLabels,omitempty" json:"constLabels,omitempty"` +} + +func (m metric) buildFQName() string { + return metrics.BuildFQName(m.Namespace, m.Subsystem, m.Name) +} diff --git a/test/instrumentation/find_stable_metric.go 
b/test/instrumentation/find_stable_metric.go index 41d3b23a610..3542c1e9d50 100644 --- a/test/instrumentation/find_stable_metric.go +++ b/test/instrumentation/find_stable_metric.go @@ -52,6 +52,15 @@ type stableMetricFinder struct { var _ ast.Visitor = (*stableMetricFinder)(nil) +func contains(v metrics.StabilityLevel, a []metrics.StabilityLevel) bool { + for _, i := range a { + if i == v { + return true + } + } + return false +} + func (f *stableMetricFinder) Visit(node ast.Node) (w ast.Visitor) { switch opts := node.(type) { case *ast.CallExpr: @@ -76,15 +85,19 @@ func (f *stableMetricFinder) Visit(node ast.Node) (w ast.Visitor) { f.errors = append(f.errors, err) return nil } - switch *stabilityLevel { - case metrics.STABLE, metrics.BETA: + classes := []metrics.StabilityLevel{metrics.STABLE, metrics.BETA} + if ALL_STABILITY_CLASSES { + classes = append(classes, metrics.ALPHA) + } + switch { + case contains(*stabilityLevel, classes): if f.currentFunctionCall == nil { f.errors = append(f.errors, newDecodeErrorf(opts, errNotDirectCall)) return nil } f.stableMetricsFunctionCalls = append(f.stableMetricsFunctionCalls, f.currentFunctionCall) f.currentFunctionCall = nil - case metrics.INTERNAL, metrics.ALPHA: + default: return nil } default: diff --git a/test/instrumentation/main.go b/test/instrumentation/main.go index 3a5e9ffcacd..af5465e76f8 100644 --- a/test/instrumentation/main.go +++ b/test/instrumentation/main.go @@ -41,12 +41,15 @@ const ( var ( // env configs - GOROOT string = os.Getenv("GOROOT") - GOOS string = os.Getenv("GOOS") - KUBE_ROOT string = os.Getenv("KUBE_ROOT") + GOROOT string = os.Getenv("GOROOT") + GOOS string = os.Getenv("GOOS") + KUBE_ROOT string = os.Getenv("KUBE_ROOT") + ALL_STABILITY_CLASSES bool ) func main() { + + flag.BoolVar(&ALL_STABILITY_CLASSES, "allstabilityclasses", false, "use this flag to enable all stability classes") flag.Parse() if len(flag.Args()) < 1 { fmt.Fprintf(os.Stderr, "USAGE: %s [...]\n", os.Args[0]) diff --git 
a/test/instrumentation/metric.go b/test/instrumentation/metric.go index 9f2721442da..35abb759a2e 100644 --- a/test/instrumentation/metric.go +++ b/test/instrumentation/metric.go @@ -29,20 +29,20 @@ const ( ) type metric struct { - Name string `yaml:"name"` - Subsystem string `yaml:"subsystem,omitempty"` - Namespace string `yaml:"namespace,omitempty"` - Help string `yaml:"help,omitempty"` - Type string `yaml:"type,omitempty"` - DeprecatedVersion string `yaml:"deprecatedVersion,omitempty"` - StabilityLevel string `yaml:"stabilityLevel,omitempty"` - Labels []string `yaml:"labels,omitempty"` - Buckets []float64 `yaml:"buckets,omitempty"` - Objectives map[float64]float64 `yaml:"objectives,omitempty"` - AgeBuckets uint32 `yaml:"ageBuckets,omitempty"` - BufCap uint32 `yaml:"bufCap,omitempty"` - MaxAge int64 `yaml:"maxAge,omitempty"` - ConstLabels map[string]string `yaml:"constLabels,omitempty"` + Name string `yaml:"name" json:"name"` + Subsystem string `yaml:"subsystem,omitempty" json:"subsystem,omitempty"` + Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"` + Help string `yaml:"help,omitempty" json:"help,omitempty"` + Type string `yaml:"type,omitempty" json:"type,omitempty"` + DeprecatedVersion string `yaml:"deprecatedVersion,omitempty" json:"deprecatedVersion,omitempty"` + StabilityLevel string `yaml:"stabilityLevel,omitempty" json:"stabilityLevel,omitempty"` + Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"` + Buckets []float64 `yaml:"buckets,omitempty" json:"buckets,omitempty"` + Objectives map[float64]float64 `yaml:"objectives,omitempty" json:"objectives,omitempty"` + AgeBuckets uint32 `yaml:"ageBuckets,omitempty" json:"ageBuckets,omitempty"` + BufCap uint32 `yaml:"bufCap,omitempty" json:"bufCap,omitempty"` + MaxAge int64 `yaml:"maxAge,omitempty" json:"maxAge,omitempty"` + ConstLabels map[string]string `yaml:"constLabels,omitempty" json:"constLabels,omitempty"` } func (m metric) buildFQName() string { diff --git 
a/test/instrumentation/stability-utils.sh b/test/instrumentation/stability-utils.sh index 6ab14a54caf..a751ff4834b 100755 --- a/test/instrumentation/stability-utils.sh +++ b/test/instrumentation/stability-utils.sh @@ -105,6 +105,31 @@ kube::update::stablemetrics() { echo "${green}Updated golden list of stable metrics.${reset}" } +kube::update::documentation::list() { + stability_check_setup + temp_file=$(mktemp) + doCheckStability=$(find_files_to_check | grep -E ".*.go" | grep -v ".*_test.go" | sort | KUBE_ROOT=${KUBE_ROOT} xargs -L 200 go run "test/instrumentation/main.go" "test/instrumentation/decode_metric.go" "test/instrumentation/find_stable_metric.go" "test/instrumentation/error.go" "test/instrumentation/metric.go" --allstabilityclasses -- 1>"${temp_file}") + + if ! $doCheckStability; then + echo "${red}!!! updating golden list of metrics has failed! ${reset}" >&2 + exit 1 + fi + mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation-list.yaml" + echo "${green}Updated golden list of stable metrics.${reset}" +} + +kube::update::documentation() { + stability_check_setup + temp_file=$(mktemp) + doUpdateDocs=$(go run "test/instrumentation/documentation/main.go" -- 1>"${temp_file}") + if ! $doUpdateDocs; then + echo "${red}!!! updating documentation has failed! 
${reset}" >&2 + exit 1 + fi + mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation.md" + echo "${green}Updated documentation of metrics.${reset}" +} + kube::update::test::stablemetrics() { stability_check_setup temp_file=$(mktemp) diff --git a/test/instrumentation/testdata/documentation-list.yaml b/test/instrumentation/testdata/documentation-list.yaml new file mode 100644 index 00000000000..281bd57f74b --- /dev/null +++ b/test/instrumentation/testdata/documentation-list.yaml @@ -0,0 +1,3455 @@ +- name: version_info + namespace: etcd + help: Etcd server's binary version + type: Gauge + stabilityLevel: ALPHA + labels: + - binary_version +- name: certificate_manager_client_ttl_seconds + subsystem: kubelet + help: Gauge of the TTL (time-to-live) of the Kubelet's client certificate. The value + is in seconds until certificate expiry (negative if already expired). If client + certificate is invalid or unused, the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: endpointslices_changed_per_sync + subsystem: endpoint_slice_controller + help: Number of EndpointSlices changed on each Service sync + type: Histogram + labels: + - topology +- name: cronjob_job_creation_skew_duration_seconds + subsystem: cronjob_controller + help: Time between when a cronjob is scheduled to be run, and when the corresponding + job is created + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 +- name: changes + subsystem: endpoint_slice_controller + help: Number of EndpointSlice changes + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: desired_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints added on each Service sync + type: Histogram + 
stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: endpoint_slice_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints removed on each Service sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: num_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: ALPHA +- name: syncs + subsystem: endpoint_slice_controller + help: Number of EndpointSlice syncs + type: Counter + stabilityLevel: ALPHA + labels: + - result +- name: addresses_skipped_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of addresses skipped on each Endpoints sync due to being invalid or + exceeding MaxEndpointsPerSubset + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: changes + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlice changes + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: desired_endpoint_slices + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints added on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: 
endpoint_slice_mirroring_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints removed on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_sync_duration + subsystem: endpoint_slice_mirroring_controller + help: Duration of syncEndpoints() in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: endpoints_updated_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints updated on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: num_endpoint_slices + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: ALPHA +- name: resources_sync_error_total + subsystem: garbagecollector_controller + help: Number of garbage collector resources sync errors + type: Counter + stabilityLevel: ALPHA +- name: sync_duration_seconds + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: sync_total + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: job_pods_finished_total + subsystem: job_controller + help: The number of finished Pods that are fully tracked + type: Counter + labels: + - completion_mode + - result +- name: terminated_pods_tracking_finalizer_total + subsystem: job_controller + help: |- + `The number of terminated pods (phase=Failed|Succeeded) + that have the finalizer batch.kubernetes.io/job-tracking + The event label can be "add" or "delete".` + type: Counter + labels: + - event +- name: attachdetach_controller_forced_detaches + help: Number of times the A/D Controller performed a forced detach + type: Counter + stabilityLevel: ALPHA +- name: job_finished_total + subsystem: job_controller + help: The number of finished job + type: Counter + stabilityLevel: ALPHA + labels: + - completion_mode + - result +- name: job_sync_duration_seconds + subsystem: job_controller + help: The time it took to sync a job + type: Histogram + stabilityLevel: ALPHA + labels: + - action + - completion_mode + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: job_sync_total + subsystem: job_controller + help: The number of job syncs + type: Counter + stabilityLevel: ALPHA + labels: + - action + - completion_mode + - result +- name: evictions_number + subsystem: node_collector + help: Number of Node evictions that happened since current instance of NodeController + started, This metric is replaced by node_collector_evictions_total. + type: Counter + deprecatedVersion: 1.24.0 + stabilityLevel: ALPHA + labels: + - zone +- name: unhealthy_nodes_in_zone + subsystem: node_collector + help: Gauge measuring number of not Ready Nodes per zones. + type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: zone_health + subsystem: node_collector + help: Gauge measuring percentage of healthy nodes per zone. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: zone_size + subsystem: node_collector + help: Gauge measuring number of registered Nodes per zones. + type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: cidrset_allocation_tries_per_request + subsystem: node_ipam_controller + help: Number of endpoints added on each Service sync + type: Histogram + stabilityLevel: ALPHA + labels: + - clusterCIDR + buckets: + - 1 + - 5 + - 25 + - 125 + - 625 +- name: cidrset_cidrs_allocations_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR allocations. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: cidrset_cidrs_releases_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR releases. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: cidrset_usage_cidrs + subsystem: node_ipam_controller + help: Gauge measuring percentage of allocated CIDRs. + type: Gauge + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_allocation_tries_per_request + subsystem: node_ipam_controller + help: Histogram measuring CIDR allocation tries per request. + type: Histogram + stabilityLevel: ALPHA + labels: + - clusterCIDR + buckets: + - 1 + - 5 + - 25 + - 125 + - 625 +- name: multicidrset_cidrs_allocations_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR allocations. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_cidrs_releases_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR releases. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_usage_cidrs + subsystem: node_ipam_controller + help: Gauge measuring percentage of allocated CIDRs. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: sorting_deletion_age_ratio + subsystem: replicaset_controller + help: The ratio of chosen deleted pod's ages to the current youngest pod's age (at + the time). Should be <2.The intent of this metric is to measure the rough efficacy + of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) + of pods when a replicaset scales down. This only considers Ready pods when calculating + and reporting. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 +- name: job_deletion_duration_seconds + subsystem: ttl_after_finished_controller + help: The time it took to delete the job since it became eligible for deletion + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1.6 + - 3.2 + - 6.4 + - 12.8 + - 25.6 + - 51.2 + - 102.4 + - 204.8 + - 409.6 + - 819.2 +- name: evictions_total + subsystem: node_collector + help: Number of Node evictions that happened since current instance of NodeController + started. + type: Counter + stabilityLevel: STABLE + labels: + - zone +- name: create_failures_total + subsystem: ephemeral_volume_controller + help: Number of PersistenVolumeClaims creation requests + type: Counter + stabilityLevel: ALPHA +- name: create_total + subsystem: ephemeral_volume_controller + help: Number of PersistenVolumeClaims creation requests + type: Counter + stabilityLevel: ALPHA +- name: client_expiration_renew_errors + subsystem: certificate_manager + namespace: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA +- name: certificate_manager_server_rotation_seconds + subsystem: kubelet + help: Histogram of the number of seconds the previous certificate lived before being + rotated. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 60 + - 3600 + - 14400 + - 86400 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 + - 1.24416e+08 +- name: certificate_manager_server_ttl_seconds + subsystem: kubelet + help: Gauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. + The value is in seconds until certificate expiry (negative if already expired). + If serving certificate is invalid or unused, the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: kubelet_credential_provider_plugin_duration + subsystem: kubelet + help: Duration of execution in seconds for credential provider plugin + type: Histogram + stabilityLevel: ALPHA + labels: + - plugin_name + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: kubelet_credential_provider_plugin_errors + subsystem: kubelet + help: Number of errors from credential provider plugin + type: Counter + stabilityLevel: ALPHA + labels: + - plugin_name +- name: server_expiration_renew_errors + subsystem: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA +- name: volume_operation_total_errors + help: Total volume operation errors + type: Counter + stabilityLevel: ALPHA + labels: + - operation_name + - plugin_name +- name: cgroup_manager_duration_seconds + subsystem: kubelet + help: Duration in seconds for cgroup manager operations. Broken down by method. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: containers_per_pod_count + subsystem: kubelet + help: The number of containers per pod. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 +- name: device_plugin_alloc_duration_seconds + subsystem: kubelet + help: Duration in seconds to serve a device plugin Allocation request. 
Broken down + by resource name. + type: Histogram + stabilityLevel: ALPHA + labels: + - resource_name + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: device_plugin_registration_total + subsystem: kubelet + help: Cumulative number of device plugin registrations. Broken down by resource + name. + type: Counter + stabilityLevel: ALPHA + labels: + - resource_name +- name: eviction_stats_age_seconds + subsystem: kubelet + help: Time between when stats are collected, and when pod is evicted based on those + stats by eviction signal + type: Histogram + stabilityLevel: ALPHA + labels: + - eviction_signal + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: evictions + subsystem: kubelet + help: Cumulative number of pod evictions by eviction signal + type: Counter + stabilityLevel: ALPHA + labels: + - eviction_signal +- name: graceful_shutdown_end_time_seconds + subsystem: kubelet + help: Last graceful shutdown start time since unix epoch in seconds + type: Gauge + stabilityLevel: ALPHA +- name: graceful_shutdown_start_time_seconds + subsystem: kubelet + help: Last graceful shutdown start time since unix epoch in seconds + type: Gauge + stabilityLevel: ALPHA +- name: http_inflight_requests + subsystem: kubelet + help: Number of the inflight http requests + type: Gauge + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type +- name: http_requests_duration_seconds + subsystem: kubelet + help: Duration in seconds to serve http requests + type: Histogram + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: http_requests_total + subsystem: kubelet + help: Number of the http requests received since the server started + type: Counter + stabilityLevel: ALPHA + labels: + - long_running + - 
method + - path + - server_type +- name: lifecycle_handler_http_fallbacks_total + subsystem: kubelet + help: The number of times lifecycle handlers successfully fell back to http from + https. + type: Counter + stabilityLevel: ALPHA +- name: managed_ephemeral_containers + subsystem: kubelet + help: Current number of ephemeral containers in pods managed by this kubelet. Ephemeral + containers will be ignored if disabled by the EphemeralContainers feature gate, + and this number will be 0. + type: Gauge + stabilityLevel: ALPHA +- name: node_name + subsystem: kubelet + help: The node's name. The count is always 1. + type: Gauge + stabilityLevel: ALPHA + labels: + - node +- name: pleg_discard_events + subsystem: kubelet + help: The number of discard events in PLEG. + type: Counter + stabilityLevel: ALPHA +- name: pleg_last_seen_seconds + subsystem: kubelet + help: Timestamp in seconds when PLEG was last seen active. + type: Gauge + stabilityLevel: ALPHA +- name: pleg_relist_duration_seconds + subsystem: kubelet + help: Duration in seconds for relisting pods in PLEG. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pleg_relist_interval_seconds + subsystem: kubelet + help: Interval in seconds between relisting in PLEG. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_resources_endpoint_errors_get_allocatable + subsystem: kubelet + help: Number of requests to the PodResource GetAllocatableResources endpoint which + returned error. Broken down by server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_errors_list + subsystem: kubelet + help: Number of requests to the PodResource List endpoint which returned error. + Broken down by server api version. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_get_allocatable + subsystem: kubelet + help: Number of requests to the PodResource GetAllocatableResources endpoint. Broken + down by server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_list + subsystem: kubelet + help: Number of requests to the PodResource List endpoint. Broken down by server + api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_total + subsystem: kubelet + help: Cumulative number of requests to the PodResource endpoint. Broken down by + server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_start_duration_seconds + subsystem: kubelet + help: Duration in seconds from kubelet seeing a pod for the first time to the pod + starting to run + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_status_sync_duration_seconds + subsystem: kubelet + help: Duration in seconds to sync a pod status update. Measures time from detection + of a change to pod status until the API is successfully updated for that pod, + even if multiple intevening changes to pod status occur. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.01 + - 0.05 + - 0.1 + - 0.5 + - 1 + - 5 + - 10 + - 20 + - 30 + - 45 + - 60 +- name: pod_worker_duration_seconds + subsystem: kubelet + help: 'Duration in seconds to sync a single pod. 
Broken down by operation type: + create, update, or sync' + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_worker_start_duration_seconds + subsystem: kubelet + help: Duration in seconds from kubelet seeing a pod to starting a worker. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: preemptions + subsystem: kubelet + help: Cumulative number of pod preemptions by preemption resource + type: Counter + stabilityLevel: ALPHA + labels: + - preemption_signal +- name: run_podsandbox_duration_seconds + subsystem: kubelet + help: Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler. + type: Histogram + stabilityLevel: ALPHA + labels: + - runtime_handler + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: run_podsandbox_errors_total + subsystem: kubelet + help: Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler. + type: Counter + stabilityLevel: ALPHA + labels: + - runtime_handler +- name: running_containers + subsystem: kubelet + help: Number of containers currently running + type: Gauge + stabilityLevel: ALPHA + labels: + - container_state +- name: running_pods + subsystem: kubelet + help: Number of pods that have a running pod sandbox + type: Gauge + stabilityLevel: ALPHA +- name: runtime_operations_duration_seconds + subsystem: kubelet + help: Duration in seconds of runtime operations. Broken down by operation type. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.0125 + - 0.03125 + - 0.078125 + - 0.1953125 + - 0.48828125 + - 1.220703125 + - 3.0517578125 + - 7.62939453125 + - 19.073486328125 + - 47.6837158203125 + - 119.20928955078125 + - 298.0232238769531 + - 745.0580596923828 +- name: runtime_operations_errors_total + subsystem: kubelet + help: Cumulative number of runtime operation errors by operation type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation_type +- name: runtime_operations_total + subsystem: kubelet + help: Cumulative number of runtime operations by operation type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation_type +- name: started_containers_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting containers + type: Counter + stabilityLevel: ALPHA + labels: + - code + - container_type +- name: started_containers_total + subsystem: kubelet + help: Cumulative number of containers started + type: Counter + stabilityLevel: ALPHA + labels: + - container_type +- name: started_host_process_containers_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting hostprocess containers. This metric + will only be collected on Windows and requires WindowsHostProcessContainers feature + gate to be enabled. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - container_type +- name: started_host_process_containers_total + subsystem: kubelet + help: Cumulative number of hostprocess containers started. This metric will only + be collected on Windows and requires WindowsHostProcessContainers feature gate + to be enabled. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - container_type +- name: started_pods_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting pods + type: Counter + stabilityLevel: ALPHA +- name: started_pods_total + subsystem: kubelet + help: Cumulative number of pods started + type: Counter + stabilityLevel: ALPHA +- name: volume_metric_collection_duration_seconds + subsystem: kubelet + help: Duration in seconds to calculate volume stats + type: Histogram + stabilityLevel: ALPHA + labels: + - metric_source + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: probe_duration_seconds + subsystem: prober + help: Duration in seconds for a probe response. + type: Histogram + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - probe_type +- name: probe_total + subsystem: prober + help: Cumulative number of a liveness, readiness or startup probe for a container + by result. + type: Counter + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - pod_uid + - probe_type + - result +- name: csr_honored_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration that was honored, sliced + by signer (only kubernetes.io signer names are specifically identified) + type: Counter + stabilityLevel: ALPHA + labels: + - signerName +- name: csr_requested_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration, sliced by signer (only + kubernetes.io signer names are specifically identified) + type: Counter + stabilityLevel: ALPHA + labels: + - signerName +- name: network_programming_duration_seconds + subsystem: kubeproxy + help: In Cluster Network Programming Latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.25 + - 0.5 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + 
- 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 + - 22 + - 23 + - 24 + - 25 + - 26 + - 27 + - 28 + - 29 + - 30 + - 31 + - 32 + - 33 + - 34 + - 35 + - 36 + - 37 + - 38 + - 39 + - 40 + - 41 + - 42 + - 43 + - 44 + - 45 + - 46 + - 47 + - 48 + - 49 + - 50 + - 51 + - 52 + - 53 + - 54 + - 55 + - 56 + - 57 + - 58 + - 59 + - 60 + - 65 + - 70 + - 75 + - 80 + - 85 + - 90 + - 95 + - 100 + - 105 + - 110 + - 115 + - 120 + - 150 + - 180 + - 210 + - 240 + - 270 + - 300 +- name: sync_proxy_rules_duration_seconds + subsystem: kubeproxy + help: SyncProxyRules latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: sync_proxy_rules_endpoint_changes_pending + subsystem: kubeproxy + help: Pending proxy rules Endpoint changes + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_endpoint_changes_total + subsystem: kubeproxy + help: Cumulative proxy rules Endpoint changes + type: Counter + stabilityLevel: ALPHA +- name: sync_proxy_rules_iptables_restore_failures_total + subsystem: kubeproxy + help: Cumulative proxy iptables restore failures + type: Counter + stabilityLevel: ALPHA +- name: sync_proxy_rules_iptables_total + subsystem: kubeproxy + help: Number of proxy iptables rules programmed + type: Gauge + stabilityLevel: ALPHA + labels: + - table +- name: sync_proxy_rules_last_queued_timestamp_seconds + subsystem: kubeproxy + help: The last time a sync of proxy rules was queued + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_last_timestamp_seconds + subsystem: kubeproxy + help: The last time proxy rules were successfully synced + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_no_local_endpoints_total + subsystem: kubeproxy + help: Number of services with a Local traffic policy and no endpoints + type: Gauge + stabilityLevel: ALPHA + labels: + - 
traffic_policy +- name: sync_proxy_rules_service_changes_pending + subsystem: kubeproxy + help: Pending proxy rules Service changes + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_service_changes_total + subsystem: kubeproxy + help: Cumulative proxy rules Service changes + type: Counter + stabilityLevel: ALPHA +- name: volume_manager_selinux_container_errors_total + help: Number of errors when kubelet cannot compute SELinux context for a container. + Kubelet can't start such a Pod then and it will retry, therefore value of this + metric may not represent the actual nr. of containers. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_container_warnings_total + help: Number of errors when kubelet cannot compute SELinux context for a container + that are ignored. They will become real errors when SELinuxMountReadWriteOncePod + feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_pod_context_mismatch_errors_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. Kubelet can't start such a Pod then and it will retry, + therefore value of this metric may not represent the actual nr. of Pods. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_pod_context_mismatch_warnings_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. They are not errors yet, but they will become real errors + when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volume_context_mismatch_errors_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. Kubelet can't start such a Pod then and it + will retry, therefore value of this metric may not represent the actual nr. 
of + Pods. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volume_context_mismatch_warnings_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. They are not errors yet, but they will become + real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume + access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volumes_admitted_total + help: Number of volumes whose SELinux context was fine and will be mounted with + mount -o context option. + type: Gauge + stabilityLevel: ALPHA +- name: allocated_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of allocated IPs for Services + type: Gauge + stabilityLevel: ALPHA + labels: + - cidr +- name: allocation_errors_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of errors trying to allocate Cluster IPs + type: Counter + stabilityLevel: ALPHA + labels: + - cidr + - scope +- name: allocation_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of Cluster IPs allocations + type: Counter + stabilityLevel: ALPHA + labels: + - cidr + - scope +- name: available_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of available IPs for Services + type: Gauge + stabilityLevel: ALPHA + labels: + - cidr +- name: pods_logs_backend_tls_failure_total + subsystem: pod_logs + namespace: kube_apiserver + help: Total number of requests for pods/logs that failed due to kubelet server TLS + verification + type: Counter + stabilityLevel: ALPHA +- name: pods_logs_insecure_backend_total + subsystem: pod_logs + namespace: kube_apiserver + help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, + skip_tls_allowed, skip_tls_denied' + type: Counter + stabilityLevel: ALPHA + labels: + - usage +- name: 
e2e_scheduling_duration_seconds + subsystem: scheduler + help: E2e scheduling latency in seconds (scheduling algorithm + binding). This metric + is replaced by scheduling_attempt_duration_seconds. + type: Histogram + deprecatedVersion: 1.23.0 + stabilityLevel: ALPHA + labels: + - profile + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: goroutines + subsystem: scheduler + help: Number of running goroutines split by the work they do such as binding. + type: Gauge + stabilityLevel: ALPHA + labels: + - operation +- name: permit_wait_duration_seconds + subsystem: scheduler + help: Duration of waiting on permit. + type: Histogram + stabilityLevel: ALPHA + labels: + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: plugin_execution_duration_seconds + subsystem: scheduler + help: Duration for running a plugin at a specific extension point. + type: Histogram + stabilityLevel: ALPHA + labels: + - extension_point + - plugin + - status + buckets: + - 1e-05 + - 1.5000000000000002e-05 + - 2.2500000000000005e-05 + - 3.375000000000001e-05 + - 5.062500000000001e-05 + - 7.593750000000002e-05 + - 0.00011390625000000003 + - 0.00017085937500000006 + - 0.0002562890625000001 + - 0.00038443359375000017 + - 0.0005766503906250003 + - 0.0008649755859375004 + - 0.0012974633789062506 + - 0.0019461950683593758 + - 0.0029192926025390638 + - 0.004378938903808595 + - 0.006568408355712893 + - 0.009852612533569338 + - 0.014778918800354007 + - 0.02216837820053101 +- name: scheduler_cache_size + subsystem: scheduler + help: Number of nodes, pods, and assumed (bound) pods in the scheduler cache. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - type +- name: scheduler_goroutines + subsystem: scheduler + help: Number of running goroutines split by the work they do such as binding. This + metric is replaced by the \"goroutines\" metric. + type: Gauge + deprecatedVersion: 1.26.0 + stabilityLevel: ALPHA + labels: + - work +- name: scheduling_algorithm_duration_seconds + subsystem: scheduler + help: Scheduling algorithm latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: unschedulable_pods + subsystem: scheduler + help: The number of unschedulable pods broken down by plugin name. A pod will increment + the gauge for all plugins that caused it to not schedule and so this metric have + meaning only when broken down by plugin. + type: Gauge + stabilityLevel: ALPHA + labels: + - plugin + - profile +- name: binder_cache_requests_total + subsystem: scheduler_volume + help: Total number for request volume binding cache + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: scheduling_stage_error_total + subsystem: scheduler_volume + help: Volume scheduling stage error count + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: legacy_tokens_total + subsystem: serviceaccount + help: Cumulative legacy service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: stale_tokens_total + subsystem: serviceaccount + help: Cumulative stale projected service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: valid_tokens_total + subsystem: serviceaccount + help: Cumulative valid projected service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: framework_extension_point_duration_seconds + subsystem: scheduler + help: Latency for running all plugins of a specific extension point. 
+ type: Histogram + stabilityLevel: STABLE + labels: + - extension_point + - profile + - status + buckets: + - 0.0001 + - 0.0002 + - 0.0004 + - 0.0008 + - 0.0016 + - 0.0032 + - 0.0064 + - 0.0128 + - 0.0256 + - 0.0512 + - 0.1024 + - 0.2048 +- name: pending_pods + subsystem: scheduler + help: Number of pending pods, by the queue type. 'active' means number of pods in + activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number + of pods in unschedulablePods. + type: Gauge + stabilityLevel: STABLE + labels: + - queue +- name: pod_scheduling_attempts + subsystem: scheduler + help: Number of attempts to successfully schedule a pod. + type: Histogram + stabilityLevel: STABLE + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 +- name: pod_scheduling_duration_seconds + subsystem: scheduler + help: E2e latency for a pod being scheduled which may include multiple scheduling + attempts. + type: Histogram + stabilityLevel: STABLE + labels: + - attempts + buckets: + - 0.01 + - 0.02 + - 0.04 + - 0.08 + - 0.16 + - 0.32 + - 0.64 + - 1.28 + - 2.56 + - 5.12 + - 10.24 + - 20.48 + - 40.96 + - 81.92 + - 163.84 + - 327.68 + - 655.36 + - 1310.72 + - 2621.44 + - 5242.88 +- name: preemption_attempts_total + subsystem: scheduler + help: Total preemption attempts in the cluster till now + type: Counter + stabilityLevel: STABLE +- name: preemption_victims + subsystem: scheduler + help: Number of selected preemption victims + type: Histogram + stabilityLevel: STABLE + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 +- name: queue_incoming_pods_total + subsystem: scheduler + help: Number of pods added to scheduling queues by event and queue type. + type: Counter + stabilityLevel: STABLE + labels: + - event + - queue +- name: schedule_attempts_total + subsystem: scheduler + help: Number of attempts to schedule pods, by the result. 'unschedulable' means + a pod could not be scheduled, while 'error' means an internal scheduler problem. 
+ type: Counter + stabilityLevel: STABLE + labels: + - profile + - result +- name: scheduling_attempt_duration_seconds + subsystem: scheduler + help: Scheduling attempt latency in seconds (scheduling algorithm + binding) + type: Histogram + stabilityLevel: STABLE + labels: + - profile + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: operations_seconds + subsystem: csi + help: Container Storage Interface operation duration with gRPC error code status + total + type: Histogram + stabilityLevel: ALPHA + labels: + - driver_name + - grpc_status_code + - method_name + - migrated + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: storage_operation_duration_seconds + help: Storage operation duration + type: Histogram + stabilityLevel: ALPHA + labels: + - migrated + - operation_name + - status + - volume_plugin + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: volume_operation_total_seconds + help: Storage operation end to end duration in seconds + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_name + - plugin_name + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: graph_actions_duration_seconds + subsystem: node_authorizer + help: Histogram of duration of graph actions in node authorizer. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation + buckets: + - 0.0001 + - 0.0002 + - 0.0004 + - 0.0008 + - 0.0016 + - 0.0032 + - 0.0064 + - 0.0128 + - 0.0256 + - 0.0512 + - 0.1024 + - 0.2048 +- name: apiextensions_openapi_v2_regeneration_count + help: Counter of OpenAPI v2 spec regeneration count broken down by causing CRD name + and reason. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - crd + - reason +- name: apiextensions_openapi_v3_regeneration_count + help: Counter of OpenAPI v3 spec regeneration count broken down by group, version, + causing CRD and reason. + type: Counter + stabilityLevel: ALPHA + labels: + - crd + - group + - reason + - version +- name: apiserver_crd_webhook_conversion_duration_seconds + help: CRD webhook conversion duration in seconds + type: Histogram + stabilityLevel: ALPHA + labels: + - crd_name + - from_version + - succeeded + - to_version + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: step_admission_duration_seconds_summary + subsystem: admission + namespace: apiserver + help: Admission sub-step latency summary in seconds, broken out for each operation + and API resource and step type (validate or admit). + type: Summary + stabilityLevel: ALPHA + labels: + - operation + - rejected + - type + maxAge: 18000000000000 +- name: webhook_fail_open_count + subsystem: admission + namespace: apiserver + help: Admission webhook fail open count, identified by name and broken out for each + admission type (validating or mutating). + type: Counter + stabilityLevel: ALPHA + labels: + - name + - type +- name: webhook_rejection_count + subsystem: admission + namespace: apiserver + help: Admission webhook rejection count, identified by name and broken out for each + admission type (validating or admit) and operation. Additional labels specify + an error type (calling_webhook_error or apiserver_internal_error if an error occurred; + no_error otherwise) and optionally a non-zero rejection code if the webhook rejects + the request with an HTTP status code (honored by the apiserver when the code is + greater or equal to 400). Codes greater than 600 are truncated to 600, to keep + the metrics cardinality bounded. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - error_type + - name + - operation + - rejection_code + - type +- name: webhook_request_total + subsystem: admission + namespace: apiserver + help: Admission webhook request total, identified by name and broken out for each + admission type (validating or mutating) and operation. Additional labels specify + whether the request was rejected or not and an HTTP status code. Codes greater + than 600 are truncated to 600, to keep the metrics cardinality bounded. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - name + - operation + - rejected + - type +- name: error_total + subsystem: apiserver_audit + help: Counter of audit events that failed to be audited properly. Plugin identifies + the plugin affected by the error. + type: Counter + stabilityLevel: ALPHA + labels: + - plugin +- name: event_total + subsystem: apiserver_audit + help: Counter of audit events generated and sent to the audit backend. + type: Counter + stabilityLevel: ALPHA +- name: level_total + subsystem: apiserver_audit + help: Counter of policy levels for audit events (1 per request). + type: Counter + stabilityLevel: ALPHA + labels: + - level +- name: requests_rejected_total + subsystem: apiserver_audit + help: Counter of apiserver requests rejected due to an error in audit logging backend. + type: Counter + stabilityLevel: ALPHA +- name: apiserver_delegated_authn_request_duration_seconds + help: Request latency in seconds. Broken down by status code. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.25 + - 0.5 + - 0.7 + - 1 + - 1.5 + - 3 + - 5 + - 10 +- name: apiserver_delegated_authn_request_total + help: Number of HTTP requests partitioned by status code. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: controller_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission controller latency histogram in seconds, identified by name and + broken out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: step_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission sub-step latency histogram in seconds, broken out for each operation + and API resource and step type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: webhook_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission webhook latency histogram in seconds, identified by name and broken + out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: compilation_duration_seconds + subsystem: cel + namespace: apiserver + type: Histogram + stabilityLevel: ALPHA +- name: evaluation_duration_seconds + subsystem: cel + namespace: apiserver + type: Histogram + stabilityLevel: ALPHA +- name: certificate_expiration_seconds + subsystem: client + namespace: apiserver + help: Distribution of the remaining lifetime on the certificate used to authenticate + a request. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 0 + - 1800 + - 3600 + - 7200 + - 21600 + - 43200 + - 86400 + - 172800 + - 345600 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 +- name: current_inqueue_requests + subsystem: apiserver + help: Maximal number of queued requests in this apiserver per request kind in last + second. + type: Gauge + stabilityLevel: ALPHA + labels: + - request_kind +- name: apiserver_delegated_authz_request_duration_seconds + help: Request latency in seconds. Broken down by status code. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.25 + - 0.5 + - 0.7 + - 1 + - 1.5 + - 3 + - 5 + - 10 +- name: apiserver_delegated_authz_request_total + help: Number of HTTP requests partitioned by status code. + type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: dial_duration_seconds + subsystem: egress_dialer + namespace: apiserver + help: Dial latency histogram in seconds, labeled by the protocol (http-connect or + grpc), transport (tcp or uds) + type: Histogram + stabilityLevel: ALPHA + labels: + - protocol + - transport + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 2.5 + - 12.5 +- name: dial_failure_count + subsystem: egress_dialer + namespace: apiserver + help: Dial failure count, labeled by the protocol (http-connect or grpc), transport + (tcp or uds), and stage (connect or proxy). The stage indicates at which stage + the dial failed + type: Counter + stabilityLevel: ALPHA + labels: + - protocol + - stage + - transport +- name: request_aborts_total + subsystem: apiserver + help: Number of requests which apiserver aborted possibly due to a timeout, for + each group, version, verb, resource, subresource and scope + type: Counter + stabilityLevel: ALPHA + labels: + - group + - resource + - scope + - subresource + - verb + - version +- name: request_body_sizes + subsystem: apiserver + help: Apiserver request body sizes broken out by size. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - resource + - verb + buckets: + - 50000 + - 150000 + - 250000 + - 350000 + - 450000 + - 550000 + - 650000 + - 750000 + - 850000 + - 950000 + - 1.05e+06 + - 1.15e+06 + - 1.25e+06 + - 1.35e+06 + - 1.45e+06 + - 1.55e+06 + - 1.65e+06 + - 1.75e+06 + - 1.85e+06 + - 1.95e+06 + - 2.05e+06 + - 2.15e+06 + - 2.25e+06 + - 2.35e+06 + - 2.45e+06 + - 2.55e+06 + - 2.65e+06 + - 2.75e+06 + - 2.85e+06 + - 2.95e+06 + - 3.05e+06 +- name: request_filter_duration_seconds + subsystem: apiserver + help: Request filter latency distribution in seconds, for each filter type + type: Histogram + stabilityLevel: ALPHA + labels: + - filter + buckets: + - 0.0001 + - 0.0003 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.3 + - 1 + - 5 +- name: request_post_timeout_total + subsystem: apiserver + help: Tracks the activity of the request handlers after the associated requests + have been timed out by the apiserver + type: Counter + stabilityLevel: ALPHA + labels: + - source + - status +- name: request_slo_duration_seconds + subsystem: apiserver + help: Response latency distribution (not counting webhook duration) in seconds for + each verb, group, version, resource, subresource, scope and component. + type: Histogram + stabilityLevel: ALPHA + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: request_terminations_total + subsystem: apiserver + help: Number of requests which apiserver terminated in self-defense. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code + - component + - group + - resource + - scope + - subresource + - verb + - version +- name: request_timestamp_comparison_time + subsystem: apiserver + help: Time taken for comparison of old vs new objects in UPDATE or PATCH requests + type: Histogram + stabilityLevel: ALPHA + labels: + - code_path + buckets: + - 0.0001 + - 0.0003 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.3 + - 1 + - 5 +- name: selfrequest_total + subsystem: apiserver + help: Counter of apiserver self-requests broken out for each verb, API resource + and subresource. + type: Counter + stabilityLevel: ALPHA + labels: + - resource + - subresource + - verb +- name: tls_handshake_errors_total + subsystem: apiserver + help: Number of requests dropped with 'TLS handshake error from' error + type: Counter + stabilityLevel: ALPHA +- name: watch_events_sizes + subsystem: apiserver + help: Watch event size distribution in bytes + type: Histogram + stabilityLevel: ALPHA + labels: + - group + - kind + - version + buckets: + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 + - 65536 + - 131072 +- name: watch_events_total + subsystem: apiserver + help: Number of events sent in watch clients + type: Counter + stabilityLevel: ALPHA + labels: + - group + - kind + - version +- name: authenticated_user_requests + help: Counter of authenticated requests broken out by username. + type: Counter + stabilityLevel: ALPHA + labels: + - username +- name: authentication_attempts + help: Counter of authenticated attempts. + type: Counter + stabilityLevel: ALPHA + labels: + - result +- name: authentication_duration_seconds + help: Authentication duration in seconds broken out by result. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: active_fetch_count + subsystem: token_cache + namespace: authentication + type: Gauge + stabilityLevel: ALPHA + labels: + - status +- name: fetch_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status +- name: request_duration_seconds + subsystem: token_cache + namespace: authentication + type: Histogram + stabilityLevel: ALPHA + labels: + - status +- name: request_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status +- name: field_validation_request_duration_seconds + help: Response latency distribution in seconds for each field validation value and + whether field validation is enabled or not + type: Histogram + stabilityLevel: ALPHA + labels: + - enabled + - field_validation + buckets: + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: current_inflight_requests + subsystem: apiserver + help: Maximal number of currently used inflight request limit of this apiserver + per request kind in last second. + type: Gauge + stabilityLevel: STABLE + labels: + - request_kind +- name: longrunning_requests + subsystem: apiserver + help: Gauge of all active long-running apiserver requests broken out by verb, group, + version, resource, scope and component. Not all requests are tracked this way. 
+ type: Gauge + stabilityLevel: STABLE + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version +- name: request_duration_seconds + subsystem: apiserver + help: Response latency distribution in seconds for each verb, dry run value, group, + version, resource, subresource, scope and component. + type: Histogram + stabilityLevel: STABLE + labels: + - component + - dry_run + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: request_total + subsystem: apiserver + help: Counter of apiserver requests broken out for each verb, dry run value, group, + version, resource, scope, component, and HTTP response code. + type: Counter + stabilityLevel: STABLE + labels: + - code + - component + - dry_run + - group + - resource + - scope + - subresource + - verb + - version +- name: requested_deprecated_apis + subsystem: apiserver + help: Gauge of deprecated APIs that have been requested, broken out by API group, + version, resource, subresource, and removed_release. + type: Gauge + stabilityLevel: STABLE + labels: + - group + - removed_release + - resource + - subresource + - version +- name: response_sizes + subsystem: apiserver + help: Response size distribution in bytes for each group, version, verb, resource, + subresource, scope and component. 
+ type: Histogram + stabilityLevel: STABLE + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 1000 + - 10000 + - 100000 + - 1e+06 + - 1e+07 + - 1e+08 + - 1e+09 +- name: cache_list_fetched_objects_total + namespace: apiserver + help: Number of objects read from watch cache in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: cache_list_returned_objects_total + namespace: apiserver + help: Number of objects returned for a LIST request from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - resource_prefix +- name: cache_list_total + namespace: apiserver + help: Number of LIST requests served from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: dek_cache_fill_percent + subsystem: envelope_encryption + namespace: apiserver + help: Percent of the cache slots currently occupied by cached DEKs. + type: Gauge + stabilityLevel: ALPHA +- name: dek_cache_inter_arrival_time_seconds + subsystem: envelope_encryption + namespace: apiserver + help: Time (in seconds) of inter arrival of transformation requests. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - transformation_type + buckets: + - 60 + - 120 + - 240 + - 480 + - 960 + - 1920 + - 3840 + - 7680 + - 15360 + - 30720 +- name: current_executing_requests + subsystem: flowcontrol + namespace: apiserver + help: Number of requests in initial (for a WATCH) or any (for a non-WATCH) execution + stage in the API Priority and Fairness subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: current_inqueue_requests + subsystem: flowcontrol + namespace: apiserver + help: Number of requests currently pending in queues of the API Priority and Fairness + subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: current_r + subsystem: flowcontrol + namespace: apiserver + help: R(time of last change) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: dispatch_r + subsystem: flowcontrol + namespace: apiserver + help: R(time of last dispatch) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: dispatched_requests_total + subsystem: flowcontrol + namespace: apiserver + help: Number of requests executed by API Priority and Fairness subsystem + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: epoch_advance_total + subsystem: flowcontrol + namespace: apiserver + help: Number of times the queueset's progress meter jumped backward + type: Counter + stabilityLevel: ALPHA + labels: + - priority_level + - success +- name: latest_s + subsystem: flowcontrol + namespace: apiserver + help: S(most recently dispatched request) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: next_discounted_s_bounds + subsystem: flowcontrol + namespace: apiserver + help: min and max, over queues, of S(oldest waiting request in queue) - estimated + work in progress + type: Gauge + stabilityLevel: ALPHA + labels: + - bound + - priority_level +- name: 
next_s_bounds + subsystem: flowcontrol + namespace: apiserver + help: min and max, over queues, of S(oldest waiting request in queue) + type: Gauge + stabilityLevel: ALPHA + labels: + - bound + - priority_level +- name: priority_level_request_utilization + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of number of requests (as a + fraction of the relevant limit) waiting or in any stage of execution (but only + initial stage for WATCHes) + stabilityLevel: ALPHA + labels: + - phase + - priority_level + buckets: + - 0 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.25 + - 0.5 + - 0.75 + - 1 +- name: priority_level_seat_utilization + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of utilization of seats for + any stage of execution (but only initial stage for WATCHes) + stabilityLevel: ALPHA + labels: + - priority_level + buckets: + - 0 + - 0.1 + - 0.2 + - 0.3 + - 0.4 + - 0.5 + - 0.6 + - 0.7 + - 0.8 + - 0.9 + - 0.95 + - 0.99 + - 1 + constLabels: + phase: executing +- name: read_vs_write_current_requests + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of the number of requests (as + a fraction of the relevant limit) waiting or in regular stage of execution + stabilityLevel: ALPHA + labels: + - phase + - request_kind + buckets: + - 0 + - 0.001 + - 0.01 + - 0.1 + - 0.2 + - 0.3 + - 0.4 + - 0.5 + - 0.6 + - 0.7 + - 0.8 + - 0.9 + - 0.95 + - 0.99 + - 1 +- name: rejected_requests_total + subsystem: flowcontrol + namespace: apiserver + help: Number of requests rejected by API Priority and Fairness subsystem + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + - reason +- name: request_concurrency_in_use + subsystem: flowcontrol + namespace: apiserver + help: Concurrency (number of seats) occupied by the currently executing (initial + stage for a WATCH, any stage otherwise) requests 
in the API Priority and Fairness + subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: request_concurrency_limit + subsystem: flowcontrol + namespace: apiserver + help: Shared concurrency limit in the API Priority and Fairness subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: request_dispatch_no_accommodation_total + subsystem: flowcontrol + namespace: apiserver + help: Number of times a dispatch attempt resulted in a non accommodation due to + lack of available seats + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: request_execution_seconds + subsystem: flowcontrol + namespace: apiserver + help: Duration of initial stage (for a WATCH) or any (for a non-WATCH) stage of + request execution in the API Priority and Fairness subsystem + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + - type + buckets: + - 0 + - 0.005 + - 0.02 + - 0.05 + - 0.1 + - 0.2 + - 0.5 + - 1 + - 2 + - 5 + - 10 + - 30 +- name: request_queue_length_after_enqueue + subsystem: flowcontrol + namespace: apiserver + help: Length of queue in the API Priority and Fairness subsystem, as seen by each + request after it is enqueued + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + buckets: + - 0 + - 10 + - 25 + - 50 + - 100 + - 250 + - 500 + - 1000 +- name: request_wait_duration_seconds + subsystem: flowcontrol + namespace: apiserver + help: Length of time a request spent waiting in its queue + type: Histogram + stabilityLevel: ALPHA + labels: + - execute + - flow_schema + - priority_level + buckets: + - 0 + - 0.005 + - 0.02 + - 0.05 + - 0.1 + - 0.2 + - 0.5 + - 1 + - 2 + - 5 + - 10 + - 30 +- name: watch_count_samples + subsystem: flowcontrol + namespace: apiserver + help: count of watchers for mutating requests in API Priority and Fairness + type: Histogram + stabilityLevel: ALPHA + labels: + - 
flow_schema + - priority_level + buckets: + - 0 + - 1 + - 10 + - 100 + - 1000 + - 10000 +- name: work_estimated_seats + subsystem: flowcontrol + namespace: apiserver + help: Number of estimated seats (maximum of initial and final seats) associated + with requests in API Priority and Fairness + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + buckets: + - 1 + - 2 + - 4 + - 10 +- name: init_events_total + namespace: apiserver + help: Counter of init events processed in watch cache broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: data_key_generation_duration_seconds + subsystem: storage + namespace: apiserver + help: Latencies in seconds of data encryption key(DEK) generation operations. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 5e-06 + - 1e-05 + - 2e-05 + - 4e-05 + - 8e-05 + - 0.00016 + - 0.00032 + - 0.00064 + - 0.00128 + - 0.00256 + - 0.00512 + - 0.01024 + - 0.02048 + - 0.04096 +- name: data_key_generation_failures_total + subsystem: storage + namespace: apiserver + help: Total number of failed data encryption key(DEK) generation operations. + type: Counter + stabilityLevel: ALPHA +- name: envelope_transformation_cache_misses_total + subsystem: storage + namespace: apiserver + help: Total number of cache misses while accessing key decryption key(KEK). 
+ type: Counter + stabilityLevel: ALPHA +- name: apiserver_storage_list_evaluated_objects_total + help: Number of objects tested in the course of serving a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_fetched_objects_total + help: Number of objects read from storage in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_returned_objects_total + help: Number of objects returned for a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_total + help: Number of LIST requests served from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: transformation_duration_seconds + subsystem: storage + namespace: apiserver + help: Latencies in seconds of value transformation operations. + type: Histogram + stabilityLevel: ALPHA + labels: + - transformation_type + buckets: + - 5e-06 + - 1e-05 + - 2e-05 + - 4e-05 + - 8e-05 + - 0.00016 + - 0.00032 + - 0.00064 + - 0.00128 + - 0.00256 + - 0.00512 + - 0.01024 + - 0.02048 + - 0.04096 + - 0.08192 + - 0.16384 + - 0.32768 + - 0.65536 + - 1.31072 + - 2.62144 + - 5.24288 + - 10.48576 + - 20.97152 + - 41.94304 + - 83.88608 +- name: transformation_operations_total + subsystem: storage + namespace: apiserver + help: Total number of transformations. + type: Counter + stabilityLevel: ALPHA + labels: + - status + - transformation_type + - transformer_prefix +- name: terminated_watchers_total + namespace: apiserver + help: Counter of watchers closed due to unresponsiveness broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: events_dispatched_total + subsystem: watch_cache + namespace: apiserver + help: Counter of events dispatched in watch cache broken by resource type. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: initializations_total + subsystem: watch_cache + namespace: apiserver + help: Counter of watch cache initializations broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: x509_insecure_sha1_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers with insecure SHA1 signatures in + their serving certificate OR the number of connection failures due to the insecure + SHA1 signatures (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: x509_missing_san_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers missing SAN extension in their serving + certificate OR the number of connection failures due to the lack of x509 certificate + SAN extension missing (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: etcd_bookmark_counts + help: Number of etcd bookmarks (progress notify events) split by kind. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: etcd_db_total_size_in_bytes + help: Total size of the etcd database file physically allocated in bytes. + type: Gauge + stabilityLevel: ALPHA + labels: + - endpoint +- name: etcd_lease_object_counts + help: Number of objects attached to a single etcd lease. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 10 + - 50 + - 100 + - 500 + - 1000 + - 2500 + - 5000 +- name: etcd_request_duration_seconds + help: Etcd request latency in seconds for each operation and object type. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - operation + - type + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: capacity + subsystem: watch_cache + help: Total capacity of watch cache broken by resource type. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_decrease_total + subsystem: watch_cache + help: Total number of watch cache capacity decrease events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_increase_total + subsystem: watch_cache + help: Total number of watch cache capacity increase events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_objects + help: Number of stored objects at the time of last check split by kind. + type: Gauge + stabilityLevel: STABLE + labels: + - resource +- name: nodesync_latency_seconds + subsystem: service_controller + help: A metric measuring the latency for nodesync which updates loadbalancer hosts + on cluster node updates. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 +- name: update_loadbalancer_host_latency_seconds + subsystem: service_controller + help: A metric measuring the latency for updating each load balancer hosts. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 +- name: kubernetes_build_info + help: A metric with a constant '1' value labeled by major, minor, git version, git + commit, git tree state, build date, Go version, and compiler from which Kubernetes + was built, and platform on which it is running. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - build_date + - compiler + - git_commit + - git_tree_state + - git_version + - go_version + - major + - minor + - platform +- name: feature_enabled + namespace: kubernetes + help: This metric records the data about the stage and enablement of a k8s feature. + type: Gauge + stabilityLevel: ALPHA + labels: + - name + - stage +- name: healthcheck + namespace: kubernetes + help: This metric records the result of a single healthcheck. + type: Gauge + stabilityLevel: ALPHA + labels: + - name + - type +- name: healthchecks_total + namespace: kubernetes + help: This metric records the results of all healthcheck. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - status + - type +- name: leader_election_master_status + help: Gauge of if the reporting system is master of the relevant lease, 0 indicates + backup, 1 indicates master. 'name' is the string used to identify the lease. Please + make sure to group by name. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: rest_client_exec_plugin_call_total + help: Number of calls to an exec plugin, partitioned by the type of event encountered + (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) + and an optional exit code. The exit code will be set to 0 if and only if the plugin + call was successful. + type: Counter + stabilityLevel: ALPHA + labels: + - call_status + - code +- name: rest_client_exec_plugin_certificate_rotation_age + help: Histogram of the number of seconds the last auth exec plugin client certificate + lived before being rotated. If auth exec plugin client certificates are unused, + histogram will contain no data. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 600 + - 1800 + - 3600 + - 14400 + - 86400 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 + - 1.24416e+08 +- name: rest_client_exec_plugin_ttl_seconds + help: Gauge of the shortest TTL (time-to-live) of the client certificate(s) managed + by the auth exec plugin. The value is in seconds until certificate expiry (negative + if already expired). If auth exec plugins are unused or manage no TLS certificates, + the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: rest_client_rate_limiter_duration_seconds + help: Client side rate limiter latency in seconds. Broken down by verb, and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 + - 15 + - 30 + - 60 +- name: rest_client_request_duration_seconds + help: Request latency in seconds. Broken down by verb, and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 + - 15 + - 30 + - 60 +- name: rest_client_request_size_bytes + help: Request size in bytes. Broken down by verb and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 64 + - 256 + - 512 + - 1024 + - 4096 + - 16384 + - 65536 + - 262144 + - 1.048576e+06 + - 4.194304e+06 + - 1.6777216e+07 +- name: rest_client_requests_total + help: Number of HTTP requests, partitioned by status code, method, and host. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - host + - method +- name: rest_client_response_size_bytes + help: Response size in bytes. Broken down by verb and host. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 64 + - 256 + - 512 + - 1024 + - 4096 + - 16384 + - 65536 + - 262144 + - 1.048576e+06 + - 4.194304e+06 + - 1.6777216e+07 +- name: running_managed_controllers + help: Indicates where instances of a controller are currently running + type: Gauge + stabilityLevel: ALPHA + labels: + - manager + - name +- name: adds_total + subsystem: workqueue + help: Total number of adds handled by workqueue + type: Counter + stabilityLevel: ALPHA + labels: + - name +- name: depth + subsystem: workqueue + help: Current depth of workqueue + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: longest_running_processor_seconds + subsystem: workqueue + help: How many seconds has the longest running processor for workqueue been running. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: queue_duration_seconds + subsystem: workqueue + help: How long in seconds an item stays in workqueue before being requested. + type: Histogram + stabilityLevel: ALPHA + labels: + - name + buckets: + - 1e-08 + - 1e-07 + - 1e-06 + - 9.999999999999999e-06 + - 9.999999999999999e-05 + - 0.001 + - 0.01 + - 0.1 + - 1 + - 10 +- name: retries_total + subsystem: workqueue + help: Total number of retries handled by workqueue + type: Counter + stabilityLevel: ALPHA + labels: + - name +- name: unfinished_work_seconds + subsystem: workqueue + help: How many seconds of work has done that is in progress and hasn't been observed + by work_duration. Large values indicate stuck threads. One can deduce the number + of stuck threads by observing the rate at which this increases. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: work_duration_seconds + subsystem: workqueue + help: How long in seconds processing an item from workqueue takes. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - name + buckets: + - 1e-08 + - 1e-07 + - 1e-06 + - 9.999999999999999e-06 + - 9.999999999999999e-05 + - 0.001 + - 0.01 + - 0.1 + - 1 + - 10 +- name: x509_insecure_sha1_total + subsystem: kube_aggregator + namespace: apiserver + help: Counts the number of requests to servers with insecure SHA1 signatures in + their serving certificate OR the number of connection failures due to the insecure + SHA1 signatures (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: x509_missing_san_total + subsystem: kube_aggregator + namespace: apiserver + help: Counts the number of requests to servers missing SAN extension in their serving + certificate OR the number of connection failures due to the lack of x509 certificate + SAN extension missing (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: aggregator_openapi_v2_regeneration_count + help: Counter of OpenAPI v2 spec regeneration count broken down by causing APIService + name and reason. + type: Counter + stabilityLevel: ALPHA + labels: + - apiservice + - reason +- name: aggregator_openapi_v2_regeneration_duration + help: Gauge of OpenAPI v2 spec regeneration duration in seconds. + type: Gauge + stabilityLevel: ALPHA + labels: + - reason +- name: aggregator_unavailable_apiservice_total + help: Counter of APIServices which are marked as unavailable broken down by APIService + name and reason. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - name + - reason +- name: cloudprovider_aws_api_request_duration_seconds + help: Latency of AWS API calls + type: Histogram + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_aws_api_request_errors + help: AWS API errors + type: Counter + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_aws_api_throttled_requests_total + help: AWS API throttled requests + type: Counter + stabilityLevel: ALPHA + labels: + - operation_name +- name: api_request_duration_seconds + namespace: cloudprovider_azure + help: Latency of an Azure API call + type: Histogram + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 + - 1200 +- name: api_request_errors + namespace: cloudprovider_azure + help: Number of errors for an Azure API call + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: api_request_ratelimited_count + namespace: cloudprovider_azure + help: Number of rate limited Azure API calls + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: api_request_throttled_count + namespace: cloudprovider_azure + help: Number of throttled Azure API calls + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: op_duration_seconds + namespace: cloudprovider_azure + help: Latency of an Azure service operation + type: Histogram + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id + buckets: + - 0.1 + - 0.2 + - 0.5 + - 1 + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 100 + - 200 + - 300 +- name: op_failure_count + namespace: cloudprovider_azure + help: Number of failed Azure service operations + type: Counter + 
stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: cloudprovider_gce_api_request_duration_seconds + help: Latency of a GCE API call + type: Histogram + stabilityLevel: ALPHA + labels: + - region + - request + - version + - zone +- name: cloudprovider_gce_api_request_errors + help: Number of errors for an API call + type: Counter + stabilityLevel: ALPHA + labels: + - region + - request + - version + - zone +- name: cloudprovider_vsphere_api_request_duration_seconds + help: Latency of vsphere api call + type: Histogram + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_vsphere_api_request_errors + help: vsphere Api errors + type: Counter + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_vsphere_operation_duration_seconds + help: Latency of vsphere operation call + type: Histogram + stabilityLevel: ALPHA + labels: + - operation +- name: cloudprovider_vsphere_operation_errors + help: vsphere operation errors + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: get_token_count + help: Counter of total Token() requests to the alternate token source + type: Counter + stabilityLevel: ALPHA +- name: get_token_fail_count + help: Counter of failed Token() requests to the alternate token source + type: Counter + stabilityLevel: ALPHA +- name: number_of_l4_ilbs + help: Number of L4 ILBs + type: Gauge + stabilityLevel: ALPHA + labels: + - feature +- name: pod_security_errors_total + help: Number of errors preventing normal evaluation. Non-fatal errors may result + in the latest restricted profile being used for evaluation. + type: Counter + stabilityLevel: ALPHA + labels: + - fatal + - request_operation + - resource + - subresource +- name: pod_security_evaluations_total + help: Number of policy evaluations that occurred, not counting ignored or exempt + requests. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - decision + - mode + - policy_level + - policy_version + - request_operation + - resource + - subresource +- name: pod_security_exemptions_total + help: Number of exempt requests, not counting ignored or out of scope requests. + type: Counter + stabilityLevel: ALPHA + labels: + - request_operation + - resource + - subresource diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md new file mode 100644 index 00000000000..9a876e0cc1b --- /dev/null +++ b/test/instrumentation/testdata/documentation.md @@ -0,0 +1,325 @@ +--- +title: Kubernetes Metrics Across Components +content_type: instrumentation +--- + + +## Metrics + +These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). + +(auto-generated 2022 Oct 25) + +### List of Kubernetes Metrics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameStability LevelTypeHelpLabelsConst Labels
version_infoALPHAGaugeEtcd server's binary version
binary_version
None
certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone
endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None
cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone
changesALPHACounterNumber of EndpointSlice changes
operation
None
desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone
endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone
num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
syncsALPHACounterNumber of EndpointSlice syncs
result
None
addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone
changesALPHACounterNumber of EndpointSlice changes
operation
None
desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone
endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone
endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone
endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone
num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone
sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None
sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None
job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None
terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking The event label can be "add" or "delete".`
event
None
attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone
job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None
job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None
job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None
evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started. This metric is replaced by node_collector_evictions_total.
zone
None
unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None
zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None
zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None
cidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2. The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect on the sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone
job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone
evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None
create_failures_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone
create_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone
client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone
certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None
kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None
server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None
cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None
containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone
device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None
device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None
eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None
evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None
graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone
graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone
http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None
http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None
http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None
lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone
managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone
node_nameALPHAGaugeThe node's name. The count is always 1.
node
None
pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone
pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone
pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone
pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone
pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None
pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone
pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone
pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None
pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone
preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None
run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None
run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None
running_containersALPHAGaugeNumber of containers currently running
container_state
None
running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone
runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None
runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None
runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None
started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None
started_containers_totalALPHACounterCumulative number of containers started
container_type
None
started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None
started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None
started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone
started_pods_totalALPHACounterCumulative number of pods startedNoneNone
volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None
probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None
probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None
csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone
sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone
sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone
sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone
sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone
sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None
sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone
sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone
sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None
sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone
sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone
volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone
volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone
allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None
allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None
allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None
available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None
pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone
pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None
e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None
goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None
permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None
plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None
scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None
scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the "goroutines" metric.
work
None
scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone
unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None
binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None
scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None
legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone
stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone
valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone
framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None
pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None
pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone
pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None
preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone
preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone
queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None
schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None
scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None
operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None
storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None
volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None
graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None
apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None
apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None
apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None
step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None
webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None
webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None
error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None
event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone
level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None
requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone
apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
compilation_duration_secondsALPHAHistogramNoneNone
evaluation_duration_secondsALPHAHistogramNoneNone
certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone
current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None
apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None
dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None
request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None
request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None
request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None
request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None
request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None
request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None
selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None
tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone
watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None
watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None
authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None
authentication_attemptsALPHACounterCounter of authentication attempts.
result
None
authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None
active_fetch_countALPHAGauge
status
None
fetch_totalALPHACounter
status
None
request_duration_secondsALPHAHistogram
status
None
request_totalALPHACounter
status
None
field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None
current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None
longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None
request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None
request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None
requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None
response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None
cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None
cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None
dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone
dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None
current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None
current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None
current_rALPHAGaugeR(time of last change)
priority_level
None
dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None
dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None
epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None
latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None
next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None
next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None
priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None
priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing]
read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None
rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None
request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None
request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None
request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None
request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None
request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None
request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None
watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None
work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None
init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None
data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone
data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone
envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone
apiserver_storage_list_evaluated_objects_totalALPHACounterNumber of objects tested in the course of serving a LIST request from storage
resource
None
apiserver_storage_list_fetched_objects_totalALPHACounterNumber of objects read from storage in the course of serving a LIST request
resource
None
apiserver_storage_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from storage
resource
None
apiserver_storage_list_totalALPHACounterNumber of LIST requests served from storage
resource
None
transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None
transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None
terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None
events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None
initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None
x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension (either/or, based on the runtime environment)NoneNone
etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None
etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None
etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone
etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None
capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None
capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None
capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None
apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None
nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone
update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone
kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None
feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None
healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None
healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None
leader_election_master_statusALPHAGaugeGauge of whether the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None
rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None
rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone
rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone
rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None
rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None
rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None
running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None
adds_totalALPHACounterTotal number of adds handled by workqueue
name
None
depthALPHAGaugeCurrent depth of workqueue
name
None
longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None
queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None
retries_totalALPHACounterTotal number of retries handled by workqueue
name
None
unfinished_work_secondsALPHAGaugeHow many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None
work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None
x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension (either/or, based on the runtime environment)NoneNone
aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None
aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None
aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None
cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None
cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None
cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None
api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None
api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None
api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None
api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None
op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None
op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None
cloudprovider_gce_api_request_duration_secondsALPHAHistogramLatency of a GCE API call
region
request
version
zone
None
cloudprovider_gce_api_request_errorsALPHACounterNumber of errors for an API call
region
request
version
zone
None
cloudprovider_vsphere_api_request_duration_secondsALPHAHistogramLatency of vsphere api call
request
None
cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None
cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None
cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None
get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone
get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone
number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None
pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None
pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None
pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None
diff --git a/test/instrumentation/update-documentation-metrics.sh b/test/instrumentation/update-documentation-metrics.sh new file mode 100755 index 00000000000..4d29eced1fa --- /dev/null +++ b/test/instrumentation/update-documentation-metrics.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright 2022 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script updates the list of documented metrics, which must be +# regenerated before the documentation markdown file is updated. +# Usage: `test/instrumentation/update-documentation-metrics.sh`. + +set -o errexit +set -o nounset +set -o pipefail + +KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. +source "${KUBE_ROOT}/test/instrumentation/stability-utils.sh" + +kube::update::documentation::list diff --git a/test/instrumentation/update-documentation.sh b/test/instrumentation/update-documentation.sh new file mode 100755 index 00000000000..f56a47a287d --- /dev/null +++ b/test/instrumentation/update-documentation.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright 2022 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This script updates the documented list of metrics that is published +# on k8s/website. +# Usage: `test/instrumentation/update-documentation.sh`. + +set -o errexit +set -o nounset +set -o pipefail + +KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. +source "${KUBE_ROOT}/test/instrumentation/stability-utils.sh" + +kube::update::documentation From 8dffb571883238c1938ab5ce7179019c0f535d9c Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 25 Oct 2022 15:09:54 -0400 Subject: [PATCH 2/5] add documentation about auto-generating auto-documentation Change-Id: I6bfa6a7e8d83d211f4e52865e033b9911f607c58 --- test/instrumentation/README.md | 13 +++++++++++++ test/instrumentation/documentation/main.go | 12 ++++++------ test/instrumentation/testdata/documentation.md | 12 ++++++------ 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/test/instrumentation/README.md b/test/instrumentation/README.md index b9c41016bf7..80219cdba81 100644 --- a/test/instrumentation/README.md +++ b/test/instrumentation/README.md @@ -25,3 +25,16 @@ To update the golden test list, you can run: ```console ./test/instrumentation/test-update.sh ``` + +To update the list of documented metrics (run this first, before +updating the documentation markdown file), run: + +```console +./test/instrumentation/update-documentation-metrics.sh +``` + +To update the documented list of metrics for k8s/website, please run: + +```console +./test/instrumentation/update-documentation.sh +``` diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go index c7e40709a7c..5e1d02e423e 100755 --- a/test/instrumentation/documentation/main.go +++ b/test/instrumentation/documentation/main.go @@ -53,12 +53,12 @@ These are the metrics which are exported in Kubernetes components (i.e. 
kube-api - - - - - - + + + + + + diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md index 9a876e0cc1b..764a3cf39e7 100644 --- a/test/instrumentation/testdata/documentation.md +++ b/test/instrumentation/testdata/documentation.md @@ -15,12 +15,12 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api
NameStability LevelTypeHelpLabelsConst LabelsNameStability LevelTypeHelpLabelsConst Labels
- - - - - - + + + + + + From eb7253dfd5b85669233a16701238032f51ae4725 Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 25 Oct 2022 16:11:19 -0400 Subject: [PATCH 3/5] ignore golint error Change-Id: I696ab9ae8cb47407208e5a848b5ab852f1319037 --- test/instrumentation/documentation/main.go | 15 +- .../instrumentation/testdata/documentation.md | 480 +++++++++--------- 2 files changed, 248 insertions(+), 247 deletions(-) diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go index 5e1d02e423e..c0d5af8ecb3 100755 --- a/test/instrumentation/documentation/main.go +++ b/test/instrumentation/documentation/main.go @@ -54,15 +54,15 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api - - + + - - + + -{{range $index, $metric := .Metrics}}{{if not $metric.Labels }}{{else }}{{end}}{{if not $metric.ConstLabels }}{{else }}{{end}} +{{range $index, $metric := .Metrics}}{{if not $metric.Labels }}{{else }}{{end}}{{if not $metric.ConstLabels }}{{else }}{{end}} {{end}}
NameStability LevelTypeHelpLabelsConst LabelsNameStability LevelTypeHelpLabelsConst Labels
NameStability LevelTypeStability LevelType HelpLabelsConst LabelsLabelsConst Labels
{{$metric.Name}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
{{with $metric}}{{.BuildFQName}}{{end}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
@@ -89,7 +89,8 @@ func main() { } var tpl bytes.Buffer for i, m := range metrics { - m.Help = strings.Join(strings.Split(m.Help, "\n"), " ") + m.Help = strings.Join(strings.Split(m.Help, "\n"), ", ") + _ = m.BuildFQName() // ignore golint error metrics[i] = m } data := templateData{ @@ -124,6 +125,6 @@ type metric struct { ConstLabels map[string]string `yaml:"constLabels,omitempty" json:"constLabels,omitempty"` } -func (m metric) buildFQName() string { +func (m metric) BuildFQName() string { return metrics.BuildFQName(m.Namespace, m.Subsystem, m.Name) } diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md index 764a3cf39e7..9134078195b 100644 --- a/test/instrumentation/testdata/documentation.md +++ b/test/instrumentation/testdata/documentation.md @@ -16,124 +16,124 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api Name - Stability Level - Type + Stability Level + Type Help - Labels - Const Labels + Labels + Const Labels -version_infoALPHAGaugeEtcd server's binary version
binary_version
None -certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone -endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None -cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone -changesALPHACounterNumber of EndpointSlice changes
operation
None -desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone -endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone -num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -syncsALPHACounterNumber of EndpointSlice syncs
result
None -addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone -changesALPHACounterNumber of EndpointSlice changes
operation
None -desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone -endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone -endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone -endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone -num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone -sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None -sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None -job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None -terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking The event label can be "add" or "delete".`
event
None +etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None +kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone +endpoint_slice_controller_endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None +cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone +endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone +endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone +endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None +endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone +endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone +endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone +root_ca_cert_publisher_sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None +root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None +job_controller_job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None +job_controller_terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded), that have the finalizer batch.kubernetes.io/job-tracking, The event label can be "add" or "delete".`
event
None attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone -job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None -job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None -job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None -evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None -unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None -zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None -zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None -cidrset_allocation_tries_per_requestALPHAHistogramNumber of endpoints added on each Service sync
clusterCIDR
None -cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None -multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2.The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone -job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone -evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None -create_failures_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone -create_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone -client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone -certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone -certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone -kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None -kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None -server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None +job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None +job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None +node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None +node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None +node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None +node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None +node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramNumber of endpoints added on each Service sync
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None +node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None +replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2.The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone +ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone +node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None +ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone +ephemeral_volume_controller_create_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone +kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone +kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone +kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None +kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None +kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None -cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None -containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone -device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None -device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None -eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None -evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None -graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone -graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone -http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None -http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None -http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None -lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone -managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone -node_nameALPHAGaugeThe node's name. The count is always 1.
node
None -pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone -pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone -pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone -pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone -pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None -pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None -pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None -pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None -pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None -pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone -pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.NoneNone -pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None -pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone -preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None -run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None -run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None -running_containersALPHAGaugeNumber of containers currently running
container_state
None -running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone -runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None -runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None -runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None -started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None -started_containers_totalALPHACounterCumulative number of containers started
container_type
None -started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None -started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None -started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone -started_pods_totalALPHACounterCumulative number of pods startedNoneNone -volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None -probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None -probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None -csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone -sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone -sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone -sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone -sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone -sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None -sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone -sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone -sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None -sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone -sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone +kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None +kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone +kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None +kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None +kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None +kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None +kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone +kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone +kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None +kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None +kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None +kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone +kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone +kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None +kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone +kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone +kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone +kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone +kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone +kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone +kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None +kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone +kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None +kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None +kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None +kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None +kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone +kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None +kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None +kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None +kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None +kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None +kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None +kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None +kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone +kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone +kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None +prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None +prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None +apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None +apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None +kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone +kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone +kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None +kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone +kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone +kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None +kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone +kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone @@ -141,142 +141,142 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. 
They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone -allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None -allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None -allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None -available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None -pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone -pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None -e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None -goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None -permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None -plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None -scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None -scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the \"goroutines\" metric.
work
None -scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone -unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.
plugin
profile
None -binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None -scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None -legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone -stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone -valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone -framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None -pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None -pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone -pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None -preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone -preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone -queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None -schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None -scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None -operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None +kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None +kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None +kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None +kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None +kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone +kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None +scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None +scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None +scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None +scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None +scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None +scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the \"goroutines\" metric.
work
None +scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone +scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None +scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None +scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None +serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone +serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone +serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone +scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None +scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None +scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone +scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None +scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone +scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone +scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None +scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None +scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None +csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None -graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None +node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None -step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None -webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None -webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None -webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None -error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None -event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone -level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None -requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone +apiserver_admission_step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None +apiserver_admission_webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None +apiserver_admission_webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None +apiserver_admission_webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None +apiserver_audit_error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None +apiserver_audit_event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone +apiserver_audit_level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None +apiserver_audit_requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None -controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None -step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None -webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None -compilation_duration_secondsALPHAHistogramNoneNone -evaluation_duration_secondsALPHAHistogramNoneNone -certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone -current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None +apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None +apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_cel_compilation_duration_secondsALPHAHistogramNoneNone +apiserver_cel_evaluation_duration_secondsALPHAHistogramNoneNone +apiserver_client_certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone +apiserver_current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None -dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None -dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None -request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None -request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None -request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None -request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None -request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None -request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None -selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None -tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone -watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None -watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None +apiserver_egress_dialer_dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None +apiserver_egress_dialer_dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None +apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None +apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None +apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None +apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None +apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None +apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None +apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None +apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone +apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None +apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None -active_fetch_countALPHAGauge
status
None -fetch_totalALPHACounter
status
None -request_duration_secondsALPHAHistogram
status
None -request_totalALPHACounter
status
None +authentication_token_cache_active_fetch_countALPHAGauge
status
None +authentication_token_cache_fetch_totalALPHACounter
status
None +authentication_token_cache_request_duration_secondsALPHAHistogram
status
None +authentication_token_cache_request_totalALPHACounter
status
None field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None -current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None -longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None -request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None -request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None -requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None -response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None -cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None -cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None -dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone -dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None -current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None -current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None -current_rALPHAGaugeR(time of last change)
priority_level
None -dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None -dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None -epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None -latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None -next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None -next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None -priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None -priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing] -read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None -rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None -request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None -request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None -request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None -request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None -request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None -request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None -watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None -work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None -init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None -data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone -data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone -envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone +apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None +apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None +apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None +apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None +apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None +apiserver_envelope_encryption_dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone +apiserver_envelope_encryption_dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None +apiserver_flowcontrol_current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None +apiserver_flowcontrol_current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None +apiserver_flowcontrol_current_rALPHAGaugeR(time of last change)
priority_level
None +apiserver_flowcontrol_dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None +apiserver_flowcontrol_dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None +apiserver_flowcontrol_epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None +apiserver_flowcontrol_latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None +apiserver_flowcontrol_next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None +apiserver_flowcontrol_next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None +apiserver_flowcontrol_priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None +apiserver_flowcontrol_priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing] +apiserver_flowcontrol_read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None +apiserver_flowcontrol_rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None +apiserver_flowcontrol_request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None +apiserver_flowcontrol_request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None +apiserver_flowcontrol_request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None +apiserver_flowcontrol_request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None +apiserver_flowcontrol_request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None +apiserver_flowcontrol_request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None +apiserver_flowcontrol_watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None +apiserver_flowcontrol_work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None +apiserver_init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None +apiserver_storage_data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone +apiserver_storage_data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone +apiserver_storage_envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone apiserver_storage_list_evaluated_objects_totalALPHACounterNumber of objects tested in the course of serving a LIST request from storage
resource
None apiserver_storage_list_fetched_objects_totalALPHACounterNumber of objects read from storage in the course of serving a LIST request
resource
None apiserver_storage_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from storage
resource
None apiserver_storage_list_totalALPHACounterNumber of LIST requests served from storage
resource
None -transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None -transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None -terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None -events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None -initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None -x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone -x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone +apiserver_storage_transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None +apiserver_storage_transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None +apiserver_terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None +apiserver_watch_cache_events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None +apiserver_watch_cache_initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None +apiserver_webhooks_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone +apiserver_webhooks_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None -capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None -capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None -capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None +watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None +watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None +watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None -nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone -update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone +service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone +service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None -feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None -healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None -healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None +kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None +kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None +kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone @@ -287,27 +287,27 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None -adds_totalALPHACounterTotal number of adds handled by workqueue
name
None -depthALPHAGaugeCurrent depth of workqueue
name
None -longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None -queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None -retries_totalALPHACounterTotal number of retries handled by workqueue
name
None -unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None -work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None -x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone -x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone +workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None +workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None +workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None +workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None +workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None +workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None +workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None +apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone +apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None -api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None -api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None -api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None -api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None -op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None -op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None +cloudprovider_azure_api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None +cloudprovider_azure_api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None +cloudprovider_azure_api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None +cloudprovider_azure_api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None +cloudprovider_azure_op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None +cloudprovider_azure_op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None cloudprovider_gce_api_request_duration_secondsALPHAHistogramLatency of a GCE API call
region
request
version
zone
None cloudprovider_gce_api_request_errorsALPHACounterNumber of errors for an API call
region
request
version
zone
None cloudprovider_vsphere_api_request_duration_secondsALPHAHistogramLatency of vsphere api call
request
None From 1e99f54bb6605f2b258937b45f300a875360de11 Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 25 Oct 2022 16:32:16 -0400 Subject: [PATCH 4/5] adjust sizes and documentation Change-Id: Icd89c0c5bd6fbfb616255132db602b4db5ee2fe9 --- test/instrumentation/documentation/main.go | 23 +- test/instrumentation/main.go | 6 + test/instrumentation/stability-utils.sh | 2 +- .../testdata/documentation-list.yaml | 47 +- .../instrumentation/testdata/documentation.md | 444 +++++++++--------- 5 files changed, 274 insertions(+), 248 deletions(-) diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go index c0d5af8ecb3..62c05dab059 100755 --- a/test/instrumentation/documentation/main.go +++ b/test/instrumentation/documentation/main.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "os" + "sort" "strings" "text/template" "time" @@ -37,7 +38,7 @@ var ( const ( templ = `--- -title: Kubernetes Metrics Across Components +title: Kubernetes Metrics content_type: instrumentation --- @@ -55,8 +56,8 @@ These are the metrics which are exported in Kubernetes components (i.e. 
kube-api Name Stability Level - Type - Help + Type + Help Labels Const Labels @@ -82,6 +83,7 @@ func main() { if err != nil { println("err", err) } + sort.Sort(byFQName(metrics)) t := template.New("t") t, err := t.Parse(templ) if err != nil { @@ -128,3 +130,18 @@ type metric struct { func (m metric) BuildFQName() string { return metrics.BuildFQName(m.Namespace, m.Subsystem, m.Name) } + +type byFQName []metric + +func (ms byFQName) Len() int { return len(ms) } +func (ms byFQName) Less(i, j int) bool { + if ms[i].StabilityLevel < ms[j].StabilityLevel { + return true + } else if ms[i].StabilityLevel > ms[j].StabilityLevel { + return false + } + return ms[i].BuildFQName() < ms[j].BuildFQName() +} +func (ms byFQName) Swap(i, j int) { + ms[i], ms[j] = ms[j], ms[i] +} diff --git a/test/instrumentation/main.go b/test/instrumentation/main.go index af5465e76f8..451023cc21a 100644 --- a/test/instrumentation/main.go +++ b/test/instrumentation/main.go @@ -89,6 +89,12 @@ func main() { if len(stableMetrics) == 0 { os.Exit(0) } + for i, m := range stableMetrics { + if m.StabilityLevel == "" { + m.StabilityLevel = "ALPHA" + } + stableMetrics[i] = m + } sort.Sort(byFQName(stableMetrics)) data, err := yaml.Marshal(stableMetrics) if err != nil { diff --git a/test/instrumentation/stability-utils.sh b/test/instrumentation/stability-utils.sh index a751ff4834b..749d68e37cc 100755 --- a/test/instrumentation/stability-utils.sh +++ b/test/instrumentation/stability-utils.sh @@ -115,7 +115,7 @@ kube::update::documentation::list() { exit 1 fi mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation-list.yaml" - echo "${green}Updated golden list of stable metrics.${reset}" + echo "${green}Updated list of metrics for documentation ${reset}" } kube::update::documentation() { diff --git a/test/instrumentation/testdata/documentation-list.yaml b/test/instrumentation/testdata/documentation-list.yaml index 281bd57f74b..4b884cb9521 100644 --- 
a/test/instrumentation/testdata/documentation-list.yaml +++ b/test/instrumentation/testdata/documentation-list.yaml @@ -12,12 +12,6 @@ certificate is invalid or unused, the value will be +INF. type: Gauge stabilityLevel: ALPHA -- name: endpointslices_changed_per_sync - subsystem: endpoint_slice_controller - help: Number of EndpointSlices changed on each Service sync - type: Histogram - labels: - - topology - name: cronjob_job_creation_skew_duration_seconds subsystem: cronjob_controller help: Time between when a cronjob is scheduled to be run, and when the corresponding @@ -94,6 +88,13 @@ - 8192 - 16384 - 32768 +- name: endpointslices_changed_per_sync + subsystem: endpoint_slice_controller + help: Number of EndpointSlices changed on each Service sync + type: Histogram + stabilityLevel: ALPHA + labels: + - topology - name: num_endpoint_slices subsystem: endpoint_slice_controller help: Number of EndpointSlices @@ -269,22 +270,6 @@ stabilityLevel: ALPHA labels: - code -- name: job_pods_finished_total - subsystem: job_controller - help: The number of finished Pods that are fully tracked - type: Counter - labels: - - completion_mode - - result -- name: terminated_pods_tracking_finalizer_total - subsystem: job_controller - help: |- - `The number of terminated pods (phase=Failed|Succeeded) - that have the finalizer batch.kubernetes.io/job-tracking - The event label can be "add" or "delete".` - type: Counter - labels: - - event - name: attachdetach_controller_forced_detaches help: Number of times the A/D Controller performed a forced detach type: Counter @@ -297,6 +282,14 @@ labels: - completion_mode - result +- name: job_pods_finished_total + subsystem: job_controller + help: The number of finished Pods that are fully tracked + type: Counter + stabilityLevel: ALPHA + labels: + - completion_mode + - result - name: job_sync_duration_seconds subsystem: job_controller help: The time it took to sync a job @@ -331,6 +324,16 @@ - action - completion_mode - result +- name: 
terminated_pods_tracking_finalizer_total + subsystem: job_controller + help: |- + `The number of terminated pods (phase=Failed|Succeeded) + that have the finalizer batch.kubernetes.io/job-tracking + The event label can be "add" or "delete".` + type: Counter + stabilityLevel: ALPHA + labels: + - event - name: evictions_number subsystem: node_collector help: Number of Node evictions that happened since current instance of NodeController diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md index 9134078195b..df7b46f6524 100644 --- a/test/instrumentation/testdata/documentation.md +++ b/test/instrumentation/testdata/documentation.md @@ -1,5 +1,5 @@ --- -title: Kubernetes Metrics Across Components +title: Kubernetes Metrics content_type: instrumentation --- @@ -17,165 +17,18 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api Name Stability Level - Type - Help + Type + Help Labels Const Labels -etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None -kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone -endpoint_slice_controller_endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None -cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone -endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None -endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone -endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone -endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None -endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone -endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None -endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone -endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone -root_ca_cert_publisher_sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None -root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None -job_controller_job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None -job_controller_terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded), that have the finalizer batch.kubernetes.io/job-tracking, The event label can be "add" or "delete".`
event
None -attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone -job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None -job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None -job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None -node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None -node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None -node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None -node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None -node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramNumber of endpoints added on each Service sync
clusterCIDR
None -node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None -node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2.The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone -ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone -node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None -ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone -ephemeral_volume_controller_create_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone -kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone -kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone -kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone -kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None -kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None -kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone -volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None -kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None -kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone -kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None -kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None -kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None -kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None -kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone -kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone -kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None -kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None -kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None -kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone -kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone -kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None -kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone -kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone -kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone -kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone -kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone -kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone -kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None -kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone -kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None -kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None -kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None -kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None -kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone -kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None -kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None -kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None -kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None -kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None -kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None -kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None -kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone -kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone -kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None -prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None -prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None -apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone -kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone -kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone -kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone -kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone -kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None -kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone -kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone -kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None -kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone -kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone -volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone -volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone -volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone -volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. 
They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone -kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None -kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None -kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None -kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None -kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone -kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None -scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None -scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None -scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None -scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None -scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None -scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the "goroutines" metric.
work
None -scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone -scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None -scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None -scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None -serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone -serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone -serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone -scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None -scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None -scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone -scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None -scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone -scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone -scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None -scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None -scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None -csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None -storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None -volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None -node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None +aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None +aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None +aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None -apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None apiserver_admission_step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None apiserver_admission_webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None apiserver_admission_webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None @@ -184,47 +37,22 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_audit_event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone apiserver_audit_level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None apiserver_audit_requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone -apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None -apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None -apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None -apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None -apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None +apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None +apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None apiserver_cel_compilation_duration_secondsALPHAHistogramNoneNone apiserver_cel_evaluation_duration_secondsALPHAHistogramNoneNone +apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None +apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None apiserver_client_certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone +apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None apiserver_current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None +apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None +apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None apiserver_egress_dialer_dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None apiserver_egress_dialer_dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None -apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None -apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None -apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None -apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None -apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None -apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None -apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None -apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone -apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None -apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None -authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None -authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None -authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None -authentication_token_cache_active_fetch_countALPHAGauge
status
None -authentication_token_cache_fetch_totalALPHACounter
status
None -authentication_token_cache_request_duration_secondsALPHAHistogram
status
None -authentication_token_cache_request_totalALPHACounter
status
None -field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None -apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None -apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None -apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None -apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None -apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None -apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None -apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None -apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None apiserver_envelope_encryption_dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone apiserver_envelope_encryption_dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None apiserver_flowcontrol_current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None @@ -249,6 +77,16 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_flowcontrol_watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None apiserver_flowcontrol_work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None apiserver_init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None +apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone +apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone +apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None +apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None +apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None +apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None +apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None +apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None +apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None apiserver_storage_data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone apiserver_storage_data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone apiserver_storage_envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone @@ -259,46 +97,21 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_storage_transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None apiserver_storage_transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None apiserver_terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None +apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone apiserver_watch_cache_events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None apiserver_watch_cache_initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None +apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None +apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None apiserver_webhooks_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone apiserver_webhooks_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone -etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None -etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None -etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone -etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None -watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None -watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None -watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None -apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None -service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone -service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone -kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None -kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None -kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None -kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None -leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None -rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None -rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone -rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone -rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None -rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None -rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None -rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None -rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None -running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None -workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None -workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None -workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None -workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None -workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None -workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None -workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None -apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone -apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone -aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None -aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None -aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None +attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone +authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None +authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None +authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None +authentication_token_cache_active_fetch_countALPHAGauge
status
None +authentication_token_cache_fetch_totalALPHACounter
status
None +authentication_token_cache_request_duration_secondsALPHAHistogram
status
None +authentication_token_cache_request_totalALPHACounter
status
None cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None @@ -314,12 +127,199 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None +cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone +csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None +endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone +endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone +endpoint_slice_controller_endpointslices_changed_per_syncALPHAHistogramNumber of EndpointSlices changed on each Service sync
topology
None +endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None +endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone +endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone +endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone +ephemeral_volume_controller_create_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone +etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None +etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None +etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone +etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None +etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None +field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None +garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone +job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None +job_controller_job_pods_finished_totalALPHACounterThe number of finished Pods that are fully tracked
completion_mode
result
None +job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None +job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None +job_controller_terminated_pods_tracking_finalizer_totalALPHACounterThe number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking. The event label can be "add" or "delete".
event
None +kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None +kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None +kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None +kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None +kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone +kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None +kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone +kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone +kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone +kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None +kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone +kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None +kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None +kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None +kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None +kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone +kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone +kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None +kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None +kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None +kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None +kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None +kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone +kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone +kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None +kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone +kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone +kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone +kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone +kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone +kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone +kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None +kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone +kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None +kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None +kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None +kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None +kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone +kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None +kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None +kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None +kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None +kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None +kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None +kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None +kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone +kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone +kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None +kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone +kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone +kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None +kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone +kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone +kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None +kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone +kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone +kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None +kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None +kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None +kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None +leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None +node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None +node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None +node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None +node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None +node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None +node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None +node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None +prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None +prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None +replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2. The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect on the sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone +rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None +rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone +rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone +rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None +rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None +rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None +rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None +rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None +root_ca_cert_publisher_sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None +root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None +running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None +scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None +scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None +scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None +scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None +scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None +scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the "goroutines" metric.
work
None +scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone +scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.
plugin
profile
None +scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None +scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None +service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone +service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone +serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone +serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone +serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone +storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None +ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone +volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone +volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone +volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone +volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. 
They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone +volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None +volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None +watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None +watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None +watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None +workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None +workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None +workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None +workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None +workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None +workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None +workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None +apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None +apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None +apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None +apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None +node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None +scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None +scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None +scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone +scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None +scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone +scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone +scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None +scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None +scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None From fa2eb8e3ea8eea307660e3d7bb454b93ff5428c7 Mon Sep 17 00:00:00 2001 From: Han Kang Date: Tue, 25 Oct 2022 17:08:00 -0400 Subject: [PATCH 5/5] make changes suggested by sig docs Change-Id: I793eea636e7a83da7f87b23a3aa4e7ddfaf42ec6 --- test/instrumentation/documentation/main.go | 41 +- .../instrumentation/testdata/documentation.md | 2099 ++++++++++++++--- 2 files changed, 1815 insertions(+), 325 deletions(-) diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go index 62c05dab059..d52a3d4e0b7 100755 --- a/test/instrumentation/documentation/main.go +++ b/test/instrumentation/documentation/main.go @@ -34,37 +34,46 @@ var ( GOROOT string = os.Getenv("GOROOT") GOOS string = os.Getenv("GOOS") KUBE_ROOT string = os.Getenv("KUBE_ROOT") + funcMap = template.FuncMap{ + "ToLower": strings.ToLower, + } ) const ( templ = `--- -title: Kubernetes Metrics -content_type: instrumentation +title: Kubernetes Metrics Reference +content_type: reference +description: >- + Details of the metric data that Kubernetes components export. --- -## Metrics +## Metrics (auto-generated {{.GeneratedDate.Format "2006 Jan 02"}}) -These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). - -(auto-generated {{.GeneratedDate.Format "2006 Jan 02"}}) +This page details the metrics that different Kubernetes components export. You can query the metrics endpoint for these +components using an HTTP scrape, and fetch the current metrics data in Prometheus format. ### List of Kubernetes Metrics - +
- - - - - - + + + + + + -{{range $index, $metric := .Metrics}}{{if not $metric.Labels }}{{else }}{{end}}{{if not $metric.ConstLabels }}{{else }}{{end}} -{{end}} +{{range $index, $metric := .Metrics}} + + + + +{{if not $metric.Labels }}{{else }}{{end}} +{{if not $metric.ConstLabels }}{{else }}{{end}}{{end}}
NameStability LevelTypeHelpLabelsConst LabelsNameStability LevelTypeHelpLabelsConst Labels
{{with $metric}}{{.BuildFQName}}{{end}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
{{with $metric}}{{.BuildFQName}}{{end}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
` @@ -84,7 +93,7 @@ func main() { println("err", err) } sort.Sort(byFQName(metrics)) - t := template.New("t") + t := template.New("t").Funcs(funcMap) t, err := t.Parse(templ) if err != nil { println("err", err) diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md index df7b46f6524..c7085b7c24b 100644 --- a/test/instrumentation/testdata/documentation.md +++ b/test/instrumentation/testdata/documentation.md @@ -1,325 +1,1806 @@ --- -title: Kubernetes Metrics -content_type: instrumentation +title: Kubernetes Metrics Reference +content_type: reference +description: >- + Details of the metric data that Kubernetes components export. --- -## Metrics +## Metrics (auto-generated 2022 Oct 25) -These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). - -(auto-generated 2022 Oct 25) +This page details the metrics that different Kubernetes components export. You can query the metrics endpoint for these +components using an HTTP scrape, and fetch the current metrics data in Prometheus format. ### List of Kubernetes Metrics - +
- - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameStability LevelTypeHelpLabelsConst LabelsNameStability LevelTypeHelpLabelsConst Labels
aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None
aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None
aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None
apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None
apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None
apiserver_admission_step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
apiserver_admission_webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None
apiserver_admission_webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None
apiserver_admission_webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None
apiserver_audit_error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None
apiserver_audit_event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone
apiserver_audit_level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None
apiserver_audit_requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone
apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None
apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None
apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None
apiserver_cel_compilation_duration_secondsALPHAHistogramNoneNone
apiserver_cel_evaluation_duration_secondsALPHAHistogramNoneNone
apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
apiserver_client_certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone
apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None
apiserver_current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None
apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
apiserver_egress_dialer_dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None
apiserver_egress_dialer_dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None
apiserver_envelope_encryption_dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone
apiserver_envelope_encryption_dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None
apiserver_flowcontrol_current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_current_rALPHAGaugeR(time of last change)
priority_level
None
apiserver_flowcontrol_dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None
apiserver_flowcontrol_dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None
apiserver_flowcontrol_latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None
apiserver_flowcontrol_next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None
apiserver_flowcontrol_next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None
apiserver_flowcontrol_priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None
apiserver_flowcontrol_priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing]
apiserver_flowcontrol_read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None
apiserver_flowcontrol_rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None
apiserver_flowcontrol_request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None
apiserver_flowcontrol_request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None
apiserver_flowcontrol_request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None
apiserver_flowcontrol_request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None
apiserver_flowcontrol_request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None
apiserver_flowcontrol_watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None
apiserver_flowcontrol_work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None
apiserver_init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None
apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None
apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None
apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None
apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None
apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None
apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None
apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None
apiserver_storage_data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone
apiserver_storage_data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone
apiserver_storage_envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone
apiserver_storage_list_evaluated_objects_totalALPHACounterNumber of objects tested in the course of serving a LIST request from storage
resource
None
apiserver_storage_list_fetched_objects_totalALPHACounterNumber of objects read from storage in the course of serving a LIST request
resource
None
apiserver_storage_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from storage
resource
None
apiserver_storage_list_totalALPHACounterNumber of LIST requests served from storage
resource
None
apiserver_storage_transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None
apiserver_storage_transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None
apiserver_terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None
apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone
apiserver_watch_cache_events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None
apiserver_watch_cache_initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None
apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None
apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None
apiserver_webhooks_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
apiserver_webhooks_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone
authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None
authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None
authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None
authentication_token_cache_active_fetch_countALPHAGauge
status
None
authentication_token_cache_fetch_totalALPHACounter
status
None
authentication_token_cache_request_duration_secondsALPHAHistogram
status
None
authentication_token_cache_request_totalALPHACounter
status
None
cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None
cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None
cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None
cloudprovider_azure_api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None
cloudprovider_azure_op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None
cloudprovider_azure_op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None
cloudprovider_gce_api_request_duration_secondsALPHAHistogramLatency of a GCE API call
region
request
version
zone
None
cloudprovider_gce_api_request_errorsALPHACounterNumber of errors for an API call
region
request
version
zone
None
cloudprovider_vsphere_api_request_duration_secondsALPHAHistogramLatency of vsphere api call
request
None
cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None
cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None
cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None
cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone
csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None
endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None
endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone
endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone
endpoint_slice_controller_endpointslices_changed_per_syncALPHAHistogramNumber of EndpointSlices changed on each Service sync
topology
None
endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None
endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone
endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None
endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone
endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone
ephemeral_volume_controller_create_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone
etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None
etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None
etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone
etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None
etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None
field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None
garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone
get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone
get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone
job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None
job_controller_job_pods_finished_totalALPHACounterThe number of finished Pods that are fully tracked
completion_mode
result
None
job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None
job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None
job_controller_terminated_pods_tracking_finalizer_totalALPHACounter`The number of terminated pods (phase=Failed|Succeeded), that have the finalizer batch.kubernetes.io/job-tracking, The event label can be "add" or "delete".`
event
None
kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None
kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None
kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None
kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None
kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone
kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None
kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone
kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None
kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone
kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None
kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None
kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None
kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None
kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone
kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone
kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None
kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None
kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None
kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None
kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None
kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone
kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone
kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None
kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone
kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone
kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone
kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone
kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone
kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone
kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None
kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone
kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None
kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None
kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None
kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None
kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone
kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None
kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None
kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None
kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None
kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None
kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None
kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None
kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone
kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone
kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None
kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone
kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone
kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone
kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone
kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone
kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None
kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone
kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone
kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None
kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone
kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone
kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None
kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None
kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None
kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthchecks.
name
status
type
None
leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None
node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None
node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None
node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None
node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None
node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None
node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None
pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None
pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None
pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None
prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None
prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None
replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2. The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect on the sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone
rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None
rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone
rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone
rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None
rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None
rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None
root_ca_cert_publisher_sync_duration_secondsALPHAHistogramDuration in seconds of namespace syncs in root ca cert publisher.
code
None
root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None
running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None
scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None
scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None
scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None
scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None
scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None
scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the \"goroutines\" metric.
work
None
scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone
scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None
scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None
scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None
service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone
service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone
serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone
serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone
serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone
storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None
ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone
volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone
volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone
volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None
volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None
watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken down by resource type.
resource
None
watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken down by resource type.
resource
None
watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken down by resource type.
resource
None
workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None
workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None
workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None
workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None
workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None
workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None
workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None
apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None
apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None
apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None
apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None
apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None
apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None
node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None
scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None
scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None
scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone
scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None
scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone
scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone
scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None
scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None
scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None
aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None
aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None
aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None
apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None
apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None
apiserver_admission_step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
apiserver_admission_webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None
apiserver_admission_webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None
apiserver_admission_webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None
apiserver_audit_error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None
apiserver_audit_event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone
apiserver_audit_level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None
apiserver_audit_requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone
apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None
apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None
apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None
apiserver_cel_compilation_duration_secondsALPHAHistogramNoneNone
apiserver_cel_evaluation_duration_secondsALPHAHistogramNoneNone
apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
apiserver_client_certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone
apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None
apiserver_current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None
apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
apiserver_egress_dialer_dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None
apiserver_egress_dialer_dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None
apiserver_envelope_encryption_dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone
apiserver_envelope_encryption_dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None
apiserver_flowcontrol_current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_current_rALPHAGaugeR(time of last change)
priority_level
None
apiserver_flowcontrol_dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None
apiserver_flowcontrol_dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None
apiserver_flowcontrol_latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None
apiserver_flowcontrol_next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None
apiserver_flowcontrol_next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None
apiserver_flowcontrol_priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None
apiserver_flowcontrol_priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing]
apiserver_flowcontrol_read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None
apiserver_flowcontrol_rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None
apiserver_flowcontrol_request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None
apiserver_flowcontrol_request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None
apiserver_flowcontrol_request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None
apiserver_flowcontrol_request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None
apiserver_flowcontrol_request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None
apiserver_flowcontrol_request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None
apiserver_flowcontrol_watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None
apiserver_flowcontrol_work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None
apiserver_init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None
apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None
apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None
apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None
apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None
apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None
apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None
apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None
apiserver_storage_data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone
apiserver_storage_data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone
apiserver_storage_envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone
apiserver_storage_list_evaluated_objects_totalALPHACounterNumber of objects tested in the course of serving a LIST request from storage
resource
None
apiserver_storage_list_fetched_objects_totalALPHACounterNumber of objects read from storage in the course of serving a LIST request
resource
None
apiserver_storage_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from storage
resource
None
apiserver_storage_list_totalALPHACounterNumber of LIST requests served from storage
resource
None
apiserver_storage_transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None
apiserver_storage_transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None
apiserver_terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None
apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone
apiserver_watch_cache_events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None
apiserver_watch_cache_initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None
apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None
apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None
apiserver_webhooks_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
apiserver_webhooks_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone
authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None
authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None
authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None
authentication_token_cache_active_fetch_countALPHAGauge
status
None
authentication_token_cache_fetch_totalALPHACounter
status
None
authentication_token_cache_request_duration_secondsALPHAHistogram
status
None
authentication_token_cache_request_totalALPHACounter
status
None
cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None
cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None
cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None
cloudprovider_azure_api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None
cloudprovider_azure_api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None
cloudprovider_azure_op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None
cloudprovider_azure_op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None
cloudprovider_gce_api_request_duration_secondsALPHAHistogramLatency of a GCE API call
region
request
version
zone
None
cloudprovider_gce_api_request_errorsALPHACounterNumber of errors for an API call
region
request
version
zone
None
cloudprovider_vsphere_api_request_duration_secondsALPHAHistogramLatency of vsphere api call
request
None
cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None
cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None
cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None
cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone
csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None
endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None
endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone
endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone
endpoint_slice_controller_endpointslices_changed_per_syncALPHAHistogramNumber of EndpointSlices changed on each Service sync
topology
None
endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None
endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone
endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None
endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone
endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone
endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistentVolumeClaims creation requests that failedNoneNone
ephemeral_volume_controller_create_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone
etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None
etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None
etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone
etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None
etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None
field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None
garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone
get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone
get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone
job_controller_job_finished_totalALPHACounterThe number of finished jobs
completion_mode
result
None
job_controller_job_pods_finished_totalALPHACounterThe number of finished Pods that are fully tracked
completion_mode
result
None
job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None
job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None
job_controller_terminated_pods_tracking_finalizer_totalALPHACounterThe number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking. The event label can be "add" or "delete".
event
None
kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None
kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None
kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None
kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None
kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone
kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None
kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone
kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None
kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone
kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None
kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None
kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None
kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None
kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone
kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone
kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None
kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None
kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None
kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None
kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None
kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone
kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone
kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None
kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone
kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone
kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone
kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone
kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None
kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone
kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone
kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None
kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone
kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None
kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None
kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None
kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None
kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone
kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None
kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None
kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None
kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None
kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None
kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None
kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None
kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone
kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone
kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None
kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone
kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone
kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone
kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone
kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone
kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None
kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone
kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone
kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None
kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone
kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone
kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None
kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None
kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None
kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthchecks.
name
status
type
None
leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None
node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None
node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None
node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None
node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None
node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None
node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None
pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None
pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None
pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None
prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None
prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None
replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2. The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect on the sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone
rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None
rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone
rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone
rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None
rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None
rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None
root_ca_cert_publisher_sync_duration_secondsALPHAHistogramDuration in seconds of namespace syncs in root ca cert publisher.
code
None
root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None
running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None
scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None
scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None
scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None
scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None
scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None
scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the "goroutines" metric.
work
None
scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone
scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None
scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None
scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None
service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone
service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone
serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone
serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone
serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone
storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None
ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone
volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone
volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone
volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None
volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None
watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken down by resource type.
resource
None
watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken down by resource type.
resource
None
watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken down by resource type.
resource
None
workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None
workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None
workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None
workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None
workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None
workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None
workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None
apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in the last second.
request_kind
None
apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None
apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None
apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None
apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None
apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None
node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None
scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None
scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None
scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone
scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None
scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone
scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone
scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None
scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None
scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None