diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go new file mode 100755 index 00000000000..c7e40709a7c --- /dev/null +++ b/test/instrumentation/documentation/main.go @@ -0,0 +1,129 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "bytes" + "fmt" + "os" + "strings" + "text/template" + "time" + + "gopkg.in/yaml.v2" + + "k8s.io/component-base/metrics" +) + +var ( + GOROOT string = os.Getenv("GOROOT") + GOOS string = os.Getenv("GOOS") + KUBE_ROOT string = os.Getenv("KUBE_ROOT") +) + +const ( + templ = `--- +title: Kubernetes Metrics Across Components +content_type: instrumentation +--- + + +## Metrics + +These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). + +(auto-generated {{.GeneratedDate.Format "2006 Jan 02"}}) + +### List of Kubernetes Metrics + + + + + + + + + + + + + +{{range $index, $metric := .Metrics}}{{if not $metric.Labels }}{{else }}{{end}}{{if not $metric.ConstLabels }}{{else }}{{end}} +{{end}} + +
NameStability LevelTypeHelpLabelsConst Labels
{{$metric.Name}}{{$metric.StabilityLevel}}{{$metric.Type}}{{$metric.Help}}None{{range $label := $metric.Labels}}
{{$label}}
{{end}}
None{{$metric.ConstLabels}}
+` +) + +type templateData struct { + Metrics []metric + GeneratedDate time.Time +} + +func main() { + dat, err := os.ReadFile("test/instrumentation/testdata/documentation-list.yaml") + if err == nil { + metrics := []metric{} + err = yaml.Unmarshal(dat, &metrics) + if err != nil { + println("err", err) + } + t := template.New("t") + t, err := t.Parse(templ) + if err != nil { + println("err", err) + } + var tpl bytes.Buffer + for i, m := range metrics { + m.Help = strings.Join(strings.Split(m.Help, "\n"), " ") + metrics[i] = m + } + data := templateData{ + Metrics: metrics, + GeneratedDate: time.Now(), + } + err = t.Execute(&tpl, data) + if err != nil { + println("err", err) + } + fmt.Print(tpl.String()) + } else { + fmt.Fprintf(os.Stderr, "%s\n", err) + } + +} + +type metric struct { + Name string `yaml:"name" json:"name"` + Subsystem string `yaml:"subsystem,omitempty" json:"subsystem,omitempty"` + Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"` + Help string `yaml:"help,omitempty" json:"help,omitempty"` + Type string `yaml:"type,omitempty" json:"type,omitempty"` + DeprecatedVersion string `yaml:"deprecatedVersion,omitempty" json:"deprecatedVersion,omitempty"` + StabilityLevel string `yaml:"stabilityLevel,omitempty" json:"stabilityLevel,omitempty"` + Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"` + Buckets []float64 `yaml:"buckets,omitempty" json:"buckets,omitempty"` + Objectives map[float64]float64 `yaml:"objectives,omitempty" json:"objectives,omitempty"` + AgeBuckets uint32 `yaml:"ageBuckets,omitempty" json:"ageBuckets,omitempty"` + BufCap uint32 `yaml:"bufCap,omitempty" json:"bufCap,omitempty"` + MaxAge int64 `yaml:"maxAge,omitempty" json:"maxAge,omitempty"` + ConstLabels map[string]string `yaml:"constLabels,omitempty" json:"constLabels,omitempty"` +} + +func (m metric) buildFQName() string { + return metrics.BuildFQName(m.Namespace, m.Subsystem, m.Name) +} diff --git a/test/instrumentation/find_stable_metric.go 
b/test/instrumentation/find_stable_metric.go index 41d3b23a610..3542c1e9d50 100644 --- a/test/instrumentation/find_stable_metric.go +++ b/test/instrumentation/find_stable_metric.go @@ -52,6 +52,15 @@ type stableMetricFinder struct { var _ ast.Visitor = (*stableMetricFinder)(nil) +func contains(v metrics.StabilityLevel, a []metrics.StabilityLevel) bool { + for _, i := range a { + if i == v { + return true + } + } + return false +} + func (f *stableMetricFinder) Visit(node ast.Node) (w ast.Visitor) { switch opts := node.(type) { case *ast.CallExpr: @@ -76,15 +85,19 @@ func (f *stableMetricFinder) Visit(node ast.Node) (w ast.Visitor) { f.errors = append(f.errors, err) return nil } - switch *stabilityLevel { - case metrics.STABLE, metrics.BETA: + classes := []metrics.StabilityLevel{metrics.STABLE, metrics.BETA} + if ALL_STABILITY_CLASSES { + classes = append(classes, metrics.ALPHA) + } + switch { + case contains(*stabilityLevel, classes): if f.currentFunctionCall == nil { f.errors = append(f.errors, newDecodeErrorf(opts, errNotDirectCall)) return nil } f.stableMetricsFunctionCalls = append(f.stableMetricsFunctionCalls, f.currentFunctionCall) f.currentFunctionCall = nil - case metrics.INTERNAL, metrics.ALPHA: + default: return nil } default: diff --git a/test/instrumentation/main.go b/test/instrumentation/main.go index 3a5e9ffcacd..af5465e76f8 100644 --- a/test/instrumentation/main.go +++ b/test/instrumentation/main.go @@ -41,12 +41,15 @@ const ( var ( // env configs - GOROOT string = os.Getenv("GOROOT") - GOOS string = os.Getenv("GOOS") - KUBE_ROOT string = os.Getenv("KUBE_ROOT") + GOROOT string = os.Getenv("GOROOT") + GOOS string = os.Getenv("GOOS") + KUBE_ROOT string = os.Getenv("KUBE_ROOT") + ALL_STABILITY_CLASSES bool ) func main() { + + flag.BoolVar(&ALL_STABILITY_CLASSES, "allstabilityclasses", false, "use this flag to enable all stability classes") flag.Parse() if len(flag.Args()) < 1 { fmt.Fprintf(os.Stderr, "USAGE: %s [...]\n", os.Args[0]) diff --git 
a/test/instrumentation/metric.go b/test/instrumentation/metric.go index 9f2721442da..35abb759a2e 100644 --- a/test/instrumentation/metric.go +++ b/test/instrumentation/metric.go @@ -29,20 +29,20 @@ const ( ) type metric struct { - Name string `yaml:"name"` - Subsystem string `yaml:"subsystem,omitempty"` - Namespace string `yaml:"namespace,omitempty"` - Help string `yaml:"help,omitempty"` - Type string `yaml:"type,omitempty"` - DeprecatedVersion string `yaml:"deprecatedVersion,omitempty"` - StabilityLevel string `yaml:"stabilityLevel,omitempty"` - Labels []string `yaml:"labels,omitempty"` - Buckets []float64 `yaml:"buckets,omitempty"` - Objectives map[float64]float64 `yaml:"objectives,omitempty"` - AgeBuckets uint32 `yaml:"ageBuckets,omitempty"` - BufCap uint32 `yaml:"bufCap,omitempty"` - MaxAge int64 `yaml:"maxAge,omitempty"` - ConstLabels map[string]string `yaml:"constLabels,omitempty"` + Name string `yaml:"name" json:"name"` + Subsystem string `yaml:"subsystem,omitempty" json:"subsystem,omitempty"` + Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"` + Help string `yaml:"help,omitempty" json:"help,omitempty"` + Type string `yaml:"type,omitempty" json:"type,omitempty"` + DeprecatedVersion string `yaml:"deprecatedVersion,omitempty" json:"deprecatedVersion,omitempty"` + StabilityLevel string `yaml:"stabilityLevel,omitempty" json:"stabilityLevel,omitempty"` + Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"` + Buckets []float64 `yaml:"buckets,omitempty" json:"buckets,omitempty"` + Objectives map[float64]float64 `yaml:"objectives,omitempty" json:"objectives,omitempty"` + AgeBuckets uint32 `yaml:"ageBuckets,omitempty" json:"ageBuckets,omitempty"` + BufCap uint32 `yaml:"bufCap,omitempty" json:"bufCap,omitempty"` + MaxAge int64 `yaml:"maxAge,omitempty" json:"maxAge,omitempty"` + ConstLabels map[string]string `yaml:"constLabels,omitempty" json:"constLabels,omitempty"` } func (m metric) buildFQName() string { diff --git 
a/test/instrumentation/stability-utils.sh b/test/instrumentation/stability-utils.sh index 6ab14a54caf..a751ff4834b 100755 --- a/test/instrumentation/stability-utils.sh +++ b/test/instrumentation/stability-utils.sh @@ -105,6 +105,31 @@ kube::update::stablemetrics() { echo "${green}Updated golden list of stable metrics.${reset}" } +kube::update::documentation::list() { + stability_check_setup + temp_file=$(mktemp) + doCheckStability=$(find_files_to_check | grep -E ".*.go" | grep -v ".*_test.go" | sort | KUBE_ROOT=${KUBE_ROOT} xargs -L 200 go run "test/instrumentation/main.go" "test/instrumentation/decode_metric.go" "test/instrumentation/find_stable_metric.go" "test/instrumentation/error.go" "test/instrumentation/metric.go" --allstabilityclasses -- 1>"${temp_file}") + + if ! $doCheckStability; then + echo "${red}!!! updating golden list of metrics has failed! ${reset}" >&2 + exit 1 + fi + mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation-list.yaml" + echo "${green}Updated golden list of stable metrics.${reset}" +} + +kube::update::documentation() { + stability_check_setup + temp_file=$(mktemp) + doUpdateDocs=$(go run "test/instrumentation/documentation/main.go" -- 1>"${temp_file}") + if ! $doUpdateDocs; then + echo "${red}!!! updating documentation has failed! 
${reset}" >&2 + exit 1 + fi + mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation.md" + echo "${green}Updated documentation of metrics.${reset}" +} + kube::update::test::stablemetrics() { stability_check_setup temp_file=$(mktemp) diff --git a/test/instrumentation/testdata/documentation-list.yaml b/test/instrumentation/testdata/documentation-list.yaml new file mode 100644 index 00000000000..281bd57f74b --- /dev/null +++ b/test/instrumentation/testdata/documentation-list.yaml @@ -0,0 +1,3455 @@ +- name: version_info + namespace: etcd + help: Etcd server's binary version + type: Gauge + stabilityLevel: ALPHA + labels: + - binary_version +- name: certificate_manager_client_ttl_seconds + subsystem: kubelet + help: Gauge of the TTL (time-to-live) of the Kubelet's client certificate. The value + is in seconds until certificate expiry (negative if already expired). If client + certificate is invalid or unused, the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: endpointslices_changed_per_sync + subsystem: endpoint_slice_controller + help: Number of EndpointSlices changed on each Service sync + type: Histogram + labels: + - topology +- name: cronjob_job_creation_skew_duration_seconds + subsystem: cronjob_controller + help: Time between when a cronjob is scheduled to be run, and when the corresponding + job is created + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 +- name: changes + subsystem: endpoint_slice_controller + help: Number of EndpointSlice changes + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: desired_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints added on each Service sync + type: Histogram + 
stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: endpoint_slice_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints removed on each Service sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: num_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: ALPHA +- name: syncs + subsystem: endpoint_slice_controller + help: Number of EndpointSlice syncs + type: Counter + stabilityLevel: ALPHA + labels: + - result +- name: addresses_skipped_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of addresses skipped on each Endpoints sync due to being invalid or + exceeding MaxEndpointsPerSubset + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: changes + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlice changes + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: desired_endpoint_slices + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints added on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: 
endpoint_slice_mirroring_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints removed on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_sync_duration + subsystem: endpoint_slice_mirroring_controller + help: Duration of syncEndpoints() in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: endpoints_updated_per_sync + subsystem: endpoint_slice_mirroring_controller + help: Number of endpoints updated on each Endpoints sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: num_endpoint_slices + subsystem: endpoint_slice_mirroring_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: ALPHA +- name: resources_sync_error_total + subsystem: garbagecollector_controller + help: Number of garbage collector resources sync errors + type: Counter + stabilityLevel: ALPHA +- name: sync_duration_seconds + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: sync_total + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: job_pods_finished_total + subsystem: job_controller + help: The number of finished Pods that are fully tracked + type: Counter + labels: + - completion_mode + - result +- name: terminated_pods_tracking_finalizer_total + subsystem: job_controller + help: |- + `The number of terminated pods (phase=Failed|Succeeded) + that have the finalizer batch.kubernetes.io/job-tracking + The event label can be "add" or "delete".` + type: Counter + labels: + - event +- name: attachdetach_controller_forced_detaches + help: Number of times the A/D Controller performed a forced detach + type: Counter + stabilityLevel: ALPHA +- name: job_finished_total + subsystem: job_controller + help: The number of finished job + type: Counter + stabilityLevel: ALPHA + labels: + - completion_mode + - result +- name: job_sync_duration_seconds + subsystem: job_controller + help: The time it took to sync a job + type: Histogram + stabilityLevel: ALPHA + labels: + - action + - completion_mode + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: job_sync_total + subsystem: job_controller + help: The number of job syncs + type: Counter + stabilityLevel: ALPHA + labels: + - action + - completion_mode + - result +- name: evictions_number + subsystem: node_collector + help: Number of Node evictions that happened since current instance of NodeController + started, This metric is replaced by node_collector_evictions_total. + type: Counter + deprecatedVersion: 1.24.0 + stabilityLevel: ALPHA + labels: + - zone +- name: unhealthy_nodes_in_zone + subsystem: node_collector + help: Gauge measuring number of not Ready Nodes per zones. + type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: zone_health + subsystem: node_collector + help: Gauge measuring percentage of healthy nodes per zone. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: zone_size + subsystem: node_collector + help: Gauge measuring number of registered Nodes per zones. + type: Gauge + stabilityLevel: ALPHA + labels: + - zone +- name: cidrset_allocation_tries_per_request + subsystem: node_ipam_controller + help: Number of endpoints added on each Service sync + type: Histogram + stabilityLevel: ALPHA + labels: + - clusterCIDR + buckets: + - 1 + - 5 + - 25 + - 125 + - 625 +- name: cidrset_cidrs_allocations_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR allocations. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: cidrset_cidrs_releases_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR releases. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: cidrset_usage_cidrs + subsystem: node_ipam_controller + help: Gauge measuring percentage of allocated CIDRs. + type: Gauge + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_allocation_tries_per_request + subsystem: node_ipam_controller + help: Histogram measuring CIDR allocation tries per request. + type: Histogram + stabilityLevel: ALPHA + labels: + - clusterCIDR + buckets: + - 1 + - 5 + - 25 + - 125 + - 625 +- name: multicidrset_cidrs_allocations_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR allocations. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_cidrs_releases_total + subsystem: node_ipam_controller + help: Counter measuring total number of CIDR releases. + type: Counter + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: multicidrset_usage_cidrs + subsystem: node_ipam_controller + help: Gauge measuring percentage of allocated CIDRs. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - clusterCIDR +- name: sorting_deletion_age_ratio + subsystem: replicaset_controller + help: The ratio of chosen deleted pod's ages to the current youngest pod's age (at + the time). Should be <2.The intent of this metric is to measure the rough efficacy + of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) + of pods when a replicaset scales down. This only considers Ready pods when calculating + and reporting. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 +- name: job_deletion_duration_seconds + subsystem: ttl_after_finished_controller + help: The time it took to delete the job since it became eligible for deletion + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1.6 + - 3.2 + - 6.4 + - 12.8 + - 25.6 + - 51.2 + - 102.4 + - 204.8 + - 409.6 + - 819.2 +- name: evictions_total + subsystem: node_collector + help: Number of Node evictions that happened since current instance of NodeController + started. + type: Counter + stabilityLevel: STABLE + labels: + - zone +- name: create_failures_total + subsystem: ephemeral_volume_controller + help: Number of PersistenVolumeClaims creation requests + type: Counter + stabilityLevel: ALPHA +- name: create_total + subsystem: ephemeral_volume_controller + help: Number of PersistenVolumeClaims creation requests + type: Counter + stabilityLevel: ALPHA +- name: client_expiration_renew_errors + subsystem: certificate_manager + namespace: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA +- name: certificate_manager_server_rotation_seconds + subsystem: kubelet + help: Histogram of the number of seconds the previous certificate lived before being + rotated. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 60 + - 3600 + - 14400 + - 86400 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 + - 1.24416e+08 +- name: certificate_manager_server_ttl_seconds + subsystem: kubelet + help: Gauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. + The value is in seconds until certificate expiry (negative if already expired). + If serving certificate is invalid or unused, the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: kubelet_credential_provider_plugin_duration + subsystem: kubelet + help: Duration of execution in seconds for credential provider plugin + type: Histogram + stabilityLevel: ALPHA + labels: + - plugin_name + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: kubelet_credential_provider_plugin_errors + subsystem: kubelet + help: Number of errors from credential provider plugin + type: Counter + stabilityLevel: ALPHA + labels: + - plugin_name +- name: server_expiration_renew_errors + subsystem: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA +- name: volume_operation_total_errors + help: Total volume operation errors + type: Counter + stabilityLevel: ALPHA + labels: + - operation_name + - plugin_name +- name: cgroup_manager_duration_seconds + subsystem: kubelet + help: Duration in seconds for cgroup manager operations. Broken down by method. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: containers_per_pod_count + subsystem: kubelet + help: The number of containers per pod. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 +- name: device_plugin_alloc_duration_seconds + subsystem: kubelet + help: Duration in seconds to serve a device plugin Allocation request. 
Broken down + by resource name. + type: Histogram + stabilityLevel: ALPHA + labels: + - resource_name + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: device_plugin_registration_total + subsystem: kubelet + help: Cumulative number of device plugin registrations. Broken down by resource + name. + type: Counter + stabilityLevel: ALPHA + labels: + - resource_name +- name: eviction_stats_age_seconds + subsystem: kubelet + help: Time between when stats are collected, and when pod is evicted based on those + stats by eviction signal + type: Histogram + stabilityLevel: ALPHA + labels: + - eviction_signal + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: evictions + subsystem: kubelet + help: Cumulative number of pod evictions by eviction signal + type: Counter + stabilityLevel: ALPHA + labels: + - eviction_signal +- name: graceful_shutdown_end_time_seconds + subsystem: kubelet + help: Last graceful shutdown start time since unix epoch in seconds + type: Gauge + stabilityLevel: ALPHA +- name: graceful_shutdown_start_time_seconds + subsystem: kubelet + help: Last graceful shutdown start time since unix epoch in seconds + type: Gauge + stabilityLevel: ALPHA +- name: http_inflight_requests + subsystem: kubelet + help: Number of the inflight http requests + type: Gauge + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type +- name: http_requests_duration_seconds + subsystem: kubelet + help: Duration in seconds to serve http requests + type: Histogram + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: http_requests_total + subsystem: kubelet + help: Number of the http requests received since the server started + type: Counter + stabilityLevel: ALPHA + labels: + - long_running + - 
method + - path + - server_type +- name: lifecycle_handler_http_fallbacks_total + subsystem: kubelet + help: The number of times lifecycle handlers successfully fell back to http from + https. + type: Counter + stabilityLevel: ALPHA +- name: managed_ephemeral_containers + subsystem: kubelet + help: Current number of ephemeral containers in pods managed by this kubelet. Ephemeral + containers will be ignored if disabled by the EphemeralContainers feature gate, + and this number will be 0. + type: Gauge + stabilityLevel: ALPHA +- name: node_name + subsystem: kubelet + help: The node's name. The count is always 1. + type: Gauge + stabilityLevel: ALPHA + labels: + - node +- name: pleg_discard_events + subsystem: kubelet + help: The number of discard events in PLEG. + type: Counter + stabilityLevel: ALPHA +- name: pleg_last_seen_seconds + subsystem: kubelet + help: Timestamp in seconds when PLEG was last seen active. + type: Gauge + stabilityLevel: ALPHA +- name: pleg_relist_duration_seconds + subsystem: kubelet + help: Duration in seconds for relisting pods in PLEG. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pleg_relist_interval_seconds + subsystem: kubelet + help: Interval in seconds between relisting in PLEG. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_resources_endpoint_errors_get_allocatable + subsystem: kubelet + help: Number of requests to the PodResource GetAllocatableResources endpoint which + returned error. Broken down by server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_errors_list + subsystem: kubelet + help: Number of requests to the PodResource List endpoint which returned error. + Broken down by server api version. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_get_allocatable + subsystem: kubelet + help: Number of requests to the PodResource GetAllocatableResources endpoint. Broken + down by server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_list + subsystem: kubelet + help: Number of requests to the PodResource List endpoint. Broken down by server + api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_resources_endpoint_requests_total + subsystem: kubelet + help: Cumulative number of requests to the PodResource endpoint. Broken down by + server api version. + type: Counter + stabilityLevel: ALPHA + labels: + - server_api_version +- name: pod_start_duration_seconds + subsystem: kubelet + help: Duration in seconds from kubelet seeing a pod for the first time to the pod + starting to run + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_status_sync_duration_seconds + subsystem: kubelet + help: Duration in seconds to sync a pod status update. Measures time from detection + of a change to pod status until the API is successfully updated for that pod, + even if multiple intevening changes to pod status occur. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.01 + - 0.05 + - 0.1 + - 0.5 + - 1 + - 5 + - 10 + - 20 + - 30 + - 45 + - 60 +- name: pod_worker_duration_seconds + subsystem: kubelet + help: 'Duration in seconds to sync a single pod. 
Broken down by operation type: + create, update, or sync' + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: pod_worker_start_duration_seconds + subsystem: kubelet + help: Duration in seconds from kubelet seeing a pod to starting a worker. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: preemptions + subsystem: kubelet + help: Cumulative number of pod preemptions by preemption resource + type: Counter + stabilityLevel: ALPHA + labels: + - preemption_signal +- name: run_podsandbox_duration_seconds + subsystem: kubelet + help: Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler. + type: Histogram + stabilityLevel: ALPHA + labels: + - runtime_handler + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: run_podsandbox_errors_total + subsystem: kubelet + help: Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler. + type: Counter + stabilityLevel: ALPHA + labels: + - runtime_handler +- name: running_containers + subsystem: kubelet + help: Number of containers currently running + type: Gauge + stabilityLevel: ALPHA + labels: + - container_state +- name: running_pods + subsystem: kubelet + help: Number of pods that have a running pod sandbox + type: Gauge + stabilityLevel: ALPHA +- name: runtime_operations_duration_seconds + subsystem: kubelet + help: Duration in seconds of runtime operations. Broken down by operation type. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - operation_type + buckets: + - 0.005 + - 0.0125 + - 0.03125 + - 0.078125 + - 0.1953125 + - 0.48828125 + - 1.220703125 + - 3.0517578125 + - 7.62939453125 + - 19.073486328125 + - 47.6837158203125 + - 119.20928955078125 + - 298.0232238769531 + - 745.0580596923828 +- name: runtime_operations_errors_total + subsystem: kubelet + help: Cumulative number of runtime operation errors by operation type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation_type +- name: runtime_operations_total + subsystem: kubelet + help: Cumulative number of runtime operations by operation type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation_type +- name: started_containers_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting containers + type: Counter + stabilityLevel: ALPHA + labels: + - code + - container_type +- name: started_containers_total + subsystem: kubelet + help: Cumulative number of containers started + type: Counter + stabilityLevel: ALPHA + labels: + - container_type +- name: started_host_process_containers_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting hostprocess containers. This metric + will only be collected on Windows and requires WindowsHostProcessContainers feature + gate to be enabled. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - container_type +- name: started_host_process_containers_total + subsystem: kubelet + help: Cumulative number of hostprocess containers started. This metric will only + be collected on Windows and requires WindowsHostProcessContainers feature gate + to be enabled. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - container_type +- name: started_pods_errors_total + subsystem: kubelet + help: Cumulative number of errors when starting pods + type: Counter + stabilityLevel: ALPHA +- name: started_pods_total + subsystem: kubelet + help: Cumulative number of pods started + type: Counter + stabilityLevel: ALPHA +- name: volume_metric_collection_duration_seconds + subsystem: kubelet + help: Duration in seconds to calculate volume stats + type: Histogram + stabilityLevel: ALPHA + labels: + - metric_source + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: probe_duration_seconds + subsystem: prober + help: Duration in seconds for a probe response. + type: Histogram + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - probe_type +- name: probe_total + subsystem: prober + help: Cumulative number of a liveness, readiness or startup probe for a container + by result. + type: Counter + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - pod_uid + - probe_type + - result +- name: csr_honored_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration that was honored, sliced + by signer (only kubernetes.io signer names are specifically identified) + type: Counter + stabilityLevel: ALPHA + labels: + - signerName +- name: csr_requested_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration, sliced by signer (only + kubernetes.io signer names are specifically identified) + type: Counter + stabilityLevel: ALPHA + labels: + - signerName +- name: network_programming_duration_seconds + subsystem: kubeproxy + help: In Cluster Network Programming Latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.25 + - 0.5 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + 
- 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 + - 22 + - 23 + - 24 + - 25 + - 26 + - 27 + - 28 + - 29 + - 30 + - 31 + - 32 + - 33 + - 34 + - 35 + - 36 + - 37 + - 38 + - 39 + - 40 + - 41 + - 42 + - 43 + - 44 + - 45 + - 46 + - 47 + - 48 + - 49 + - 50 + - 51 + - 52 + - 53 + - 54 + - 55 + - 56 + - 57 + - 58 + - 59 + - 60 + - 65 + - 70 + - 75 + - 80 + - 85 + - 90 + - 95 + - 100 + - 105 + - 110 + - 115 + - 120 + - 150 + - 180 + - 210 + - 240 + - 270 + - 300 +- name: sync_proxy_rules_duration_seconds + subsystem: kubeproxy + help: SyncProxyRules latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: sync_proxy_rules_endpoint_changes_pending + subsystem: kubeproxy + help: Pending proxy rules Endpoint changes + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_endpoint_changes_total + subsystem: kubeproxy + help: Cumulative proxy rules Endpoint changes + type: Counter + stabilityLevel: ALPHA +- name: sync_proxy_rules_iptables_restore_failures_total + subsystem: kubeproxy + help: Cumulative proxy iptables restore failures + type: Counter + stabilityLevel: ALPHA +- name: sync_proxy_rules_iptables_total + subsystem: kubeproxy + help: Number of proxy iptables rules programmed + type: Gauge + stabilityLevel: ALPHA + labels: + - table +- name: sync_proxy_rules_last_queued_timestamp_seconds + subsystem: kubeproxy + help: The last time a sync of proxy rules was queued + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_last_timestamp_seconds + subsystem: kubeproxy + help: The last time proxy rules were successfully synced + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_no_local_endpoints_total + subsystem: kubeproxy + help: Number of services with a Local traffic policy and no endpoints + type: Gauge + stabilityLevel: ALPHA + labels: + - 
traffic_policy +- name: sync_proxy_rules_service_changes_pending + subsystem: kubeproxy + help: Pending proxy rules Service changes + type: Gauge + stabilityLevel: ALPHA +- name: sync_proxy_rules_service_changes_total + subsystem: kubeproxy + help: Cumulative proxy rules Service changes + type: Counter + stabilityLevel: ALPHA +- name: volume_manager_selinux_container_errors_total + help: Number of errors when kubelet cannot compute SELinux context for a container. + Kubelet can't start such a Pod then and it will retry, therefore value of this + metric may not represent the actual nr. of containers. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_container_warnings_total + help: Number of errors when kubelet cannot compute SELinux context for a container + that are ignored. They will become real errors when SELinuxMountReadWriteOncePod + feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_pod_context_mismatch_errors_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. Kubelet can't start such a Pod then and it will retry, + therefore value of this metric may not represent the actual nr. of Pods. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_pod_context_mismatch_warnings_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. They are not errors yet, but they will become real errors + when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volume_context_mismatch_errors_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. Kubelet can't start such a Pod then and it + will retry, therefore value of this metric may not represent the actual nr. 
of + Pods. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volume_context_mismatch_warnings_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. They are not errors yet, but they will become + real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume + access modes. + type: Gauge + stabilityLevel: ALPHA +- name: volume_manager_selinux_volumes_admitted_total + help: Number of volumes whose SELinux context was fine and will be mounted with + mount -o context option. + type: Gauge + stabilityLevel: ALPHA +- name: allocated_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of allocated IPs for Services + type: Gauge + stabilityLevel: ALPHA + labels: + - cidr +- name: allocation_errors_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of errors trying to allocate Cluster IPs + type: Counter + stabilityLevel: ALPHA + labels: + - cidr + - scope +- name: allocation_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of Cluster IPs allocations + type: Counter + stabilityLevel: ALPHA + labels: + - cidr + - scope +- name: available_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of available IPs for Services + type: Gauge + stabilityLevel: ALPHA + labels: + - cidr +- name: pods_logs_backend_tls_failure_total + subsystem: pod_logs + namespace: kube_apiserver + help: Total number of requests for pods/logs that failed due to kubelet server TLS + verification + type: Counter + stabilityLevel: ALPHA +- name: pods_logs_insecure_backend_total + subsystem: pod_logs + namespace: kube_apiserver + help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, + skip_tls_allowed, skip_tls_denied' + type: Counter + stabilityLevel: ALPHA + labels: + - usage +- name: 
e2e_scheduling_duration_seconds + subsystem: scheduler + help: E2e scheduling latency in seconds (scheduling algorithm + binding). This metric + is replaced by scheduling_attempt_duration_seconds. + type: Histogram + deprecatedVersion: 1.23.0 + stabilityLevel: ALPHA + labels: + - profile + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: goroutines + subsystem: scheduler + help: Number of running goroutines split by the work they do such as binding. + type: Gauge + stabilityLevel: ALPHA + labels: + - operation +- name: permit_wait_duration_seconds + subsystem: scheduler + help: Duration of waiting on permit. + type: Histogram + stabilityLevel: ALPHA + labels: + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: plugin_execution_duration_seconds + subsystem: scheduler + help: Duration for running a plugin at a specific extension point. + type: Histogram + stabilityLevel: ALPHA + labels: + - extension_point + - plugin + - status + buckets: + - 1e-05 + - 1.5000000000000002e-05 + - 2.2500000000000005e-05 + - 3.375000000000001e-05 + - 5.062500000000001e-05 + - 7.593750000000002e-05 + - 0.00011390625000000003 + - 0.00017085937500000006 + - 0.0002562890625000001 + - 0.00038443359375000017 + - 0.0005766503906250003 + - 0.0008649755859375004 + - 0.0012974633789062506 + - 0.0019461950683593758 + - 0.0029192926025390638 + - 0.004378938903808595 + - 0.006568408355712893 + - 0.009852612533569338 + - 0.014778918800354007 + - 0.02216837820053101 +- name: scheduler_cache_size + subsystem: scheduler + help: Number of nodes, pods, and assumed (bound) pods in the scheduler cache. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - type +- name: scheduler_goroutines + subsystem: scheduler + help: Number of running goroutines split by the work they do such as binding. This + metric is replaced by the \"goroutines\" metric. + type: Gauge + deprecatedVersion: 1.26.0 + stabilityLevel: ALPHA + labels: + - work +- name: scheduling_algorithm_duration_seconds + subsystem: scheduler + help: Scheduling algorithm latency in seconds + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: unschedulable_pods + subsystem: scheduler + help: The number of unschedulable pods broken down by plugin name. A pod will increment + the gauge for all plugins that caused it to not schedule and so this metric have + meaning only when broken down by plugin. + type: Gauge + stabilityLevel: ALPHA + labels: + - plugin + - profile +- name: binder_cache_requests_total + subsystem: scheduler_volume + help: Total number for request volume binding cache + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: scheduling_stage_error_total + subsystem: scheduler_volume + help: Volume scheduling stage error count + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: legacy_tokens_total + subsystem: serviceaccount + help: Cumulative legacy service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: stale_tokens_total + subsystem: serviceaccount + help: Cumulative stale projected service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: valid_tokens_total + subsystem: serviceaccount + help: Cumulative valid projected service account tokens used + type: Counter + stabilityLevel: ALPHA +- name: framework_extension_point_duration_seconds + subsystem: scheduler + help: Latency for running all plugins of a specific extension point. 
+ type: Histogram + stabilityLevel: STABLE + labels: + - extension_point + - profile + - status + buckets: + - 0.0001 + - 0.0002 + - 0.0004 + - 0.0008 + - 0.0016 + - 0.0032 + - 0.0064 + - 0.0128 + - 0.0256 + - 0.0512 + - 0.1024 + - 0.2048 +- name: pending_pods + subsystem: scheduler + help: Number of pending pods, by the queue type. 'active' means number of pods in + activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number + of pods in unschedulablePods. + type: Gauge + stabilityLevel: STABLE + labels: + - queue +- name: pod_scheduling_attempts + subsystem: scheduler + help: Number of attempts to successfully schedule a pod. + type: Histogram + stabilityLevel: STABLE + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 +- name: pod_scheduling_duration_seconds + subsystem: scheduler + help: E2e latency for a pod being scheduled which may include multiple scheduling + attempts. + type: Histogram + stabilityLevel: STABLE + labels: + - attempts + buckets: + - 0.01 + - 0.02 + - 0.04 + - 0.08 + - 0.16 + - 0.32 + - 0.64 + - 1.28 + - 2.56 + - 5.12 + - 10.24 + - 20.48 + - 40.96 + - 81.92 + - 163.84 + - 327.68 + - 655.36 + - 1310.72 + - 2621.44 + - 5242.88 +- name: preemption_attempts_total + subsystem: scheduler + help: Total preemption attempts in the cluster till now + type: Counter + stabilityLevel: STABLE +- name: preemption_victims + subsystem: scheduler + help: Number of selected preemption victims + type: Histogram + stabilityLevel: STABLE + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 +- name: queue_incoming_pods_total + subsystem: scheduler + help: Number of pods added to scheduling queues by event and queue type. + type: Counter + stabilityLevel: STABLE + labels: + - event + - queue +- name: schedule_attempts_total + subsystem: scheduler + help: Number of attempts to schedule pods, by the result. 'unschedulable' means + a pod could not be scheduled, while 'error' means an internal scheduler problem. 
+ type: Counter + stabilityLevel: STABLE + labels: + - profile + - result +- name: scheduling_attempt_duration_seconds + subsystem: scheduler + help: Scheduling attempt latency in seconds (scheduling algorithm + binding) + type: Histogram + stabilityLevel: STABLE + labels: + - profile + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: operations_seconds + subsystem: csi + help: Container Storage Interface operation duration with gRPC error code status + total + type: Histogram + stabilityLevel: ALPHA + labels: + - driver_name + - grpc_status_code + - method_name + - migrated + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: storage_operation_duration_seconds + help: Storage operation duration + type: Histogram + stabilityLevel: ALPHA + labels: + - migrated + - operation_name + - status + - volume_plugin + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: volume_operation_total_seconds + help: Storage operation end to end duration in seconds + type: Histogram + stabilityLevel: ALPHA + labels: + - operation_name + - plugin_name + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 +- name: graph_actions_duration_seconds + subsystem: node_authorizer + help: Histogram of duration of graph actions in node authorizer. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation + buckets: + - 0.0001 + - 0.0002 + - 0.0004 + - 0.0008 + - 0.0016 + - 0.0032 + - 0.0064 + - 0.0128 + - 0.0256 + - 0.0512 + - 0.1024 + - 0.2048 +- name: apiextensions_openapi_v2_regeneration_count + help: Counter of OpenAPI v2 spec regeneration count broken down by causing CRD name + and reason. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - crd + - reason +- name: apiextensions_openapi_v3_regeneration_count + help: Counter of OpenAPI v3 spec regeneration count broken down by group, version, + causing CRD and reason. + type: Counter + stabilityLevel: ALPHA + labels: + - crd + - group + - reason + - version +- name: apiserver_crd_webhook_conversion_duration_seconds + help: CRD webhook conversion duration in seconds + type: Histogram + stabilityLevel: ALPHA + labels: + - crd_name + - from_version + - succeeded + - to_version + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: step_admission_duration_seconds_summary + subsystem: admission + namespace: apiserver + help: Admission sub-step latency summary in seconds, broken out for each operation + and API resource and step type (validate or admit). + type: Summary + stabilityLevel: ALPHA + labels: + - operation + - rejected + - type + maxAge: 18000000000000 +- name: webhook_fail_open_count + subsystem: admission + namespace: apiserver + help: Admission webhook fail open count, identified by name and broken out for each + admission type (validating or mutating). + type: Counter + stabilityLevel: ALPHA + labels: + - name + - type +- name: webhook_rejection_count + subsystem: admission + namespace: apiserver + help: Admission webhook rejection count, identified by name and broken out for each + admission type (validating or admit) and operation. Additional labels specify + an error type (calling_webhook_error or apiserver_internal_error if an error occurred; + no_error otherwise) and optionally a non-zero rejection code if the webhook rejects + the request with an HTTP status code (honored by the apiserver when the code is + greater or equal to 400). Codes greater than 600 are truncated to 600, to keep + the metrics cardinality bounded. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - error_type + - name + - operation + - rejection_code + - type +- name: webhook_request_total + subsystem: admission + namespace: apiserver + help: Admission webhook request total, identified by name and broken out for each + admission type (validating or mutating) and operation. Additional labels specify + whether the request was rejected or not and an HTTP status code. Codes greater + than 600 are truncated to 600, to keep the metrics cardinality bounded. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - name + - operation + - rejected + - type +- name: error_total + subsystem: apiserver_audit + help: Counter of audit events that failed to be audited properly. Plugin identifies + the plugin affected by the error. + type: Counter + stabilityLevel: ALPHA + labels: + - plugin +- name: event_total + subsystem: apiserver_audit + help: Counter of audit events generated and sent to the audit backend. + type: Counter + stabilityLevel: ALPHA +- name: level_total + subsystem: apiserver_audit + help: Counter of policy levels for audit events (1 per request). + type: Counter + stabilityLevel: ALPHA + labels: + - level +- name: requests_rejected_total + subsystem: apiserver_audit + help: Counter of apiserver requests rejected due to an error in audit logging backend. + type: Counter + stabilityLevel: ALPHA +- name: apiserver_delegated_authn_request_duration_seconds + help: Request latency in seconds. Broken down by status code. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.25 + - 0.5 + - 0.7 + - 1 + - 1.5 + - 3 + - 5 + - 10 +- name: apiserver_delegated_authn_request_total + help: Number of HTTP requests partitioned by status code. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: controller_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission controller latency histogram in seconds, identified by name and + broken out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: step_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission sub-step latency histogram in seconds, broken out for each operation + and API resource and step type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: webhook_admission_duration_seconds + subsystem: admission + namespace: apiserver + help: Admission webhook latency histogram in seconds, identified by name and broken + out for each operation and API resource and type (validate or admit). + type: Histogram + stabilityLevel: STABLE + labels: + - name + - operation + - rejected + - type + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 +- name: compilation_duration_seconds + subsystem: cel + namespace: apiserver + type: Histogram + stabilityLevel: ALPHA +- name: evaluation_duration_seconds + subsystem: cel + namespace: apiserver + type: Histogram + stabilityLevel: ALPHA +- name: certificate_expiration_seconds + subsystem: client + namespace: apiserver + help: Distribution of the remaining lifetime on the certificate used to authenticate + a request. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 0 + - 1800 + - 3600 + - 7200 + - 21600 + - 43200 + - 86400 + - 172800 + - 345600 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 +- name: current_inqueue_requests + subsystem: apiserver + help: Maximal number of queued requests in this apiserver per request kind in last + second. + type: Gauge + stabilityLevel: ALPHA + labels: + - request_kind +- name: apiserver_delegated_authz_request_duration_seconds + help: Request latency in seconds. Broken down by status code. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.25 + - 0.5 + - 0.7 + - 1 + - 1.5 + - 3 + - 5 + - 10 +- name: apiserver_delegated_authz_request_total + help: Number of HTTP requests partitioned by status code. + type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: dial_duration_seconds + subsystem: egress_dialer + namespace: apiserver + help: Dial latency histogram in seconds, labeled by the protocol (http-connect or + grpc), transport (tcp or uds) + type: Histogram + stabilityLevel: ALPHA + labels: + - protocol + - transport + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 2.5 + - 12.5 +- name: dial_failure_count + subsystem: egress_dialer + namespace: apiserver + help: Dial failure count, labeled by the protocol (http-connect or grpc), transport + (tcp or uds), and stage (connect or proxy). The stage indicates at which stage + the dial failed + type: Counter + stabilityLevel: ALPHA + labels: + - protocol + - stage + - transport +- name: request_aborts_total + subsystem: apiserver + help: Number of requests which apiserver aborted possibly due to a timeout, for + each group, version, verb, resource, subresource and scope + type: Counter + stabilityLevel: ALPHA + labels: + - group + - resource + - scope + - subresource + - verb + - version +- name: request_body_sizes + subsystem: apiserver + help: Apiserver request body sizes broken out by size. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - resource + - verb + buckets: + - 50000 + - 150000 + - 250000 + - 350000 + - 450000 + - 550000 + - 650000 + - 750000 + - 850000 + - 950000 + - 1.05e+06 + - 1.15e+06 + - 1.25e+06 + - 1.35e+06 + - 1.45e+06 + - 1.55e+06 + - 1.65e+06 + - 1.75e+06 + - 1.85e+06 + - 1.95e+06 + - 2.05e+06 + - 2.15e+06 + - 2.25e+06 + - 2.35e+06 + - 2.45e+06 + - 2.55e+06 + - 2.65e+06 + - 2.75e+06 + - 2.85e+06 + - 2.95e+06 + - 3.05e+06 +- name: request_filter_duration_seconds + subsystem: apiserver + help: Request filter latency distribution in seconds, for each filter type + type: Histogram + stabilityLevel: ALPHA + labels: + - filter + buckets: + - 0.0001 + - 0.0003 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.3 + - 1 + - 5 +- name: request_post_timeout_total + subsystem: apiserver + help: Tracks the activity of the request handlers after the associated requests + have been timed out by the apiserver + type: Counter + stabilityLevel: ALPHA + labels: + - source + - status +- name: request_slo_duration_seconds + subsystem: apiserver + help: Response latency distribution (not counting webhook duration) in seconds for + each verb, group, version, resource, subresource, scope and component. + type: Histogram + stabilityLevel: ALPHA + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: request_terminations_total + subsystem: apiserver + help: Number of requests which apiserver terminated in self-defense. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - code + - component + - group + - resource + - scope + - subresource + - verb + - version +- name: request_timestamp_comparison_time + subsystem: apiserver + help: Time taken for comparison of old vs new objects in UPDATE or PATCH requests + type: Histogram + stabilityLevel: ALPHA + labels: + - code_path + buckets: + - 0.0001 + - 0.0003 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.3 + - 1 + - 5 +- name: selfrequest_total + subsystem: apiserver + help: Counter of apiserver self-requests broken out for each verb, API resource + and subresource. + type: Counter + stabilityLevel: ALPHA + labels: + - resource + - subresource + - verb +- name: tls_handshake_errors_total + subsystem: apiserver + help: Number of requests dropped with 'TLS handshake error from' error + type: Counter + stabilityLevel: ALPHA +- name: watch_events_sizes + subsystem: apiserver + help: Watch event size distribution in bytes + type: Histogram + stabilityLevel: ALPHA + labels: + - group + - kind + - version + buckets: + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 + - 65536 + - 131072 +- name: watch_events_total + subsystem: apiserver + help: Number of events sent in watch clients + type: Counter + stabilityLevel: ALPHA + labels: + - group + - kind + - version +- name: authenticated_user_requests + help: Counter of authenticated requests broken out by username. + type: Counter + stabilityLevel: ALPHA + labels: + - username +- name: authentication_attempts + help: Counter of authenticated attempts. + type: Counter + stabilityLevel: ALPHA + labels: + - result +- name: authentication_duration_seconds + help: Authentication duration in seconds broken out by result. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: active_fetch_count + subsystem: token_cache + namespace: authentication + type: Gauge + stabilityLevel: ALPHA + labels: + - status +- name: fetch_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status +- name: request_duration_seconds + subsystem: token_cache + namespace: authentication + type: Histogram + stabilityLevel: ALPHA + labels: + - status +- name: request_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status +- name: field_validation_request_duration_seconds + help: Response latency distribution in seconds for each field validation value and + whether field validation is enabled or not + type: Histogram + stabilityLevel: ALPHA + labels: + - enabled + - field_validation + buckets: + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: current_inflight_requests + subsystem: apiserver + help: Maximal number of currently used inflight request limit of this apiserver + per request kind in last second. + type: Gauge + stabilityLevel: STABLE + labels: + - request_kind +- name: longrunning_requests + subsystem: apiserver + help: Gauge of all active long-running apiserver requests broken out by verb, group, + version, resource, scope and component. Not all requests are tracked this way. 
+ type: Gauge + stabilityLevel: STABLE + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version +- name: request_duration_seconds + subsystem: apiserver + help: Response latency distribution in seconds for each verb, dry run value, group, + version, resource, subresource, scope and component. + type: Histogram + stabilityLevel: STABLE + labels: + - component + - dry_run + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: request_total + subsystem: apiserver + help: Counter of apiserver requests broken out for each verb, dry run value, group, + version, resource, scope, component, and HTTP response code. + type: Counter + stabilityLevel: STABLE + labels: + - code + - component + - dry_run + - group + - resource + - scope + - subresource + - verb + - version +- name: requested_deprecated_apis + subsystem: apiserver + help: Gauge of deprecated APIs that have been requested, broken out by API group, + version, resource, subresource, and removed_release. + type: Gauge + stabilityLevel: STABLE + labels: + - group + - removed_release + - resource + - subresource + - version +- name: response_sizes + subsystem: apiserver + help: Response size distribution in bytes for each group, version, verb, resource, + subresource, scope and component. 
+ type: Histogram + stabilityLevel: STABLE + labels: + - component + - group + - resource + - scope + - subresource + - verb + - version + buckets: + - 1000 + - 10000 + - 100000 + - 1e+06 + - 1e+07 + - 1e+08 + - 1e+09 +- name: cache_list_fetched_objects_total + namespace: apiserver + help: Number of objects read from watch cache in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: cache_list_returned_objects_total + namespace: apiserver + help: Number of objects returned for a LIST request from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - resource_prefix +- name: cache_list_total + namespace: apiserver + help: Number of LIST requests served from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: dek_cache_fill_percent + subsystem: envelope_encryption + namespace: apiserver + help: Percent of the cache slots currently occupied by cached DEKs. + type: Gauge + stabilityLevel: ALPHA +- name: dek_cache_inter_arrival_time_seconds + subsystem: envelope_encryption + namespace: apiserver + help: Time (in seconds) of inter arrival of transformation requests. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - transformation_type + buckets: + - 60 + - 120 + - 240 + - 480 + - 960 + - 1920 + - 3840 + - 7680 + - 15360 + - 30720 +- name: current_executing_requests + subsystem: flowcontrol + namespace: apiserver + help: Number of requests in initial (for a WATCH) or any (for a non-WATCH) execution + stage in the API Priority and Fairness subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: current_inqueue_requests + subsystem: flowcontrol + namespace: apiserver + help: Number of requests currently pending in queues of the API Priority and Fairness + subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: current_r + subsystem: flowcontrol + namespace: apiserver + help: R(time of last change) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: dispatch_r + subsystem: flowcontrol + namespace: apiserver + help: R(time of last dispatch) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: dispatched_requests_total + subsystem: flowcontrol + namespace: apiserver + help: Number of requests executed by API Priority and Fairness subsystem + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: epoch_advance_total + subsystem: flowcontrol + namespace: apiserver + help: Number of times the queueset's progress meter jumped backward + type: Counter + stabilityLevel: ALPHA + labels: + - priority_level + - success +- name: latest_s + subsystem: flowcontrol + namespace: apiserver + help: S(most recently dispatched request) + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: next_discounted_s_bounds + subsystem: flowcontrol + namespace: apiserver + help: min and max, over queues, of S(oldest waiting request in queue) - estimated + work in progress + type: Gauge + stabilityLevel: ALPHA + labels: + - bound + - priority_level +- name: 
next_s_bounds + subsystem: flowcontrol + namespace: apiserver + help: min and max, over queues, of S(oldest waiting request in queue) + type: Gauge + stabilityLevel: ALPHA + labels: + - bound + - priority_level +- name: priority_level_request_utilization + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of number of requests (as a + fraction of the relevant limit) waiting or in any stage of execution (but only + initial stage for WATCHes) + stabilityLevel: ALPHA + labels: + - phase + - priority_level + buckets: + - 0 + - 0.001 + - 0.003 + - 0.01 + - 0.03 + - 0.1 + - 0.25 + - 0.5 + - 0.75 + - 1 +- name: priority_level_seat_utilization + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of utilization of seats for + any stage of execution (but only initial stage for WATCHes) + stabilityLevel: ALPHA + labels: + - priority_level + buckets: + - 0 + - 0.1 + - 0.2 + - 0.3 + - 0.4 + - 0.5 + - 0.6 + - 0.7 + - 0.8 + - 0.9 + - 0.95 + - 0.99 + - 1 + constLabels: + phase: executing +- name: read_vs_write_current_requests + subsystem: flowcontrol + namespace: apiserver + help: Observations, at the end of every nanosecond, of the number of requests (as + a fraction of the relevant limit) waiting or in regular stage of execution + stabilityLevel: ALPHA + labels: + - phase + - request_kind + buckets: + - 0 + - 0.001 + - 0.01 + - 0.1 + - 0.2 + - 0.3 + - 0.4 + - 0.5 + - 0.6 + - 0.7 + - 0.8 + - 0.9 + - 0.95 + - 0.99 + - 1 +- name: rejected_requests_total + subsystem: flowcontrol + namespace: apiserver + help: Number of requests rejected by API Priority and Fairness subsystem + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + - reason +- name: request_concurrency_in_use + subsystem: flowcontrol + namespace: apiserver + help: Concurrency (number of seats) occupied by the currently executing (initial + stage for a WATCH, any stage otherwise) requests 
in the API Priority and Fairness + subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: request_concurrency_limit + subsystem: flowcontrol + namespace: apiserver + help: Shared concurrency limit in the API Priority and Fairness subsystem + type: Gauge + stabilityLevel: ALPHA + labels: + - priority_level +- name: request_dispatch_no_accommodation_total + subsystem: flowcontrol + namespace: apiserver + help: Number of times a dispatch attempt resulted in a non accommodation due to + lack of available seats + type: Counter + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level +- name: request_execution_seconds + subsystem: flowcontrol + namespace: apiserver + help: Duration of initial stage (for a WATCH) or any (for a non-WATCH) stage of + request execution in the API Priority and Fairness subsystem + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + - type + buckets: + - 0 + - 0.005 + - 0.02 + - 0.05 + - 0.1 + - 0.2 + - 0.5 + - 1 + - 2 + - 5 + - 10 + - 30 +- name: request_queue_length_after_enqueue + subsystem: flowcontrol + namespace: apiserver + help: Length of queue in the API Priority and Fairness subsystem, as seen by each + request after it is enqueued + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + buckets: + - 0 + - 10 + - 25 + - 50 + - 100 + - 250 + - 500 + - 1000 +- name: request_wait_duration_seconds + subsystem: flowcontrol + namespace: apiserver + help: Length of time a request spent waiting in its queue + type: Histogram + stabilityLevel: ALPHA + labels: + - execute + - flow_schema + - priority_level + buckets: + - 0 + - 0.005 + - 0.02 + - 0.05 + - 0.1 + - 0.2 + - 0.5 + - 1 + - 2 + - 5 + - 10 + - 30 +- name: watch_count_samples + subsystem: flowcontrol + namespace: apiserver + help: count of watchers for mutating requests in API Priority and Fairness + type: Histogram + stabilityLevel: ALPHA + labels: + - 
flow_schema + - priority_level + buckets: + - 0 + - 1 + - 10 + - 100 + - 1000 + - 10000 +- name: work_estimated_seats + subsystem: flowcontrol + namespace: apiserver + help: Number of estimated seats (maximum of initial and final seats) associated + with requests in API Priority and Fairness + type: Histogram + stabilityLevel: ALPHA + labels: + - flow_schema + - priority_level + buckets: + - 1 + - 2 + - 4 + - 10 +- name: init_events_total + namespace: apiserver + help: Counter of init events processed in watch cache broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: data_key_generation_duration_seconds + subsystem: storage + namespace: apiserver + help: Latencies in seconds of data encryption key(DEK) generation operations. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 5e-06 + - 1e-05 + - 2e-05 + - 4e-05 + - 8e-05 + - 0.00016 + - 0.00032 + - 0.00064 + - 0.00128 + - 0.00256 + - 0.00512 + - 0.01024 + - 0.02048 + - 0.04096 +- name: data_key_generation_failures_total + subsystem: storage + namespace: apiserver + help: Total number of failed data encryption key(DEK) generation operations. + type: Counter + stabilityLevel: ALPHA +- name: envelope_transformation_cache_misses_total + subsystem: storage + namespace: apiserver + help: Total number of cache misses while accessing key decryption key(KEK). 
+ type: Counter + stabilityLevel: ALPHA +- name: apiserver_storage_list_evaluated_objects_total + help: Number of objects tested in the course of serving a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_fetched_objects_total + help: Number of objects read from storage in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_returned_objects_total + help: Number of objects returned for a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_total + help: Number of LIST requests served from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: transformation_duration_seconds + subsystem: storage + namespace: apiserver + help: Latencies in seconds of value transformation operations. + type: Histogram + stabilityLevel: ALPHA + labels: + - transformation_type + buckets: + - 5e-06 + - 1e-05 + - 2e-05 + - 4e-05 + - 8e-05 + - 0.00016 + - 0.00032 + - 0.00064 + - 0.00128 + - 0.00256 + - 0.00512 + - 0.01024 + - 0.02048 + - 0.04096 + - 0.08192 + - 0.16384 + - 0.32768 + - 0.65536 + - 1.31072 + - 2.62144 + - 5.24288 + - 10.48576 + - 20.97152 + - 41.94304 + - 83.88608 +- name: transformation_operations_total + subsystem: storage + namespace: apiserver + help: Total number of transformations. + type: Counter + stabilityLevel: ALPHA + labels: + - status + - transformation_type + - transformer_prefix +- name: terminated_watchers_total + namespace: apiserver + help: Counter of watchers closed due to unresponsiveness broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: events_dispatched_total + subsystem: watch_cache + namespace: apiserver + help: Counter of events dispatched in watch cache broken by resource type. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: initializations_total + subsystem: watch_cache + namespace: apiserver + help: Counter of watch cache initializations broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: x509_insecure_sha1_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers with insecure SHA1 signatures in + their serving certificate OR the number of connection failures due to the insecure + SHA1 signatures (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: x509_missing_san_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers missing SAN extension in their serving + certificate OR the number of connection failures due to the lack of x509 certificate + SAN extension missing (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: etcd_bookmark_counts + help: Number of etcd bookmarks (progress notify events) split by kind. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: etcd_db_total_size_in_bytes + help: Total size of the etcd database file physically allocated in bytes. + type: Gauge + stabilityLevel: ALPHA + labels: + - endpoint +- name: etcd_lease_object_counts + help: Number of objects attached to a single etcd lease. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 10 + - 50 + - 100 + - 500 + - 1000 + - 2500 + - 5000 +- name: etcd_request_duration_seconds + help: Etcd request latency in seconds for each operation and object type. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - operation + - type + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: capacity + subsystem: watch_cache + help: Total capacity of watch cache broken by resource type. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_decrease_total + subsystem: watch_cache + help: Total number of watch cache capacity decrease events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_increase_total + subsystem: watch_cache + help: Total number of watch cache capacity increase events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_objects + help: Number of stored objects at the time of last check split by kind. + type: Gauge + stabilityLevel: STABLE + labels: + - resource +- name: nodesync_latency_seconds + subsystem: service_controller + help: A metric measuring the latency for nodesync which updates loadbalancer hosts + on cluster node updates. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 +- name: update_loadbalancer_host_latency_seconds + subsystem: service_controller + help: A metric measuring the latency for updating each load balancer hosts. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 +- name: kubernetes_build_info + help: A metric with a constant '1' value labeled by major, minor, git version, git + commit, git tree state, build date, Go version, and compiler from which Kubernetes + was built, and platform on which it is running. 
+ type: Gauge + stabilityLevel: ALPHA + labels: + - build_date + - compiler + - git_commit + - git_tree_state + - git_version + - go_version + - major + - minor + - platform +- name: feature_enabled + namespace: kubernetes + help: This metric records the data about the stage and enablement of a k8s feature. + type: Gauge + stabilityLevel: ALPHA + labels: + - name + - stage +- name: healthcheck + namespace: kubernetes + help: This metric records the result of a single healthcheck. + type: Gauge + stabilityLevel: ALPHA + labels: + - name + - type +- name: healthchecks_total + namespace: kubernetes + help: This metric records the results of all healthcheck. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - status + - type +- name: leader_election_master_status + help: Gauge of if the reporting system is master of the relevant lease, 0 indicates + backup, 1 indicates master. 'name' is the string used to identify the lease. Please + make sure to group by name. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: rest_client_exec_plugin_call_total + help: Number of calls to an exec plugin, partitioned by the type of event encountered + (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) + and an optional exit code. The exit code will be set to 0 if and only if the plugin + call was successful. + type: Counter + stabilityLevel: ALPHA + labels: + - call_status + - code +- name: rest_client_exec_plugin_certificate_rotation_age + help: Histogram of the number of seconds the last auth exec plugin client certificate + lived before being rotated. If auth exec plugin client certificates are unused, + histogram will contain no data. 
+ type: Histogram + stabilityLevel: ALPHA + buckets: + - 600 + - 1800 + - 3600 + - 14400 + - 86400 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 + - 1.24416e+08 +- name: rest_client_exec_plugin_ttl_seconds + help: Gauge of the shortest TTL (time-to-live) of the client certificate(s) managed + by the auth exec plugin. The value is in seconds until certificate expiry (negative + if already expired). If auth exec plugins are unused or manage no TLS certificates, + the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: rest_client_rate_limiter_duration_seconds + help: Client side rate limiter latency in seconds. Broken down by verb, and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 + - 15 + - 30 + - 60 +- name: rest_client_request_duration_seconds + help: Request latency in seconds. Broken down by verb, and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2 + - 4 + - 8 + - 15 + - 30 + - 60 +- name: rest_client_request_size_bytes + help: Request size in bytes. Broken down by verb and host. + type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 64 + - 256 + - 512 + - 1024 + - 4096 + - 16384 + - 65536 + - 262144 + - 1.048576e+06 + - 4.194304e+06 + - 1.6777216e+07 +- name: rest_client_requests_total + help: Number of HTTP requests, partitioned by status code, method, and host. + type: Counter + stabilityLevel: ALPHA + labels: + - code + - host + - method +- name: rest_client_response_size_bytes + help: Response size in bytes. Broken down by verb and host. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - host + - verb + buckets: + - 64 + - 256 + - 512 + - 1024 + - 4096 + - 16384 + - 65536 + - 262144 + - 1.048576e+06 + - 4.194304e+06 + - 1.6777216e+07 +- name: running_managed_controllers + help: Indicates where instances of a controller are currently running + type: Gauge + stabilityLevel: ALPHA + labels: + - manager + - name +- name: adds_total + subsystem: workqueue + help: Total number of adds handled by workqueue + type: Counter + stabilityLevel: ALPHA + labels: + - name +- name: depth + subsystem: workqueue + help: Current depth of workqueue + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: longest_running_processor_seconds + subsystem: workqueue + help: How many seconds has the longest running processor for workqueue been running. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: queue_duration_seconds + subsystem: workqueue + help: How long in seconds an item stays in workqueue before being requested. + type: Histogram + stabilityLevel: ALPHA + labels: + - name + buckets: + - 1e-08 + - 1e-07 + - 1e-06 + - 9.999999999999999e-06 + - 9.999999999999999e-05 + - 0.001 + - 0.01 + - 0.1 + - 1 + - 10 +- name: retries_total + subsystem: workqueue + help: Total number of retries handled by workqueue + type: Counter + stabilityLevel: ALPHA + labels: + - name +- name: unfinished_work_seconds + subsystem: workqueue + help: How many seconds of work has done that is in progress and hasn't been observed + by work_duration. Large values indicate stuck threads. One can deduce the number + of stuck threads by observing the rate at which this increases. + type: Gauge + stabilityLevel: ALPHA + labels: + - name +- name: work_duration_seconds + subsystem: workqueue + help: How long in seconds processing an item from workqueue takes. 
+ type: Histogram + stabilityLevel: ALPHA + labels: + - name + buckets: + - 1e-08 + - 1e-07 + - 1e-06 + - 9.999999999999999e-06 + - 9.999999999999999e-05 + - 0.001 + - 0.01 + - 0.1 + - 1 + - 10 +- name: x509_insecure_sha1_total + subsystem: kube_aggregator + namespace: apiserver + help: Counts the number of requests to servers with insecure SHA1 signatures in + their serving certificate OR the number of connection failures due to the insecure + SHA1 signatures (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: x509_missing_san_total + subsystem: kube_aggregator + namespace: apiserver + help: Counts the number of requests to servers missing SAN extension in their serving + certificate OR the number of connection failures due to the lack of x509 certificate + SAN extension missing (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: aggregator_openapi_v2_regeneration_count + help: Counter of OpenAPI v2 spec regeneration count broken down by causing APIService + name and reason. + type: Counter + stabilityLevel: ALPHA + labels: + - apiservice + - reason +- name: aggregator_openapi_v2_regeneration_duration + help: Gauge of OpenAPI v2 spec regeneration duration in seconds. + type: Gauge + stabilityLevel: ALPHA + labels: + - reason +- name: aggregator_unavailable_apiservice_total + help: Counter of APIServices which are marked as unavailable broken down by APIService + name and reason. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - name + - reason +- name: cloudprovider_aws_api_request_duration_seconds + help: Latency of AWS API calls + type: Histogram + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_aws_api_request_errors + help: AWS API errors + type: Counter + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_aws_api_throttled_requests_total + help: AWS API throttled requests + type: Counter + stabilityLevel: ALPHA + labels: + - operation_name +- name: api_request_duration_seconds + namespace: cloudprovider_azure + help: Latency of an Azure API call + type: Histogram + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id + buckets: + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 + - 15 + - 25 + - 50 + - 120 + - 300 + - 600 + - 1200 +- name: api_request_errors + namespace: cloudprovider_azure + help: Number of errors for an Azure API call + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: api_request_ratelimited_count + namespace: cloudprovider_azure + help: Number of rate limited Azure API calls + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: api_request_throttled_count + namespace: cloudprovider_azure + help: Number of throttled Azure API calls + type: Counter + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: op_duration_seconds + namespace: cloudprovider_azure + help: Latency of an Azure service operation + type: Histogram + stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id + buckets: + - 0.1 + - 0.2 + - 0.5 + - 1 + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 100 + - 200 + - 300 +- name: op_failure_count + namespace: cloudprovider_azure + help: Number of failed Azure service operations + type: Counter + 
stabilityLevel: ALPHA + labels: + - request + - resource_group + - source + - subscription_id +- name: cloudprovider_gce_api_request_duration_seconds + help: Latency of a GCE API call + type: Histogram + stabilityLevel: ALPHA + labels: + - region + - request + - version + - zone +- name: cloudprovider_gce_api_request_errors + help: Number of errors for an API call + type: Counter + stabilityLevel: ALPHA + labels: + - region + - request + - version + - zone +- name: cloudprovider_vsphere_api_request_duration_seconds + help: Latency of vsphere api call + type: Histogram + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_vsphere_api_request_errors + help: vsphere Api errors + type: Counter + stabilityLevel: ALPHA + labels: + - request +- name: cloudprovider_vsphere_operation_duration_seconds + help: Latency of vsphere operation call + type: Histogram + stabilityLevel: ALPHA + labels: + - operation +- name: cloudprovider_vsphere_operation_errors + help: vsphere operation errors + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: get_token_count + help: Counter of total Token() requests to the alternate token source + type: Counter + stabilityLevel: ALPHA +- name: get_token_fail_count + help: Counter of failed Token() requests to the alternate token source + type: Counter + stabilityLevel: ALPHA +- name: number_of_l4_ilbs + help: Number of L4 ILBs + type: Gauge + stabilityLevel: ALPHA + labels: + - feature +- name: pod_security_errors_total + help: Number of errors preventing normal evaluation. Non-fatal errors may result + in the latest restricted profile being used for evaluation. + type: Counter + stabilityLevel: ALPHA + labels: + - fatal + - request_operation + - resource + - subresource +- name: pod_security_evaluations_total + help: Number of policy evaluations that occurred, not counting ignored or exempt + requests. 
+ type: Counter + stabilityLevel: ALPHA + labels: + - decision + - mode + - policy_level + - policy_version + - request_operation + - resource + - subresource +- name: pod_security_exemptions_total + help: Number of exempt requests, not counting ignored or out of scope requests. + type: Counter + stabilityLevel: ALPHA + labels: + - request_operation + - resource + - subresource diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md new file mode 100644 index 00000000000..9a876e0cc1b --- /dev/null +++ b/test/instrumentation/testdata/documentation.md @@ -0,0 +1,325 @@ +--- +title: Kubernetes Metrics Across Components +content_type: instrumentation +--- + + +## Metrics + +These are the metrics which are exported in Kubernetes components (i.e. kube-apiserver, scheduler, kube-controller-manager, kube-proxy, cloud-controller-manager). + +(auto-generated 2022 Oct 25) + +### List of Kubernetes Metrics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameStability LevelTypeHelpLabelsConst Labels
version_infoALPHAGaugeEtcd server's binary version
binary_version
None
certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone
endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None
cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone
changesALPHACounterNumber of EndpointSlice changes
operation
None
desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone
endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone
num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
syncsALPHACounterNumber of EndpointSlice syncs
result
None
addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone
changesALPHACounterNumber of EndpointSlice changes
operation
None
desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone
endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone
endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone
endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone
endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone
endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone
num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone
resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone
sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None
sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None
job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None
terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking The event label can be "add" or "delete".`
event
None
attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone
job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None
job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None
job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None
evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None
unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None
zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None
zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None
cidrset_allocation_tries_per_requestALPHAHistogramNumber of endpoints added on each Service sync
clusterCIDR
None
cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None
multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None
multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None
multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None
sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2.The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone
job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone
evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None
create_failures_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone
create_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone
client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone
certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone
kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None
kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None
server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone
volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None
cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None
containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone
device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None
device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None
eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None
evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None
graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone
graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone
http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None
http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None
http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None
lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone
managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone
node_nameALPHAGaugeThe node's name. The count is always 1.
node
None
pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone
pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone
pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone
pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone
pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None
pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None
pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone
pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.NoneNone
pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None
pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone
preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None
run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None
run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None
running_containersALPHAGaugeNumber of containers currently running
container_state
None
running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone
runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None
runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None
runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None
started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None
started_containers_totalALPHACounterCumulative number of containers started
container_type
None
started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None
started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None
started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone
started_pods_totalALPHACounterCumulative number of pods startedNoneNone
volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None
probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None
probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None
csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None
network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone
sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone
sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone
sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone
sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone
sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None
sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone
sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone
sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None
sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone
sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone
volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone
volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone
volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone
volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone
allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None
allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None
allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None
available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None
pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone
pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None
e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None
goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None
permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None
plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None
scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None
scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the \"goroutines\" metric.
work
None
scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone
unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.
plugin
profile
None
binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None
scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None
legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone
stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone
valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone
framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None
pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None
pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone
pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None
preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone
preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone
queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None
schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None
scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None
operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None
storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None
volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None
graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None
apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None
apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None
apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None
step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None
webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None
webhook_request_totalALPHACounterAdmission webhook request total, identified by name and broken out for each admission type (validating or mutating) and operation. Additional labels specify whether the request was rejected or not and an HTTP status code. Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
code
name
operation
rejected
type
None
error_totalALPHACounterCounter of audit events that failed to be audited properly. Plugin identifies the plugin affected by the error.
plugin
None
event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone
level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None
requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone
apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None
webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None
compilation_duration_secondsALPHAHistogramNoneNone
evaluation_duration_secondsALPHAHistogramNoneNone
certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone
current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None
apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None
apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None
dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None
dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None
request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None
request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None
request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None
request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None
request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None
request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None
selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None
tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone
watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None
watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None
authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None
authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None
authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None
active_fetch_countALPHAGauge
status
None
fetch_totalALPHACounter
status
None
request_duration_secondsALPHAHistogram
status
None
request_totalALPHACounter
status
None
field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None
current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None
longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None
request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None
request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None
requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None
response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None
cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None
cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None
cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None
dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone
dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None
current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None
current_inqueue_requestsALPHAGaugeNumber of requests currently pending in queues of the API Priority and Fairness subsystem
flow_schema
priority_level
None
current_rALPHAGaugeR(time of last change)
priority_level
None
dispatch_rALPHAGaugeR(time of last dispatch)
priority_level
None
dispatched_requests_totalALPHACounterNumber of requests executed by API Priority and Fairness subsystem
flow_schema
priority_level
None
epoch_advance_totalALPHACounterNumber of times the queueset's progress meter jumped backward
priority_level
success
None
latest_sALPHAGaugeS(most recently dispatched request)
priority_level
None
next_discounted_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue) - estimated work in progress
bound
priority_level
None
next_s_boundsALPHAGaugemin and max, over queues, of S(oldest waiting request in queue)
bound
priority_level
None
priority_level_request_utilizationALPHAObservations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)
phase
priority_level
None
priority_level_seat_utilizationALPHAObservations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)
priority_level
map[phase:executing]
read_vs_write_current_requestsALPHAObservations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution
phase
request_kind
None
rejected_requests_totalALPHACounterNumber of requests rejected by API Priority and Fairness subsystem
flow_schema
priority_level
reason
None
request_concurrency_in_useALPHAGaugeConcurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem
flow_schema
priority_level
None
request_concurrency_limitALPHAGaugeShared concurrency limit in the API Priority and Fairness subsystem
priority_level
None
request_dispatch_no_accommodation_totalALPHACounterNumber of times a dispatch attempt resulted in a non accommodation due to lack of available seats
flow_schema
priority_level
None
request_execution_secondsALPHAHistogramDuration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem
flow_schema
priority_level
type
None
request_queue_length_after_enqueueALPHAHistogramLength of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued
flow_schema
priority_level
None
request_wait_duration_secondsALPHAHistogramLength of time a request spent waiting in its queue
execute
flow_schema
priority_level
None
watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None
work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None
init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None
data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone
data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone
envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone
apiserver_storage_list_evaluated_objects_totalALPHACounterNumber of objects tested in the course of serving a LIST request from storage
resource
None
apiserver_storage_list_fetched_objects_totalALPHACounterNumber of objects read from storage in the course of serving a LIST request
resource
None
apiserver_storage_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from storage
resource
None
apiserver_storage_list_totalALPHACounterNumber of LIST requests served from storage
resource
None
transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None
transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None
terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None
events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None
initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None
x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None
etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None
etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone
etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None
capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None
capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None
capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None
apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None
nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone
update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone
kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None
feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None
healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None
healthchecks_totalALPHACounterThis metric records the results of all healthchecks.
name
status
type
None
leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None
rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None
rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone
rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone
rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None
rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None
rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None
rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None
running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None
adds_totalALPHACounterTotal number of adds handled by workqueue
name
None
depthALPHAGaugeCurrent depth of workqueue
name
None
longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None
queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None
retries_totalALPHACounterTotal number of retries handled by workqueue
name
None
unfinished_work_secondsALPHAGaugeHow many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None
work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None
x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone
x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone
aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None
aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None
aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None
cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None
cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None
cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None
api_request_duration_secondsALPHAHistogramLatency of an Azure API call
request
resource_group
source
subscription_id
None
api_request_errorsALPHACounterNumber of errors for an Azure API call
request
resource_group
source
subscription_id
None
api_request_ratelimited_countALPHACounterNumber of rate limited Azure API calls
request
resource_group
source
subscription_id
None
api_request_throttled_countALPHACounterNumber of throttled Azure API calls
request
resource_group
source
subscription_id
None
op_duration_secondsALPHAHistogramLatency of an Azure service operation
request
resource_group
source
subscription_id
None
op_failure_countALPHACounterNumber of failed Azure service operations
request
resource_group
source
subscription_id
None
cloudprovider_gce_api_request_duration_secondsALPHAHistogramLatency of a GCE API call
region
request
version
zone
None
cloudprovider_gce_api_request_errorsALPHACounterNumber of errors for an API call
region
request
version
zone
None
cloudprovider_vsphere_api_request_duration_secondsALPHAHistogramLatency of vsphere api call
request
None
cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None
cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None
cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None
get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone
get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone
number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None
pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None
pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None
pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None
diff --git a/test/instrumentation/update-documentation-metrics.sh b/test/instrumentation/update-documentation-metrics.sh new file mode 100755 index 00000000000..4d29eced1fa --- /dev/null +++ b/test/instrumentation/update-documentation-metrics.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright 2022 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script runs to ensure that we do not violate metric stability +# policies. +# Usage: `test/instrumentation/test-verify.sh`. + +set -o errexit +set -o nounset +set -o pipefail + +KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. +source "${KUBE_ROOT}/test/instrumentation/stability-utils.sh" + +kube::update::documentation::list diff --git a/test/instrumentation/update-documentation.sh b/test/instrumentation/update-documentation.sh new file mode 100755 index 00000000000..f56a47a287d --- /dev/null +++ b/test/instrumentation/update-documentation.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright 2022 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This script runs to ensure that we do not violate metric stability +# policies. +# Usage: `test/instrumentation/test-update.sh`. + +set -o errexit +set -o nounset +set -o pipefail + +KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. +source "${KUBE_ROOT}/test/instrumentation/stability-utils.sh" + +kube::update::documentation