diff --git a/test/instrumentation/documentation/documentation-list.yaml b/test/instrumentation/documentation/documentation-list.yaml index 8ceae9cf220..2b99bb60509 100644 --- a/test/instrumentation/documentation/documentation-list.yaml +++ b/test/instrumentation/documentation/documentation-list.yaml @@ -12,6 +12,53 @@ certificate is invalid or unused, the value will be +INF. type: Gauge stabilityLevel: ALPHA +- name: sync_duration_seconds + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. + type: Histogram + stabilityLevel: ALPHA + labels: + - code + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 +- name: sync_total + subsystem: root_ca_cert_publisher + help: Number of namespace syncs happened in root ca cert publisher. + type: Counter + stabilityLevel: ALPHA + labels: + - code +- name: job_creation_skew_duration_seconds + subsystem: cronjob_controller + help: Time between when a cronjob is scheduled to be run, and when the corresponding + job is created + type: Histogram + stabilityLevel: STABLE + buckets: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 - name: addresses_skipped_per_sync subsystem: endpoint_slice_mirroring_controller help: Number of addresses skipped on each Endpoints sync due to being invalid or @@ -140,53 +187,6 @@ help: Number of EndpointSlices type: Gauge stabilityLevel: ALPHA -- name: sync_duration_seconds - subsystem: root_ca_cert_publisher - help: Number of namespace syncs happened in root ca cert publisher. - type: Histogram - stabilityLevel: ALPHA - labels: - - code - buckets: - - 0.001 - - 0.002 - - 0.004 - - 0.008 - - 0.016 - - 0.032 - - 0.064 - - 0.128 - - 0.256 - - 0.512 - - 1.024 - - 2.048 - - 4.096 - - 8.192 - - 16.384 -- name: sync_total - subsystem: root_ca_cert_publisher - help: Number of namespace syncs happened in root ca cert publisher. - type: Counter - stabilityLevel: ALPHA - labels: - - code -- name: job_creation_skew_duration_seconds - subsystem: cronjob_controller - help: Time between when a cronjob is scheduled to be run, and when the corresponding - job is created - type: Histogram - stabilityLevel: STABLE - buckets: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - name: resources_sync_error_total subsystem: garbagecollector_controller help: Number of garbage collector resources sync errors @@ -295,6 +295,13 @@ labels: - reason - status +- name: jobs_by_external_controller_total + subsystem: job_controller + help: The number of Jobs managed by an external controller + type: Counter + stabilityLevel: ALPHA + labels: + - controller_name - name: pod_failures_handled_by_failure_policy_total subsystem: job_controller help: "`The number of failed Pods handled by failure policy with\n\t\t\trespect @@ -449,30 +456,6 @@ help: Number of ResourceClaims creation request failures type: Counter stabilityLevel: ALPHA -- name: pod_deletion_duration_seconds - subsystem: taint_eviction_controller - help: Latency, in seconds, between the time when a taint effect has been activated - for the Pod and its deletion via TaintEvictionController. - type: Histogram - stabilityLevel: ALPHA - buckets: - - 0.005 - - 0.025 - - 0.1 - - 0.5 - - 1 - - 2.5 - - 10 - - 30 - - 60 - - 120 - - 180 - - 240 -- name: pod_deletions_total - subsystem: taint_eviction_controller - help: Total number of Pods deleted by TaintEvictionController since its start. - type: Counter - stabilityLevel: ALPHA - name: job_pods_finished_total subsystem: job_controller help: The number of finished Pods that are fully tracked @@ -556,36 +539,6 @@ help: Number of PersistenVolumeClaims creation requests type: Counter stabilityLevel: ALPHA -- name: client_expiration_renew_errors - subsystem: certificate_manager - namespace: kubelet - help: Counter of certificate renewal errors. - type: Counter - stabilityLevel: ALPHA -- name: certificate_manager_server_rotation_seconds - subsystem: kubelet - help: Histogram of the number of seconds the previous certificate lived before being - rotated. - type: Histogram - stabilityLevel: ALPHA - buckets: - - 60 - - 3600 - - 14400 - - 86400 - - 604800 - - 2.592e+06 - - 7.776e+06 - - 1.5552e+07 - - 3.1104e+07 - - 1.24416e+08 -- name: certificate_manager_server_ttl_seconds - subsystem: kubelet - help: Gauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. - The value is in seconds until certificate expiry (negative if already expired). - If serving certificate is invalid or unused, the value will be +INF. - type: Gauge - stabilityLevel: ALPHA - name: credential_provider_plugin_duration subsystem: kubelet help: Duration of execution in seconds for credential provider plugin @@ -612,11 +565,6 @@ stabilityLevel: ALPHA labels: - plugin_name -- name: server_expiration_renew_errors - subsystem: kubelet - help: Counter of certificate renewal errors. - type: Counter - stabilityLevel: ALPHA - name: pv_collector_bound_pv_count help: Gauge measuring number of persistent volume currently bound type: Custom @@ -665,6 +613,30 @@ labels: - node - volume_plugin +- name: pod_deletion_duration_seconds + subsystem: taint_eviction_controller + help: Latency, in seconds, between the time when a taint effect has been activated + for the Pod and its deletion via TaintEvictionController. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 1 + - 2.5 + - 10 + - 30 + - 60 + - 120 + - 180 + - 240 +- name: pod_deletions_total + subsystem: taint_eviction_controller + help: Total number of Pods deleted by TaintEvictionController since its start. + type: Counter + stabilityLevel: ALPHA - name: job_deletion_duration_seconds subsystem: ttl_after_finished_controller help: The time it took to delete the job since it became eligible for deletion @@ -692,6 +664,41 @@ labels: - operation_name - plugin_name +- name: client_expiration_renew_errors + subsystem: certificate_manager + namespace: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA +- name: certificate_manager_server_rotation_seconds + subsystem: kubelet + help: Histogram of the number of seconds the previous certificate lived before being + rotated. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 60 + - 3600 + - 14400 + - 86400 + - 604800 + - 2.592e+06 + - 7.776e+06 + - 1.5552e+07 + - 3.1104e+07 + - 1.24416e+08 +- name: certificate_manager_server_ttl_seconds + subsystem: kubelet + help: Gauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. + The value is in seconds until certificate expiry (negative if already expired). + If serving certificate is invalid or unused, the value will be +INF. + type: Gauge + stabilityLevel: ALPHA +- name: server_expiration_renew_errors + subsystem: kubelet + help: Counter of certificate renewal errors. + type: Counter + stabilityLevel: ALPHA - name: container_swap_usage_bytes help: Current amount of the container swap usage in bytes. Reported only on non-windows systems @@ -701,6 +708,16 @@ - container - pod - namespace +- name: force_cleaned_failed_volume_operation_errors_total + help: The number of volumes that failed force cleanup after their reconstruction + failed during kubelet startup. + type: Counter + stabilityLevel: ALPHA +- name: force_cleaned_failed_volume_operations_total + help: The number of volumes that were force cleaned after their reconstruction failed + during kubelet startup. This includes both successful and failed cleanups. + type: Counter + stabilityLevel: ALPHA - name: active_pods subsystem: kubelet help: The number of pods the kubelet considers active and which are being considered @@ -859,12 +876,82 @@ help: Last graceful shutdown start time since unix epoch in seconds type: Gauge stabilityLevel: ALPHA +- name: http_inflight_requests + subsystem: kubelet + help: Number of the inflight http requests + type: Gauge + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type +- name: http_requests_duration_seconds + subsystem: kubelet + help: Duration in seconds to serve http requests + type: Histogram + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: http_requests_total + subsystem: kubelet + help: Number of the http requests received since the server started + type: Counter + stabilityLevel: ALPHA + labels: + - long_running + - method + - path + - server_type - name: image_garbage_collected_total subsystem: kubelet help: Total number of images garbage collected by the kubelet, whether through disk usage or image age. type: Counter stabilityLevel: ALPHA + labels: + - reason +- name: image_pull_duration_seconds + subsystem: kubelet + help: Duration in seconds to pull an image. + type: Histogram + stabilityLevel: ALPHA + labels: + - image_size_in_bytes + buckets: + - 1 + - 5 + - 10 + - 20 + - 30 + - 60 + - 120 + - 180 + - 240 + - 300 + - 360 + - 480 + - 600 + - 900 + - 1200 + - 1800 + - 2700 + - 3600 - name: lifecycle_handler_http_fallbacks_total subsystem: kubelet help: The number of times lifecycle handlers successfully fell back to http from @@ -876,6 +963,16 @@ help: Current number of ephemeral containers in pods managed by this kubelet. type: Gauge stabilityLevel: ALPHA +- name: memory_manager_pinning_errors_total + subsystem: kubelet + help: The number of memory pages allocations which required pinning that failed. + type: Counter + stabilityLevel: ALPHA +- name: memory_manager_pinning_requests_total + subsystem: kubelet + help: The number of memory pages allocations which required pinning. + type: Counter + stabilityLevel: ALPHA - name: mirror_pods subsystem: kubelet help: The number of mirror pods the kubelet will try to create (one per admitted @@ -1278,6 +1375,11 @@ stabilityLevel: ALPHA labels: - operation_type +- name: sleep_action_terminated_early_total + subsystem: kubelet + help: The number of times lifecycle sleep handler got terminated before it finishes + type: Counter + stabilityLevel: ALPHA - name: started_containers_errors_total subsystem: kubelet help: Cumulative number of errors when starting containers @@ -1351,6 +1453,25 @@ help: The number of admission requests where resources have to be aligned. type: Counter stabilityLevel: ALPHA +- name: volume_metric_collection_duration_seconds + subsystem: kubelet + help: Duration in seconds to calculate volume stats + type: Histogram + stabilityLevel: ALPHA + labels: + - metric_source + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 - name: kubelet_volume_stats_available_bytes help: Number of available bytes in the volume type: Custom @@ -1417,6 +1538,13 @@ help: Current swap usage of the node in bytes. Reported only on non-windows systems type: Custom stabilityLevel: ALPHA +- name: plugin_manager_total_plugins + help: Number of plugins in Plugin Manager + type: Custom + stabilityLevel: ALPHA + labels: + - socket_path + - state - name: pod_swap_usage_bytes help: Current amount of the pod swap usage in bytes. Reported only on non-windows systems @@ -1425,11 +1553,111 @@ labels: - pod - namespace +- name: probe_duration_seconds + subsystem: prober + help: Duration in seconds for a probe response. + type: Histogram + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - probe_type +- name: probe_total + subsystem: prober + help: Cumulative number of a liveness, readiness or startup probe for a container + by result. + type: Counter + stabilityLevel: ALPHA + labels: + - container + - namespace + - pod + - pod_uid + - probe_type + - result +- name: reconstruct_volume_operations_errors_total + help: The number of volumes that failed reconstruction from the operating system + during kubelet startup. + type: Counter + stabilityLevel: ALPHA +- name: reconstruct_volume_operations_total + help: The number of volumes that were attempted to be reconstructed from the operating + system during kubelet startup. This includes both successful and failed reconstruction. + type: Counter + stabilityLevel: ALPHA - name: scrape_error help: 1 if there was an error while getting container metrics, 0 otherwise type: Custom deprecatedVersion: 1.29.0 stabilityLevel: ALPHA +- name: volume_manager_selinux_container_errors_total + help: Number of errors when kubelet cannot compute SELinux context for a container. + Kubelet can't start such a Pod then and it will retry, therefore value of this + metric may not represent the actual nr. of containers. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode +- name: volume_manager_selinux_container_warnings_total + help: Number of errors when kubelet cannot compute SELinux context for a container + that are ignored. They will become real errors when SELinuxMountReadWriteOncePod + feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode +- name: volume_manager_selinux_pod_context_mismatch_errors_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. Kubelet can't start such a Pod then and it will retry, + therefore value of this metric may not represent the actual nr. of Pods. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode +- name: volume_manager_selinux_pod_context_mismatch_warnings_total + help: Number of errors when a Pod defines different SELinux contexts for its containers + that use the same volume. They are not errors yet, but they will become real errors + when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode +- name: volume_manager_selinux_volume_context_mismatch_errors_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. Kubelet can't start such a Pod then and it + will retry, therefore value of this metric may not represent the actual nr. of + Pods. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode + - volume_plugin +- name: volume_manager_selinux_volume_context_mismatch_warnings_total + help: Number of errors when a Pod uses a volume that is already mounted with a different + SELinux context than the Pod needs. They are not errors yet, but they will become + real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume + access modes. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode + - volume_plugin +- name: volume_manager_selinux_volumes_admitted_total + help: Number of volumes whose SELinux context was fine and will be mounted with + mount -o context option. + type: Gauge + stabilityLevel: ALPHA + labels: + - access_mode + - volume_plugin +- name: volume_manager_total_volumes + help: Number of volumes in Volume Manager + type: Custom + stabilityLevel: ALPHA + labels: + - plugin_name + - state - name: container_cpu_usage_seconds_total help: Cumulative cpu time consumed by the container in core-seconds type: Custom @@ -1480,77 +1708,150 @@ help: 1 if there was an error while getting container metrics, 0 otherwise type: Custom stabilityLevel: STABLE -- name: force_cleaned_failed_volume_operation_errors_total - help: The number of volumes that failed force cleanup after their reconstruction - failed during kubelet startup. +- name: csr_honored_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration that was honored, sliced + by signer (only kubernetes.io signer names are specifically identified) type: Counter stabilityLevel: ALPHA -- name: force_cleaned_failed_volume_operations_total - help: The number of volumes that were force cleaned after their reconstruction failed - during kubelet startup. This includes both successful and failed cleanups. + labels: + - signerName +- name: csr_requested_duration_total + subsystem: certificates_registry + namespace: apiserver + help: Total number of issued CSRs with a requested duration, sliced by signer (only + kubernetes.io signer names are specifically identified) type: Counter stabilityLevel: ALPHA -- name: http_inflight_requests - subsystem: kubelet - help: Number of the inflight http requests + labels: + - signerName +- name: ip_errors_total + subsystem: clusterip_repair + namespace: apiserver + help: 'Number of errors detected on clusterips by the repair loop broken down by + type of error: leak, repair, full, outOfRange, duplicate, unknown, invalid' + type: Counter + stabilityLevel: ALPHA + labels: + - type +- name: reconcile_errors_total + subsystem: clusterip_repair + namespace: apiserver + help: Number of reconciliation failures on the clusterip repair reconcile loop + type: Counter + stabilityLevel: ALPHA +- name: port_errors_total + subsystem: nodeport_repair + namespace: apiserver + help: 'Number of errors detected on ports by the repair loop broken down by type + of error: leak, repair, full, outOfRange, duplicate, unknown' + type: Counter + stabilityLevel: ALPHA + labels: + - type +- name: reconcile_errors_total + subsystem: nodeport_repair + namespace: apiserver + help: Number of reconciliation failures on the nodeport repair reconcile loop + type: Counter + stabilityLevel: ALPHA +- name: allocated_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of allocated IPs for Services type: Gauge stabilityLevel: ALPHA labels: - - long_running - - method - - path - - server_type -- name: http_requests_duration_seconds - subsystem: kubelet - help: Duration in seconds to serve http requests - type: Histogram - stabilityLevel: ALPHA - labels: - - long_running - - method - - path - - server_type - buckets: - - 0.005 - - 0.01 - - 0.025 - - 0.05 - - 0.1 - - 0.25 - - 0.5 - - 1 - - 2.5 - - 5 - - 10 -- name: http_requests_total - subsystem: kubelet - help: Number of the http requests received since the server started + - cidr +- name: allocation_errors_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of errors trying to allocate Cluster IPs type: Counter stabilityLevel: ALPHA labels: - - long_running - - method - - path - - server_type -- name: volume_metric_collection_duration_seconds - subsystem: kubelet - help: Duration in seconds to calculate volume stats - type: Histogram + - cidr + - scope +- name: allocation_total + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Number of Cluster IPs allocations + type: Counter stabilityLevel: ALPHA labels: - - metric_source - buckets: - - 0.005 - - 0.01 - - 0.025 - - 0.05 - - 0.1 - - 0.25 - - 0.5 - - 1 - - 2.5 - - 5 - - 10 + - cidr + - scope +- name: available_ips + subsystem: clusterip_allocator + namespace: kube_apiserver + help: Gauge measuring the number of available IPs for Services + type: Gauge + stabilityLevel: ALPHA + labels: + - cidr +- name: allocated_ports + subsystem: nodeport_allocator + namespace: kube_apiserver + help: Gauge measuring the number of allocated NodePorts for Services + type: Gauge + stabilityLevel: ALPHA +- name: allocation_errors_total + subsystem: nodeport_allocator + namespace: kube_apiserver + help: Number of errors trying to allocate NodePort + type: Counter + stabilityLevel: ALPHA + labels: + - scope +- name: allocation_total + subsystem: nodeport_allocator + namespace: kube_apiserver + help: Number of NodePort allocations + type: Counter + stabilityLevel: ALPHA + labels: + - scope +- name: available_ports + subsystem: nodeport_allocator + namespace: kube_apiserver + help: Gauge measuring the number of available NodePorts for Services + type: Gauge + stabilityLevel: ALPHA +- name: backend_tls_failure_total + subsystem: pod_logs + namespace: kube_apiserver + help: Total number of requests for pods/logs that failed due to kubelet server TLS + verification + type: Counter + stabilityLevel: ALPHA +- name: insecure_backend_total + subsystem: pod_logs + namespace: kube_apiserver + help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, + skip_tls_allowed, skip_tls_denied' + type: Counter + stabilityLevel: ALPHA + labels: + - usage +- name: pods_logs_backend_tls_failure_total + subsystem: pod_logs + namespace: kube_apiserver + help: Total number of requests for pods/logs that failed due to kubelet server TLS + verification + type: Counter + deprecatedVersion: 1.27.0 + stabilityLevel: ALPHA +- name: pods_logs_insecure_backend_total + subsystem: pod_logs + namespace: kube_apiserver + help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, + skip_tls_allowed, skip_tls_denied' + type: Counter + deprecatedVersion: 1.27.0 + stabilityLevel: ALPHA + labels: + - usage - name: network_programming_duration_seconds subsystem: kubeproxy help: In Cluster Network Programming Latency in seconds @@ -1775,224 +2076,20 @@ help: Cumulative proxy rules Service changes type: Counter stabilityLevel: ALPHA -- name: plugin_manager_total_plugins - help: Number of plugins in Plugin Manager - type: Custom - stabilityLevel: ALPHA - labels: - - socket_path - - state -- name: probe_duration_seconds - subsystem: prober - help: Duration in seconds for a probe response. - type: Histogram - stabilityLevel: ALPHA - labels: - - container - - namespace - - pod - - probe_type -- name: probe_total - subsystem: prober - help: Cumulative number of a liveness, readiness or startup probe for a container - by result. +- name: binder_cache_requests_total + subsystem: scheduler_volume + help: Total number for request volume binding cache type: Counter stabilityLevel: ALPHA labels: - - container - - namespace - - pod - - pod_uid - - probe_type - - result -- name: reconstruct_volume_operations_errors_total - help: The number of volumes that failed reconstruction from the operating system - during kubelet startup. - type: Counter - stabilityLevel: ALPHA -- name: reconstruct_volume_operations_total - help: The number of volumes that were attempted to be reconstructed from the operating - system during kubelet startup. This includes both successful and failed reconstruction. - type: Counter - stabilityLevel: ALPHA -- name: volume_manager_selinux_container_errors_total - help: Number of errors when kubelet cannot compute SELinux context for a container. - Kubelet can't start such a Pod then and it will retry, therefore value of this - metric may not represent the actual nr. of containers. - type: Gauge - stabilityLevel: ALPHA -- name: volume_manager_selinux_container_warnings_total - help: Number of errors when kubelet cannot compute SELinux context for a container - that are ignored. They will become real errors when SELinuxMountReadWriteOncePod - feature is expanded to all volume access modes. - type: Gauge - stabilityLevel: ALPHA -- name: volume_manager_selinux_pod_context_mismatch_errors_total - help: Number of errors when a Pod defines different SELinux contexts for its containers - that use the same volume. Kubelet can't start such a Pod then and it will retry, - therefore value of this metric may not represent the actual nr. of Pods. - type: Gauge - stabilityLevel: ALPHA -- name: volume_manager_selinux_pod_context_mismatch_warnings_total - help: Number of errors when a Pod defines different SELinux contexts for its containers - that use the same volume. They are not errors yet, but they will become real errors - when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes. - type: Gauge - stabilityLevel: ALPHA -- name: volume_manager_selinux_volume_context_mismatch_errors_total - help: Number of errors when a Pod uses a volume that is already mounted with a different - SELinux context than the Pod needs. Kubelet can't start such a Pod then and it - will retry, therefore value of this metric may not represent the actual nr. of - Pods. - type: Gauge - stabilityLevel: ALPHA - labels: - - volume_plugin -- name: volume_manager_selinux_volume_context_mismatch_warnings_total - help: Number of errors when a Pod uses a volume that is already mounted with a different - SELinux context than the Pod needs. They are not errors yet, but they will become - real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume - access modes. - type: Gauge - stabilityLevel: ALPHA - labels: - - volume_plugin -- name: volume_manager_selinux_volumes_admitted_total - help: Number of volumes whose SELinux context was fine and will be mounted with - mount -o context option. - type: Gauge - stabilityLevel: ALPHA - labels: - - volume_plugin -- name: volume_manager_total_volumes - help: Number of volumes in Volume Manager - type: Custom - stabilityLevel: ALPHA - labels: - - plugin_name - - state -- name: csr_honored_duration_total - subsystem: certificates_registry - namespace: apiserver - help: Total number of issued CSRs with a requested duration that was honored, sliced - by signer (only kubernetes.io signer names are specifically identified) + - operation +- name: scheduling_stage_error_total + subsystem: scheduler_volume + help: Volume scheduling stage error count type: Counter stabilityLevel: ALPHA labels: - - signerName -- name: csr_requested_duration_total - subsystem: certificates_registry - namespace: apiserver - help: Total number of issued CSRs with a requested duration, sliced by signer (only - kubernetes.io signer names are specifically identified) - type: Counter - stabilityLevel: ALPHA - labels: - - signerName -- name: ip_errors_total - subsystem: clusterip_repair - namespace: apiserver - help: 'Number of errors detected on clusterips by the repair loop broken down by - type of error: leak, repair, full, outOfRange, duplicate, unknown, invalid' - type: Counter - stabilityLevel: ALPHA - labels: - - type -- name: reconcile_errors_total - subsystem: clusterip_repair - namespace: apiserver - help: Number of reconciliation failures on the clusterip repair reconcile loop - type: Counter - stabilityLevel: ALPHA -- name: port_errors_total - subsystem: nodeport_repair - namespace: apiserver - help: 'Number of errors detected on ports by the repair loop broken down by type - of error: leak, repair, full, outOfRange, duplicate, unknown' - type: Counter - stabilityLevel: ALPHA - labels: - - type -- name: allocated_ips - subsystem: clusterip_allocator - namespace: kube_apiserver - help: Gauge measuring the number of allocated IPs for Services - type: Gauge - stabilityLevel: ALPHA - labels: - - cidr -- name: allocation_errors_total - subsystem: clusterip_allocator - namespace: kube_apiserver - help: Number of errors trying to allocate Cluster IPs - type: Counter - stabilityLevel: ALPHA - labels: - - cidr - - scope -- name: allocation_total - subsystem: clusterip_allocator - namespace: kube_apiserver - help: Number of Cluster IPs allocations - type: Counter - stabilityLevel: ALPHA - labels: - - cidr - - scope -- name: available_ips - subsystem: clusterip_allocator - namespace: kube_apiserver - help: Gauge measuring the number of available IPs for Services - type: Gauge - stabilityLevel: ALPHA - labels: - - cidr -- name: allocated_ports - subsystem: nodeport_allocator - namespace: kube_apiserver - help: Gauge measuring the number of allocated NodePorts for Services - type: Gauge - stabilityLevel: ALPHA -- name: available_ports - subsystem: nodeport_allocator - namespace: kube_apiserver - help: Gauge measuring the number of available NodePorts for Services - type: Gauge - stabilityLevel: ALPHA -- name: backend_tls_failure_total - subsystem: pod_logs - namespace: kube_apiserver - help: Total number of requests for pods/logs that failed due to kubelet server TLS - verification - type: Counter - stabilityLevel: ALPHA -- name: insecure_backend_total - subsystem: pod_logs - namespace: kube_apiserver - help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, - skip_tls_allowed, skip_tls_denied' - type: Counter - stabilityLevel: ALPHA - labels: - - usage -- name: pods_logs_backend_tls_failure_total - subsystem: pod_logs - namespace: kube_apiserver - help: Total number of requests for pods/logs that failed due to kubelet server TLS - verification - type: Counter - deprecatedVersion: 1.27.0 - stabilityLevel: ALPHA -- name: pods_logs_insecure_backend_total - subsystem: pod_logs - namespace: kube_apiserver - help: 'Total number of requests for pods/logs sliced by usage type: enforce_tls, - skip_tls_allowed, skip_tls_denied' - type: Counter - deprecatedVersion: 1.27.0 - stabilityLevel: ALPHA - labels: - - usage + - operation - name: goroutines subsystem: scheduler help: Number of running goroutines split by the work they do such as binding. @@ -2026,7 +2123,7 @@ - name: plugin_evaluation_total subsystem: scheduler help: Number of attempts to schedule pods by each plugin and the extension point - (available only in PreFilter and Filter.). + (available only in PreFilter, Filter, PreScore, and Score). type: Counter stabilityLevel: ALPHA labels: @@ -2101,20 +2198,6 @@ labels: - plugin - profile -- name: binder_cache_requests_total - subsystem: scheduler_volume - help: Total number for request volume binding cache - type: Counter - stabilityLevel: ALPHA - labels: - - operation -- name: scheduling_stage_error_total - subsystem: scheduler_volume - help: Volume scheduling stage error count - type: Counter - stabilityLevel: ALPHA - labels: - - operation - name: invalid_legacy_auto_token_uses_total subsystem: serviceaccount help: Cumulative invalid auto-generated legacy tokens used @@ -2251,7 +2334,7 @@ help: E2e latency for a pod being scheduled which may include multiple scheduling attempts. type: Histogram - deprecatedVersion: 1.28.0 + deprecatedVersion: 1.29.0 stabilityLevel: STABLE labels: - attempts @@ -2360,6 +2443,26 @@ - 120 - 300 - 600 +- name: graph_actions_duration_seconds + subsystem: node_authorizer + help: Histogram of duration of graph actions in node authorizer. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation + buckets: + - 0.0001 + - 0.0002 + - 0.0004 + - 0.0008 + - 0.0016 + - 0.0032 + - 0.0064 + - 0.0128 + - 0.0256 + - 0.0512 + - 0.1024 + - 0.2048 - name: storage_operation_duration_seconds help: Storage operation duration type: Histogram @@ -2404,26 +2507,6 @@ - 120 - 300 - 600 -- name: graph_actions_duration_seconds - subsystem: node_authorizer - help: Histogram of duration of graph actions in node authorizer. - type: Histogram - stabilityLevel: ALPHA - labels: - - operation - buckets: - - 0.0001 - - 0.0002 - - 0.0004 - - 0.0008 - - 0.0016 - - 0.0032 - - 0.0064 - - 0.0128 - - 0.0256 - - 0.0512 - - 0.1024 - - 0.2048 - name: ratcheting_seconds subsystem: validation namespace: apiextensions_apiserver @@ -2621,47 +2704,6 @@ - operation - rejected - type -- name: check_duration_seconds - subsystem: validating_admission_policy - namespace: apiserver - help: Validation admission latency for individual validation expressions in seconds, - labeled by policy and further including binding, state and enforcement action - taken. - type: Histogram - stabilityLevel: ALPHA - labels: - - enforcement_action - - policy - - policy_binding - - state - buckets: - - 5e-07 - - 0.001 - - 0.01 - - 0.1 - - 1 -- name: check_total - subsystem: validating_admission_policy - namespace: apiserver - help: Validation admission policy check total, labeled by policy and further identified - by binding, enforcement action taken, and state. - type: Counter - stabilityLevel: ALPHA - labels: - - enforcement_action - - policy - - policy_binding - - state -- name: definition_total - subsystem: validating_admission_policy - namespace: apiserver - help: Validation admission policy count total, labeled by state and enforcement - action. - type: Counter - stabilityLevel: ALPHA - labels: - - enforcement_action - - state - name: controller_admission_duration_seconds subsystem: admission namespace: apiserver @@ -2720,10 +2762,6 @@ - 2.5 - 10 - 25 -- name: aggregator_discovery_aggregation_count_total - help: Counter of number of times discovery was aggregated - type: Counter - stabilityLevel: ALPHA - name: error_total subsystem: apiserver_audit help: Counter of audit events that failed to be audited properly. Plugin identifies @@ -2749,6 +2787,100 @@ help: Counter of apiserver requests rejected due to an error in audit logging backend. type: Counter stabilityLevel: ALPHA +- name: check_duration_seconds + subsystem: validating_admission_policy + namespace: apiserver + help: Validation admission latency for individual validation expressions in seconds, + labeled by policy and further including binding, state and enforcement action + taken. + type: Histogram + stabilityLevel: ALPHA + labels: + - enforcement_action + - policy + - policy_binding + - state + buckets: + - 5e-07 + - 0.001 + - 0.01 + - 0.1 + - 1 +- name: check_total + subsystem: validating_admission_policy + namespace: apiserver + help: Validation admission policy check total, labeled by policy and further identified + by binding, enforcement action taken, and state. + type: Counter + stabilityLevel: ALPHA + labels: + - enforcement_action + - policy + - policy_binding + - state +- name: definition_total + subsystem: validating_admission_policy + namespace: apiserver + help: Validation admission policy count total, labeled by state and enforcement + action. + type: Counter + stabilityLevel: ALPHA + labels: + - enforcement_action + - state +- name: aggregator_discovery_aggregation_count_total + help: Counter of number of times discovery was aggregated + type: Counter + stabilityLevel: ALPHA +- name: decisions_total + subsystem: authorization + namespace: apiserver + help: Total number of terminal decisions made by an authorizer split by authorizer + type, name, and decision. + type: Counter + stabilityLevel: ALPHA + labels: + - decision + - name + - type +- name: match_condition_evaluation_errors_total + subsystem: authorization + namespace: apiserver + help: Total number of errors when an authorization webhook encounters a match condition + error split by authorizer type and name. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - type +- name: match_condition_evaluation_seconds + subsystem: authorization + namespace: apiserver + help: Authorization match condition evaluation time in seconds, split by authorizer + type and name. + type: Histogram + stabilityLevel: ALPHA + labels: + - name + - type + buckets: + - 0.001 + - 0.005 + - 0.01 + - 0.025 + - 0.1 + - 0.2 + - 0.25 +- name: match_condition_exclusions_total + subsystem: authorization + namespace: apiserver + help: Total number of exclusions when an authorization webhook is skipped because + match conditions exclude it. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - type - name: compilation_duration_seconds subsystem: cel namespace: apiserver @@ -2783,6 +2915,14 @@ - 7.776e+06 - 1.5552e+07 - 3.1104e+07 +- name: current_inqueue_requests + subsystem: apiserver + help: Maximal number of queued requests in this apiserver per request kind in last + second. + type: Gauge + stabilityLevel: ALPHA + labels: + - request_kind - name: apiserver_delegated_authn_request_duration_seconds help: Request latency in seconds. Broken down by status code. type: Histogram @@ -2825,139 +2965,6 @@ stabilityLevel: ALPHA labels: - code -- name: active_fetch_count - subsystem: token_cache - namespace: authentication - type: Gauge - stabilityLevel: ALPHA - labels: - - status -- name: fetch_total - subsystem: token_cache - namespace: authentication - type: Counter - stabilityLevel: ALPHA - labels: - - status -- name: request_duration_seconds - subsystem: token_cache - namespace: authentication - type: Histogram - stabilityLevel: ALPHA - labels: - - status -- name: request_total - subsystem: token_cache - namespace: authentication - type: Counter - stabilityLevel: ALPHA - labels: - - status -- name: cache_list_fetched_objects_total - namespace: apiserver - help: Number of objects read from watch cache in the course of serving a LIST request - type: Counter - stabilityLevel: ALPHA - labels: - - index - - resource_prefix -- name: cache_list_returned_objects_total - namespace: apiserver - help: Number of objects returned for a LIST request from watch cache - type: Counter - stabilityLevel: ALPHA - labels: - - resource_prefix -- name: cache_list_total - namespace: apiserver - help: Number of LIST requests served from watch cache - type: Counter - stabilityLevel: ALPHA - labels: - - index - - resource_prefix -- name: current_inqueue_requests - subsystem: apiserver - help: Maximal number of queued requests in this apiserver per request kind in last - second. - type: Gauge - stabilityLevel: ALPHA - labels: - - request_kind -- name: dial_duration_seconds - subsystem: egress_dialer - namespace: apiserver - help: Dial latency histogram in seconds, labeled by the protocol (http-connect or - grpc), transport (tcp or uds) - type: Histogram - stabilityLevel: ALPHA - labels: - - protocol - - transport - buckets: - - 0.005 - - 0.025 - - 0.1 - - 0.5 - - 2.5 - - 12.5 -- name: dial_failure_count - subsystem: egress_dialer - namespace: apiserver - help: Dial failure count, labeled by the protocol (http-connect or grpc), transport - (tcp or uds), and stage (connect or proxy). The stage indicates at which stage - the dial failed - type: Counter - stabilityLevel: ALPHA - labels: - - protocol - - stage - - transport -- name: dial_start_total - subsystem: egress_dialer - namespace: apiserver - help: Dial starts, labeled by the protocol (http-connect or grpc) and transport - (tcp or uds). - type: Counter - stabilityLevel: ALPHA - labels: - - protocol - - transport -- name: automatic_reload_failures_total - subsystem: encryption_config_controller - namespace: apiserver - help: Total number of failed automatic reloads of encryption configuration split - by apiserver identity. - type: Counter - stabilityLevel: ALPHA - labels: - - apiserver_id_hash -- name: automatic_reload_last_timestamp_seconds - subsystem: encryption_config_controller - namespace: apiserver - help: Timestamp of the last successful or failed automatic reload of encryption - configuration split by apiserver identity. - type: Gauge - stabilityLevel: ALPHA - labels: - - apiserver_id_hash - - status -- name: automatic_reload_success_total - subsystem: encryption_config_controller - namespace: apiserver - help: Total number of successful automatic reloads of encryption configuration split - by apiserver identity. - type: Counter - stabilityLevel: ALPHA - labels: - - apiserver_id_hash -- name: init_events_total - namespace: apiserver - help: Counter of init events processed in watch cache broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource - name: request_aborts_total subsystem: apiserver help: Number of requests which apiserver aborted possibly due to a timeout, for @@ -3158,94 +3165,11 @@ - resource - subresource - verb -- name: storage_db_total_size_in_bytes - subsystem: apiserver - help: Total size of the storage database file physically allocated in bytes. - type: Gauge - deprecatedVersion: 1.28.0 - stabilityLevel: ALPHA - labels: - - endpoint -- name: storage_decode_errors_total - namespace: apiserver - help: Number of stored object decode errors split by object type - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: storage_events_received_total - subsystem: apiserver - help: Number of etcd events received split by kind. - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: apiserver_storage_list_evaluated_objects_total - help: Number of objects tested in the course of serving a LIST request from storage - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: apiserver_storage_list_fetched_objects_total - help: Number of objects read from storage in the course of serving a LIST request - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: apiserver_storage_list_returned_objects_total - help: Number of objects returned for a LIST request from storage - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: apiserver_storage_list_total - help: Number of LIST requests served from storage - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: apiserver_storage_size_bytes - help: Size of the storage database file physically allocated in bytes. - type: Custom - stabilityLevel: ALPHA - labels: - - storage_cluster_id -- name: terminated_watchers_total - namespace: apiserver - help: Counter of watchers closed due to unresponsiveness broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource - name: tls_handshake_errors_total subsystem: apiserver help: Number of requests dropped with 'TLS handshake error from' error type: Counter stabilityLevel: ALPHA -- name: events_dispatched_total - subsystem: watch_cache - namespace: apiserver - help: Counter of events dispatched in watch cache broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: events_received_total - subsystem: watch_cache - namespace: apiserver - help: Counter of events received in watch cache broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: initializations_total - subsystem: watch_cache - namespace: apiserver - help: Counter of watch cache initializations broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource - name: watch_events_sizes subsystem: apiserver help: Watch event size distribution in bytes @@ -3336,6 +3260,34 @@ - 4.096 - 8.192 - 16.384 +- name: active_fetch_count + subsystem: token_cache + namespace: authentication + type: Gauge + stabilityLevel: ALPHA + labels: + - status +- name: fetch_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status +- name: request_duration_seconds + subsystem: token_cache + namespace: authentication + type: Histogram + stabilityLevel: ALPHA + labels: + - status +- name: request_total + subsystem: token_cache + namespace: authentication + type: Counter + stabilityLevel: ALPHA + labels: + - status - name: authorization_attempts_total help: Counter of authorization attempts broken down by result. It can be either 'allowed', 'denied', 'no-opinion' or 'error'. @@ -3365,69 +3317,6 @@ - 4.096 - 8.192 - 16.384 -- name: etcd_bookmark_counts - help: Number of etcd bookmarks (progress notify events) split by kind. - type: Gauge - stabilityLevel: ALPHA - labels: - - resource -- name: etcd_lease_object_counts - help: Number of objects attached to a single etcd lease. - type: Histogram - stabilityLevel: ALPHA - buckets: - - 10 - - 50 - - 100 - - 500 - - 1000 - - 2500 - - 5000 -- name: etcd_request_duration_seconds - help: Etcd request latency in seconds for each operation and object type. - type: Histogram - stabilityLevel: ALPHA - labels: - - operation - - type - buckets: - - 0.005 - - 0.025 - - 0.05 - - 0.1 - - 0.2 - - 0.4 - - 0.6 - - 0.8 - - 1 - - 1.25 - - 1.5 - - 2 - - 3 - - 4 - - 5 - - 6 - - 8 - - 10 - - 15 - - 20 - - 30 - - 45 - - 60 -- name: etcd_request_errors_total - help: Etcd failed request counts for each operation and object type. - type: Counter - stabilityLevel: ALPHA - labels: - - operation - - type -- name: etcd_requests_total - help: Etcd request counts for each operation and object type. - type: Counter - stabilityLevel: ALPHA - labels: - - operation - - type - name: field_validation_request_duration_seconds help: Response latency distribution in seconds for each field validation value type: Histogram @@ -3456,27 +3345,6 @@ - 30 - 45 - 60 -- name: capacity - subsystem: watch_cache - help: Total capacity of watch cache broken by resource type. - type: Gauge - stabilityLevel: ALPHA - labels: - - resource -- name: capacity_decrease_total - subsystem: watch_cache - help: Total number of watch cache capacity decrease events broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource -- name: capacity_increase_total - subsystem: watch_cache - help: Total number of watch cache capacity increase events broken by resource type. - type: Counter - stabilityLevel: ALPHA - labels: - - resource - name: current_inflight_requests subsystem: apiserver help: Maximal number of currently used inflight request limit of this apiserver @@ -3588,13 +3456,148 @@ - 1e+07 - 1e+08 - 1e+09 -- name: apiserver_storage_objects - help: Number of stored objects at the time of last check split by kind. In case - of a fetching error, the value will be -1. +- name: automatic_reload_last_timestamp_seconds + subsystem: authentication_config_controller + namespace: apiserver + help: Timestamp of the last automatic reload of authentication configuration split + by status and apiserver identity. type: Gauge - stabilityLevel: STABLE + stabilityLevel: ALPHA labels: - - resource + - apiserver_id_hash + - status +- name: automatic_reloads_total + subsystem: authentication_config_controller + namespace: apiserver + help: Total number of automatic reloads of authentication configuration split by + status and apiserver identity. + type: Counter + stabilityLevel: ALPHA + labels: + - apiserver_id_hash + - status +- name: automatic_reload_last_timestamp_seconds + subsystem: authorization_config_controller + namespace: apiserver + help: Timestamp of the last automatic reload of authorization configuration split + by status and apiserver identity. + type: Gauge + stabilityLevel: ALPHA + labels: + - apiserver_id_hash + - status +- name: automatic_reloads_total + subsystem: authorization_config_controller + namespace: apiserver + help: Total number of automatic reloads of authorization configuration split by + status and apiserver identity. + type: Counter + stabilityLevel: ALPHA + labels: + - apiserver_id_hash + - status +- name: cache_list_fetched_objects_total + namespace: apiserver + help: Number of objects read from watch cache in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: cache_list_returned_objects_total + namespace: apiserver + help: Number of objects returned for a LIST request from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - resource_prefix +- name: cache_list_total + namespace: apiserver + help: Number of LIST requests served from watch cache + type: Counter + stabilityLevel: ALPHA + labels: + - index + - resource_prefix +- name: dial_duration_seconds + subsystem: egress_dialer + namespace: apiserver + help: Dial latency histogram in seconds, labeled by the protocol (http-connect or + grpc), transport (tcp or uds) + type: Histogram + stabilityLevel: ALPHA + labels: + - protocol + - transport + buckets: + - 0.005 + - 0.025 + - 0.1 + - 0.5 + - 2.5 + - 12.5 +- name: dial_failure_count + subsystem: egress_dialer + namespace: apiserver + help: Dial failure count, labeled by the protocol (http-connect or grpc), transport + (tcp or uds), and stage (connect or proxy). The stage indicates at which stage + the dial failed + type: Counter + stabilityLevel: ALPHA + labels: + - protocol + - stage + - transport +- name: dial_start_total + subsystem: egress_dialer + namespace: apiserver + help: Dial starts, labeled by the protocol (http-connect or grpc) and transport + (tcp or uds). + type: Counter + stabilityLevel: ALPHA + labels: + - protocol + - transport +- name: automatic_reload_failures_total + subsystem: encryption_config_controller + namespace: apiserver + help: Total number of failed automatic reloads of encryption configuration split + by apiserver identity. + type: Counter + deprecatedVersion: 1.30.0 + stabilityLevel: ALPHA + labels: + - apiserver_id_hash +- name: automatic_reload_last_timestamp_seconds + subsystem: encryption_config_controller + namespace: apiserver + help: Timestamp of the last successful or failed automatic reload of encryption + configuration split by apiserver identity. + type: Gauge + stabilityLevel: ALPHA + labels: + - apiserver_id_hash + - status +- name: automatic_reload_success_total + subsystem: encryption_config_controller + namespace: apiserver + help: Total number of successful automatic reloads of encryption configuration split + by apiserver identity. + type: Counter + deprecatedVersion: 1.30.0 + stabilityLevel: ALPHA + labels: + - apiserver_id_hash +- name: automatic_reloads_total + subsystem: encryption_config_controller + namespace: apiserver + help: Total number of reload successes and failures of encryption configuration + split by apiserver identity. + type: Counter + stabilityLevel: ALPHA + labels: + - apiserver_id_hash + - status - name: dek_cache_fill_percent subsystem: envelope_encryption namespace: apiserver @@ -4040,6 +4043,13 @@ - 2 - 4 - 10 +- name: init_events_total + namespace: apiserver + help: Counter of init events processed in watch cache broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource - name: rerouted_request_total subsystem: apiserver help: Total number of requests that were proxied to a peer kube apiserver because @@ -4075,12 +4085,58 @@ help: Total number of failed data encryption key(DEK) generation operations. type: Counter stabilityLevel: ALPHA +- name: storage_db_total_size_in_bytes + subsystem: apiserver + help: Total size of the storage database file physically allocated in bytes. + type: Gauge + deprecatedVersion: 1.28.0 + stabilityLevel: ALPHA + labels: + - endpoint +- name: storage_decode_errors_total + namespace: apiserver + help: Number of stored object decode errors split by object type + type: Counter + stabilityLevel: ALPHA + labels: + - resource - name: envelope_transformation_cache_misses_total subsystem: storage namespace: apiserver help: Total number of cache misses while accessing key decryption key(KEK). type: Counter stabilityLevel: ALPHA +- name: storage_events_received_total + subsystem: apiserver + help: Number of etcd events received split by kind. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_evaluated_objects_total + help: Number of objects tested in the course of serving a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_fetched_objects_total + help: Number of objects read from storage in the course of serving a LIST request + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_returned_objects_total + help: Number of objects returned for a LIST request from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: apiserver_storage_list_total + help: Number of LIST requests served from storage + type: Counter + stabilityLevel: ALPHA + labels: + - resource - name: transformation_duration_seconds subsystem: storage namespace: apiserver @@ -4129,22 +4185,151 @@ - status - transformation_type - transformer_prefix -- name: x509_insecure_sha1_total - subsystem: webhooks - namespace: apiserver - help: Counts the number of requests to servers with insecure SHA1 signatures in - their serving certificate OR the number of connection failures due to the insecure - SHA1 signatures (either/or, based on the runtime environment) +- name: stream_translator_requests_total + subsystem: apiserver + help: Total number of requests that were handled by the StreamTranslatorProxy, which + processes streaming RemoteCommand/V5 type: Counter stabilityLevel: ALPHA -- name: x509_missing_san_total - subsystem: webhooks + labels: + - code +- name: terminated_watchers_total namespace: apiserver - help: Counts the number of requests to servers missing SAN extension in their serving - certificate OR the number of connection failures due to the lack of x509 certificate - SAN extension missing (either/or, based on the runtime environment) + help: Counter of watchers closed due to unresponsiveness broken by resource type. type: Counter stabilityLevel: ALPHA + labels: + - resource +- name: events_dispatched_total + subsystem: watch_cache + namespace: apiserver + help: Counter of events dispatched in watch cache broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: events_received_total + subsystem: watch_cache + namespace: apiserver + help: Counter of events received in watch cache broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: initializations_total + subsystem: watch_cache + namespace: apiserver + help: Counter of watch cache initializations broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: read_wait_seconds + subsystem: watch_cache + namespace: apiserver + help: Histogram of time spent waiting for a watch cache to become fresh. + type: Histogram + stabilityLevel: ALPHA + labels: + - resource + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 +- name: etcd_bookmark_counts + help: Number of etcd bookmarks (progress notify events) split by kind. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: etcd_lease_object_counts + help: Number of objects attached to a single etcd lease. + type: Histogram + stabilityLevel: ALPHA + buckets: + - 10 + - 50 + - 100 + - 500 + - 1000 + - 2500 + - 5000 +- name: etcd_request_duration_seconds + help: Etcd request latency in seconds for each operation and object type. + type: Histogram + stabilityLevel: ALPHA + labels: + - operation + - type + buckets: + - 0.005 + - 0.025 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.6 + - 0.8 + - 1 + - 1.25 + - 1.5 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 10 + - 15 + - 20 + - 30 + - 45 + - 60 +- name: etcd_request_errors_total + help: Etcd failed request counts for each operation and object type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation + - type +- name: etcd_requests_total + help: Etcd request counts for each operation and object type. + type: Counter + stabilityLevel: ALPHA + labels: + - operation + - type +- name: capacity + subsystem: watch_cache + help: Total capacity of watch cache broken by resource type. + type: Gauge + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_decrease_total + subsystem: watch_cache + help: Total number of watch cache capacity decrease events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource +- name: capacity_increase_total + subsystem: watch_cache + help: Total number of watch cache capacity increase events broken by resource type. + type: Counter + stabilityLevel: ALPHA + labels: + - resource - name: current_executing_requests subsystem: flowcontrol namespace: apiserver @@ -4227,6 +4412,98 @@ - 10 - 15 - 30 +- name: apiserver_storage_objects + help: Number of stored objects at the time of last check split by kind. In case + of a fetching error, the value will be -1. + type: Gauge + stabilityLevel: STABLE + labels: + - resource +- name: apiserver_storage_size_bytes + help: Size of the storage database file physically allocated in bytes. + type: Custom + stabilityLevel: STABLE + labels: + - storage_cluster_id +- name: jwt_authenticator_latency_seconds + subsystem: authentication + namespace: apiserver + help: Latency of jwt authentication operations in seconds. This is the time spent + authenticating a token for cache miss only (i.e. when the token is not found in + the cache). + type: Histogram + stabilityLevel: ALPHA + labels: + - jwt_issuer_hash + - result + buckets: + - 0.001 + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: webhook_duration_seconds + subsystem: authorization + namespace: apiserver + help: Request latency in seconds. + type: Histogram + stabilityLevel: ALPHA + labels: + - name + - result + buckets: + - 0.005 + - 0.01 + - 0.025 + - 0.05 + - 0.1 + - 0.25 + - 0.5 + - 1 + - 2.5 + - 5 + - 10 +- name: webhook_evaluations_fail_open_total + subsystem: authorization + namespace: apiserver + help: NoOpinion results due to webhook timeout or error. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - result +- name: webhook_evaluations_total + subsystem: authorization + namespace: apiserver + help: Round-trips to authorization webhooks. + type: Counter + stabilityLevel: ALPHA + labels: + - name + - result +- name: x509_insecure_sha1_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers with insecure SHA1 signatures in + their serving certificate OR the number of connection failures due to the insecure + SHA1 signatures (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA +- name: x509_missing_san_total + subsystem: webhooks + namespace: apiserver + help: Counts the number of requests to servers missing SAN extension in their serving + certificate OR the number of connection failures due to the lack of x509 certificate + SAN extension missing (either/or, based on the runtime environment) + type: Counter + stabilityLevel: ALPHA - name: request_duration_seconds subsystem: cloud_provider_webhook help: Request latency in seconds. Broken down by status code. @@ -4333,84 +4610,6 @@ - 4096 - 8192 - 16384 -- name: changes - subsystem: endpoint_slice_controller - help: Number of EndpointSlice changes - type: Counter - stabilityLevel: ALPHA - labels: - - operation -- name: desired_endpoint_slices - subsystem: endpoint_slice_controller - help: Number of EndpointSlices that would exist with perfect endpoint allocation - type: Gauge - stabilityLevel: ALPHA -- name: endpoints_added_per_sync - subsystem: endpoint_slice_controller - help: Number of endpoints added on each Service sync - type: Histogram - stabilityLevel: ALPHA - buckets: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 1024 - - 2048 - - 4096 - - 8192 - - 16384 - - 32768 -- name: endpoints_desired - subsystem: endpoint_slice_controller - help: Number of endpoints desired - type: Gauge - stabilityLevel: ALPHA -- name: endpoints_removed_per_sync - subsystem: endpoint_slice_controller - help: Number of endpoints removed on each Service sync - type: Histogram - stabilityLevel: ALPHA - buckets: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 1024 - - 2048 - - 4096 - - 8192 - - 16384 - - 32768 -- name: endpointslices_changed_per_sync - subsystem: endpoint_slice_controller - help: Number of EndpointSlices changed on each Service sync - type: Histogram - stabilityLevel: ALPHA - labels: - - topology -- name: num_endpoint_slices - subsystem: endpoint_slice_controller - help: Number of EndpointSlices - type: Gauge - stabilityLevel: ALPHA -- name: syncs - subsystem: endpoint_slice_controller - help: Number of EndpointSlice syncs - type: Counter - stabilityLevel: ALPHA - labels: - - result - name: kubernetes_build_info help: A metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes @@ -4767,95 +4966,92 @@ SAN extension missing (either/or, based on the runtime environment) type: Counter stabilityLevel: ALPHA -- name: api_request_duration_seconds - namespace: cloudprovider_azure - help: Latency of an Azure API call +- name: changes + subsystem: endpoint_slice_controller + help: Number of EndpointSlice changes + type: Counter + stabilityLevel: ALPHA + labels: + - operation +- name: desired_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints added on each Service sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: endpoint_slice_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: ALPHA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints removed on each Service sync + type: Histogram + stabilityLevel: ALPHA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpointslices_changed_per_sync + subsystem: endpoint_slice_controller + help: Number of EndpointSlices changed on each Service sync type: Histogram stabilityLevel: ALPHA labels: - - request - - resource_group - - source - - subscription_id - buckets: - - 0.1 - - 0.25 - - 0.5 - - 1 - - 2.5 - - 5 - - 10 - - 15 - - 25 - - 50 - - 120 - - 300 - - 600 - - 1200 -- name: api_request_errors - namespace: cloudprovider_azure - help: Number of errors for an Azure API call + - topology + - traffic_distribution +- name: num_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: ALPHA +- name: services_count_by_traffic_distribution + subsystem: endpoint_slice_controller + help: Number of Services using some specific trafficDistribution + type: Gauge + stabilityLevel: ALPHA + labels: + - traffic_distribution +- name: syncs + subsystem: endpoint_slice_controller + help: Number of EndpointSlice syncs type: Counter stabilityLevel: ALPHA labels: - - request - - resource_group - - source - - subscription_id -- name: api_request_ratelimited_count - namespace: cloudprovider_azure - help: Number of rate limited Azure API calls - type: Counter - stabilityLevel: ALPHA - labels: - - request - - resource_group - - source - - subscription_id -- name: api_request_throttled_count - namespace: cloudprovider_azure - help: Number of throttled Azure API calls - type: Counter - stabilityLevel: ALPHA - labels: - - request - - resource_group - - source - - subscription_id -- name: op_duration_seconds - namespace: cloudprovider_azure - help: Latency of an Azure service operation - type: Histogram - stabilityLevel: ALPHA - labels: - - request - - resource_group - - source - - subscription_id - buckets: - - 0.1 - - 0.2 - - 0.5 - - 1 - - 10 - - 20 - - 30 - - 40 - - 50 - - 60 - - 100 - - 200 - - 300 -- name: op_failure_count - namespace: cloudprovider_azure - help: Number of failed Azure service operations - type: Counter - stabilityLevel: ALPHA - labels: - - request - - resource_group - - source - - subscription_id + - result - name: cloudprovider_gce_api_request_duration_seconds help: Latency of a GCE API call type: Histogram @@ -4874,38 +5070,6 @@ - request - version - zone -- name: cloudprovider_vsphere_api_request_duration_seconds - help: Latency of vsphere api call - type: Histogram - stabilityLevel: ALPHA - labels: - - request -- name: cloudprovider_vsphere_api_request_errors - help: vsphere Api errors - type: Counter - stabilityLevel: ALPHA - labels: - - request -- name: cloudprovider_vsphere_operation_duration_seconds - help: Latency of vsphere operation call - type: Histogram - stabilityLevel: ALPHA - labels: - - operation -- name: cloudprovider_vsphere_operation_errors - help: vsphere operation errors - type: Counter - stabilityLevel: ALPHA - labels: - - operation -- name: cloudprovider_vsphere_vcenter_versions - help: Versions for connected vSphere vCenters - type: Custom - stabilityLevel: ALPHA - labels: - - hostname - - version - - build - name: get_token_count help: Counter of total Token() requests to the alternate token source type: Counter diff --git a/test/instrumentation/documentation/documentation.md b/test/instrumentation/documentation/documentation.md index b59776a4099..a0269c06c79 100644 --- a/test/instrumentation/documentation/documentation.md +++ b/test/instrumentation/documentation/documentation.md @@ -8,7 +8,7 @@ description: >- ## Metrics (v1.30) - + This page details the metrics that different Kubernetes components export. You can query the metrics endpoint for these components using an HTTP scrape, and fetch the current metrics data in Prometheus format. @@ -88,6 +88,13 @@ Stable metrics observe strict API contracts and no labels can be added or remove
  • Gauge
  • resource
  • +
    apiserver_storage_size_bytes
    +
    Size of the storage database file physically allocated in bytes.
    + +
    container_cpu_usage_seconds_total
    Cumulative cpu time consumed by the container in core-seconds
    scheduler_preemption_attempts_total
    Total preemption attempts in the cluster till now
    @@ -508,6 +515,90 @@ Alpha metrics do not have any API guarantees. These metrics must be used at your
  • Counter
  • +
    apiserver_authentication_config_controller_automatic_reload_last_timestamp_seconds
    +
    Timestamp of the last automatic reload of authentication configuration split by status and apiserver identity.
    + +
    +
    apiserver_authentication_config_controller_automatic_reloads_total
    +
    Total number of automatic reloads of authentication configuration split by status and apiserver identity.
    + +
    +
    apiserver_authentication_jwt_authenticator_latency_seconds
    +
    Latency of jwt authentication operations in seconds. This is the time spent authenticating a token for cache miss only (i.e. when the token is not found in the cache).
    + +
    +
    apiserver_authorization_config_controller_automatic_reload_last_timestamp_seconds
    +
    Timestamp of the last automatic reload of authorization configuration split by status and apiserver identity.
    + +
    +
    apiserver_authorization_config_controller_automatic_reloads_total
    +
    Total number of automatic reloads of authorization configuration split by status and apiserver identity.
    + +
    +
    apiserver_authorization_decisions_total
    +
    Total number of terminal decisions made by an authorizer split by authorizer type, name, and decision.
    + +
    +
    apiserver_authorization_match_condition_evaluation_errors_total
    +
    Total number of errors when an authorization webhook encounters a match condition error split by authorizer type and name.
    + +
    +
    apiserver_authorization_match_condition_evaluation_seconds
    +
    Authorization match condition evaluation time in seconds, split by authorizer type and name.
    + +
    +
    apiserver_authorization_match_condition_exclusions_total
    +
    Total number of exclusions when an authorization webhook is skipped because match conditions exclude it.
    + +
    +
    apiserver_authorization_webhook_duration_seconds
    +
    Request latency in seconds.
    + +
    +
    apiserver_authorization_webhook_evaluations_fail_open_total
    +
    NoOpinion results due to webhook timeout or error.
    + +
    +
    apiserver_authorization_webhook_evaluations_total
    +
    Round-trips to authorization webhooks.
    + +
    apiserver_cache_list_fetched_objects_total
    Number of objects read from watch cache in the course of serving a LIST request
    apiserver_encryption_config_controller_automatic_reload_last_timestamp_seconds
    Timestamp of the last successful or failed automatic reload of encryption configuration split by apiserver identity.
    @@ -674,7 +765,14 @@ Alpha metrics do not have any API guarantees. These metrics must be used at your +
  • apiserver_id_hash
  • 1.30.0
  • +
    +
    apiserver_encryption_config_controller_automatic_reloads_total
    +
    Total number of reload successes and failures of encryption configuration split by apiserver identity.
    +
    apiserver_envelope_encryption_dek_cache_fill_percent
    Percent of the cache slots currently occupied by cached DEKs.
    @@ -949,6 +1047,13 @@ Alpha metrics do not have any API guarantees. These metrics must be used at your
  • Counter
  • type
  • +
    apiserver_nodeport_repair_reconcile_errors_total
    +
    Number of reconciliation failures on the nodeport repair reconcile loop
    + +
    apiserver_request_aborts_total
    Number of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
    -
    apiserver_storage_size_bytes
    -
    Size of the storage database file physically allocated in bytes.
    - -
    apiserver_storage_transformation_duration_seconds
    Latencies in seconds of value transformation operations.
    +
    apiserver_stream_translator_requests_total
    +
    Total number of requests that were handled by the StreamTranslatorProxy, which processes streaming RemoteCommand/V5
    + +
    apiserver_terminated_watchers_total
    Counter of watchers closed due to unresponsiveness broken by resource type.
    +
    apiserver_watch_cache_read_wait_seconds
    +
    Histogram of time spent waiting for a watch cache to become fresh.
    + +
    apiserver_watch_events_sizes
    Watch event size distribution in bytes
    -
    cloudprovider_azure_api_request_duration_seconds
    -
    Latency of an Azure API call
    - -
    -
    cloudprovider_azure_api_request_errors
    -
    Number of errors for an Azure API call
    - -
    -
    cloudprovider_azure_api_request_ratelimited_count
    -
    Number of rate limited Azure API calls
    - -
    -
    cloudprovider_azure_api_request_throttled_count
    -
    Number of throttled Azure API calls
    - -
    -
    cloudprovider_azure_op_duration_seconds
    -
    Latency of an Azure service operation
    - -
    -
    cloudprovider_azure_op_failure_count
    -
    Number of failed Azure service operations
    - -
    cloudprovider_gce_api_request_duration_seconds
    Latency of a GCE API call
    -
    cloudprovider_vsphere_api_request_duration_seconds
    -
    Latency of vsphere api call
    - -
    -
    cloudprovider_vsphere_api_request_errors
    -
    vsphere Api errors
    - -
    -
    cloudprovider_vsphere_operation_duration_seconds
    -
    Latency of vsphere operation call
    - -
    -
    cloudprovider_vsphere_operation_errors
    -
    vsphere operation errors
    - -
    -
    cloudprovider_vsphere_vcenter_versions
    -
    Versions for connected vSphere vCenters
    - -
    container_swap_usage_bytes
    Current amount of the container swap usage in bytes. Reported only on non-windows systems
    endpoint_slice_controller_num_endpoint_slices
    Number of EndpointSlices
    @@ -1446,6 +1481,13 @@ Alpha metrics do not have any API guarantees. These metrics must be used at your
  • Gauge
  • +
    endpoint_slice_controller_services_count_by_traffic_distribution
    +
    Number of Services using some specific trafficDistribution
    + +
    endpoint_slice_controller_syncs
    Number of EndpointSlice syncs
    +
    job_controller_jobs_by_external_controller_total
    +
    The number of Jobs managed by an external controller
    + +
    job_controller_pod_failures_handled_by_failure_policy_total
    `The number of failed Pods handled by failure policy with, respect to the failure policy action applied based on the matched, rule. Possible values of the action label correspond to the, possible values for the failure policy rule action, which are:, "FailJob", "Ignore" and "Count".`
    +
    kube_apiserver_nodeport_allocator_allocation_errors_total
    +
    Number of errors trying to allocate NodePort
    + +
    +
    kube_apiserver_nodeport_allocator_allocation_total
    +
    Number of NodePort allocations
    + +
    kube_apiserver_nodeport_allocator_available_ports
    Gauge measuring the number of available NodePorts for Services
    +
    +
    kubelet_image_pull_duration_seconds
    +
    Duration in seconds to pull an image.
    +
    kubelet_lifecycle_handler_http_fallbacks_total
    The number of times lifecycle handlers successfully fell back to http from https.
    @@ -1936,6 +2006,20 @@ Alpha metrics do not have any API guarantees. These metrics must be used at your
  • Gauge
  • +
    kubelet_memory_manager_pinning_errors_total
    +
    The number of memory pages allocations which required pinning that failed.
    + +
    +
    kubelet_memory_manager_pinning_requests_total
    +
    The number of memory pages allocations which required pinning.
    + +
    kubelet_mirror_pods
    The number of mirror pods the kubelet will try to create (one per admitted static pod)
    +
    kubelet_sleep_action_terminated_early_total
    +
    The number of times lifecycle sleep handler got terminated before it finishes
    + +
    kubelet_started_containers_errors_total
    Cumulative number of errors when starting containers
    scheduler_plugin_evaluation_total
    -
    Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter and Filter.).
    +
    Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).
    volume_manager_selinux_container_warnings_total
    Number of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.
    +
  • access_mode
  • volume_manager_selinux_pod_context_mismatch_errors_total
    Number of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.
    +
  • access_mode
  • volume_manager_selinux_pod_context_mismatch_warnings_total
    Number of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.
    +
  • access_mode
  • volume_manager_selinux_volume_context_mismatch_errors_total
    Number of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.
    +
  • access_modevolume_plugin
  • volume_manager_selinux_volume_context_mismatch_warnings_total
    Number of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.
    +
  • access_modevolume_plugin
  • volume_manager_selinux_volumes_admitted_total
    Number of volumes whose SELinux context was fine and will be mounted with mount -o context option.
    +
  • access_modevolume_plugin
  • volume_manager_total_volumes
    Number of volumes in Volume Manager
    diff --git a/test/instrumentation/main.go b/test/instrumentation/main.go index 1df7b3a7632..d6a63fbf458 100644 --- a/test/instrumentation/main.go +++ b/test/instrumentation/main.go @@ -78,10 +78,11 @@ func main() { } ms, es := searchPathForStableMetrics(arg) for _, m := range ms { - if _, ok := stableMetricNames[m.Name]; !ok { + fqName := m.buildFQName() + if _, ok := stableMetricNames[fqName]; !ok { stableMetrics = append(stableMetrics, m) } - stableMetricNames[m.Name] = struct{}{} + stableMetricNames[fqName] = struct{}{} } errors = append(errors, es...) }